Diffstat (limited to 'gallery_dl')
27 files changed, 833 insertions, 222 deletions
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 7504fa4..b64fa2f 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -7,7 +7,6 @@ # published by the Free Software Foundation. import sys -import json import logging from . import version, config, option, output, extractor, job, util, exception @@ -32,81 +31,6 @@ def progress(urls, pformat): yield pinfo["url"] -def parse_inputfile(file, log): - """Filter and process strings from an input file. - - Lines starting with '#' and empty lines will be ignored. - Lines starting with '-' will be interpreted as a key-value pair separated - by an '='. where 'key' is a dot-separated option name and 'value' is a - JSON-parsable value. These configuration options will be applied while - processing the next URL. - Lines starting with '-G' are the same as above, except these options will - be applied for *all* following URLs, i.e. they are Global. - Everything else will be used as a potential URL. - - Example input file: - - # settings global options - -G base-directory = "/tmp/" - -G skip = false - - # setting local options for the next URL - -filename="spaces_are_optional.jpg" - -skip = true - - https://example.org/ - - # next URL uses default filename and 'skip' is false. - https://example.com/index.htm # comment1 - https://example.com/404.htm # comment2 - """ - gconf = [] - lconf = [] - - for line in file: - line = line.strip() - - if not line or line[0] == "#": - # empty line or comment - continue - - elif line[0] == "-": - # config spec - if len(line) >= 2 and line[1] == "G": - conf = gconf - line = line[2:] - else: - conf = lconf - line = line[1:] - - key, sep, value = line.partition("=") - if not sep: - log.warning("input file: invalid <key>=<value> pair: %s", line) - continue - - try: - value = json.loads(value.strip()) - except ValueError as exc: - log.warning("input file: unable to parse '%s': %s", value, exc) - continue - - key = key.strip().split(".") - conf.append((key[:-1], key[-1], value)) - - else: - # url - if " #" in line: - line = line.partition(" #")[0].rstrip() - elif "\t#" in line: - line = line.partition("\t#")[0].rstrip() - if gconf or lconf: - yield util.ExtendedUrl(line, gconf, lconf) - gconf = [] - lconf = [] - else: - yield line - - def main(): try: if sys.stdout and sys.stdout.encoding.lower() != "utf-8": @@ -275,12 +199,12 @@ def main(): try: if inputfile == "-": if sys.stdin: - urls += parse_inputfile(sys.stdin, log) + urls += util.parse_inputfile(sys.stdin, log) else: log.warning("input file: stdin is not readable") else: with open(inputfile, encoding="utf-8") as file: - urls += parse_inputfile(file, log) + urls += util.parse_inputfile(file, log) except OSError as exc: log.warning("input file: %s", exc) diff --git a/gallery_dl/extractor/2chen.py b/gallery_dl/extractor/2chen.py new file mode 100644 index 0000000..8fffeb0 --- /dev/null +++ b/gallery_dl/extractor/2chen.py @@ -0,0 +1,99 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://2chen.moe/""" + +from .common import Extractor, Message +from .. 
import text + + +class _2chenThreadExtractor(Extractor): + """Extractor for 2chen threads""" + category = "2chen" + subcategory = "thread" + directory_fmt = ("{category}", "{board}", "{thread} {title}") + filename_fmt = "{time} {filename}.{extension}" + archive_fmt = "{board}_{thread}_{hash}" + root = "https://2chen.moe" + pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)/(\d+)" + test = ( + ("https://2chen.moe/jp/303786", { + "count": ">= 10", + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.thread = match.groups() + + def items(self): + url = "{}/{}/{}".format(self.root, self.board, self.thread) + page = self.request(url, encoding="utf-8").text + data = self.metadata(page) + yield Message.Directory, data + for post in self.posts(page): + if not post["url"]: + continue + post.update(data) + post["url"] = self.root + post["url"] + post["time"] = text.parse_int(post["date"].timestamp()) + yield Message.Url, post["url"], text.nameext_from_url( + post["filename"], post) + + def metadata(self, page): + board, pos = text.extract(page, 'class="board">/', '/<') + title = text.extract(page, "<h3>", "</h3>", pos)[0] + return { + "board" : board, + "thread": self.thread, + "title" : text.unescape(title), + } + + def posts(self, page): + """Return iterable with relevant posts""" + return map(self.parse, text.extract_iter( + page, 'class="glass media', '</article>')) + + def parse(self, post): + extr = text.extract_from(post) + return { + "name" : text.unescape(extr("<span>", "</span>")), + "date" : text.parse_datetime( + extr("<time", "<").partition(">")[2], + "%d %b %Y (%a) %H:%M:%S" + ), + "no" : extr('href="#p', '"'), + "url" : extr('</span><a href="', '"'), + "filename": text.unescape(extr('download="', '"')), + "hash" : extr('data-hash="', '"'), + } + + +class _2chenBoardExtractor(Extractor): + """Extractor for 2chen boards""" + category = "2chen" + subcategory = "board" + root = "https://2chen.moe" + pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)(?:/catalog)?/?$" + test = ( + ("https://2chen.moe/co/", { + "pattern": _2chenThreadExtractor.pattern + }), + ("https://2chen.moe/co"), + ("https://2chen.moe/co/catalog") + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.board = match.group(1) + + def items(self): + url = "{}/{}/catalog".format(self.root, self.board) + page = self.request(url).text + data = {"_extractor": _2chenThreadExtractor} + for thread in text.extract_iter( + page, '<figure><a href="', '"'): + yield Message.Queue, self.root + thread, data diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py new file mode 100644 index 0000000..1e020c2 --- /dev/null +++ b/gallery_dl/extractor/8chan.py @@ -0,0 +1,172 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://8chan.moe/""" + +from .common import Extractor, Message +from .. import text +from ..cache import memcache +from datetime import datetime, timedelta +import itertools + +BASE_PATTERN = r"(?:https?://)?8chan\.(moe|se|cc)" + + +class _8chanExtractor(Extractor): + """Base class for 8chan extractors""" + category = "8chan" + root = "https://8chan.moe" + + def __init__(self, match): + self.root = "https://8chan." 
+ match.group(1) + Extractor.__init__(self, match) + + @memcache() + def _prepare_cookies(self): + # fetch captcha cookies + # (necessary to download without getting interrupted) + now = datetime.utcnow() + url = self.root + "/captcha.js" + params = {"d": now.strftime("%a %b %d %Y %H:%M:%S GMT+0000 (UTC)")} + self.request(url, params=params).content + + # adjust cookies + # - remove 'expires' timestamp + # - move 'captchaexpiration' value forward by 1 month) + domain = self.root.rpartition("/")[2] + for cookie in self.session.cookies: + if cookie.domain.endswith(domain): + cookie.expires = None + if cookie.name == "captchaexpiration": + cookie.value = (now + timedelta(30, 300)).strftime( + "%a, %d %b %Y %H:%M:%S GMT") + + return self.session.cookies + + +class _8chanThreadExtractor(_8chanExtractor): + """Extractor for 8chan threads""" + subcategory = "thread" + directory_fmt = ("{category}", "{boardUri}", + "{threadId} {subject[:50]}") + filename_fmt = "{postId}{num:?-//} {filename[:200]}.{extension}" + archive_fmt = "{boardUri}_{postId}_{num}" + pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)" + test = ( + ("https://8chan.moe/vhs/res/4.html", { + "pattern": r"https://8chan\.moe/\.media/[0-9a-f]{64}\.\w+$", + "count": 14, + "keyword": { + "archived": False, + "autoSage": False, + "boardDescription": "Film and Cinema", + "boardMarkdown": None, + "boardName": "Movies", + "boardUri": "vhs", + "creation": r"re:\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}.\d{3}Z", + "cyclic": False, + "email": None, + "id": "re:^[0-9a-f]{6}$", + "locked": False, + "markdown": str, + "maxFileCount": 5, + "maxFileSize": "32.00 MB", + "maxMessageLength": 8001, + "message": str, + "mime": str, + "name": "Anonymous", + "num": int, + "originalName": str, + "path": r"re:/.media/[0-9a-f]{64}\.\w+$", + "pinned": False, + "postId": int, + "signedRole": None, + "size": int, + "threadId": 4, + "thumb": r"re:/.media/t_[0-9a-f]{64}$", + "uniquePosters": 9, + "usesCustomCss": True, + "usesCustomJs": False, + "wsPort": 8880, + "wssPort": 2087, + }, + }), + ("https://8chan.se/vhs/res/4.html"), + ("https://8chan.cc/vhs/res/4.html"), + ) + + def __init__(self, match): + _8chanExtractor.__init__(self, match) + _, self.board, self.thread = match.groups() + + def items(self): + # fetch thread data + url = "{}/{}/res/{}.".format(self.root, self.board, self.thread) + self.session.headers["Referer"] = url + "html" + thread = self.request(url + "json").json() + thread["postId"] = thread["threadId"] + thread["_http_headers"] = {"Referer": url + "html"} + + try: + self.session.cookies = self._prepare_cookies() + except Exception as exc: + self.log.debug("Failed to fetch captcha cookies: %s: %s", + exc.__class__.__name__, exc, exc_info=True) + + # download files + posts = thread.pop("posts", ()) + yield Message.Directory, thread + for post in itertools.chain((thread,), posts): + files = post.pop("files", ()) + if not files: + continue + thread.update(post) + for num, file in enumerate(files): + file.update(thread) + file["num"] = num + text.nameext_from_url(file["originalName"], file) + yield Message.Url, self.root + file["path"], file + + +class _8chanBoardExtractor(_8chanExtractor): + """Extractor for 8chan boards""" + subcategory = "board" + pattern = BASE_PATTERN + r"/([^/?#]+)/(?:(\d+)\.html)?$" + test = ( + ("https://8chan.moe/vhs/"), + ("https://8chan.moe/vhs/2.html", { + "pattern": _8chanThreadExtractor.pattern, + "count": 23, + }), + ("https://8chan.se/vhs/"), + ("https://8chan.cc/vhs/"), + ) + + def __init__(self, match): + 
_8chanExtractor.__init__(self, match) + _, self.board, self.page = match.groups() + self.session.headers["Referer"] = self.root + "/" + + def items(self): + page = text.parse_int(self.page, 1) + url = "{}/{}/{}.json".format(self.root, self.board, page) + board = self.request(url).json() + threads = board["threads"] + + while True: + for thread in threads: + thread["_extractor"] = _8chanThreadExtractor + url = "{}/{}/res/{}.html".format( + self.root, self.board, thread["threadId"]) + yield Message.Queue, url, thread + + page += 1 + if page > board["pageCount"]: + return + url = "{}/{}/{}.json".format(self.root, self.board, page) + threads = self.request(url).json()["threads"] diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index fed6998..851f660 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -10,11 +10,13 @@ import re modules = [ "2chan", + "2chen", "35photo", "3dbooru", "420chan", "4chan", "500px", + "8chan", "8kun", "8muses", "adultempire", @@ -90,6 +92,7 @@ modules = [ "mememuseum", "myhentaigallery", "myportfolio", + "nana", "naver", "naverwebtoon", "newgrounds", diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py index 62626a1..14d1e6b 100644 --- a/gallery_dl/extractor/artstation.py +++ b/gallery_dl/extractor/artstation.py @@ -76,7 +76,12 @@ class ArtstationExtractor(Extractor): def get_project_assets(self, project_id): """Return all assets associated with 'project_id'""" url = "{}/projects/{}.json".format(self.root, project_id) - data = self.request(url).json() + + try: + data = self.request(url).json() + except exception.HttpError as exc: + self.log.warning(exc) + return data["title"] = text.unescape(data["title"]) data["description"] = text.unescape(text.remove_html( @@ -406,6 +411,10 @@ class ArtstationImageExtractor(ArtstationExtractor): "options": (("external", True),), "pattern": "ytdl:https://www.youtube.com/embed/JNFfJtwwrU0", }), + # 404 (#3016) + ("https://www.artstation.com/artwork/3q3mXB", { + "count": 0, + }), # alternate URL patterns ("https://sungchoi.artstation.com/projects/LQVJr"), ("https://artstn.co/p/LQVJr"), @@ -419,7 +428,10 @@ class ArtstationImageExtractor(ArtstationExtractor): def metadata(self): self.assets = list(ArtstationExtractor.get_project_assets( self, self.project_id)) - self.user = self.assets[0]["user"]["username"] + try: + self.user = self.assets[0]["user"]["username"] + except IndexError: + self.user = "" return ArtstationExtractor.metadata(self) def projects(self): diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index f7ee51f..e304717 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -122,8 +122,7 @@ class Extractor(): seconds = (self._interval() - (time.time() - Extractor.request_timestamp)) if seconds > 0.0: - self.log.debug("Sleeping for %.5s seconds", seconds) - time.sleep(seconds) + self.sleep(seconds, "request") while True: try: @@ -169,8 +168,9 @@ class Extractor(): self.log.debug("%s (%s/%s)", msg, tries, retries+1) if tries > retries: break - time.sleep( - max(tries, self._interval()) if self._interval else tries) + self.sleep( + max(tries, self._interval()) if self._interval else tries, + "retry") tries += 1 raise exception.HttpError(msg, response) @@ -202,6 +202,11 @@ class Extractor(): self.log.info("Waiting until %s for %s.", isotime, reason) time.sleep(seconds) + def sleep(self, seconds, reason): + self.log.debug("Sleeping %.2f seconds (%s)", + seconds, reason) + 
time.sleep(seconds) + def _get_auth_info(self): """Return authentication information as (username, password) tuple""" username = self.config("username") diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index 8c2ed53..c455ce1 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -88,10 +88,7 @@ class DanbooruExtractor(BaseExtractor): if post["extension"] == "zip": if self.ugoira: - post["frames"] = self.request( - "{}/posts/{}.json?only=pixiv_ugoira_frame_data".format( - self.root, post["id"]) - ).json()["pixiv_ugoira_frame_data"]["data"] + post["frames"] = self._ugoira_frames(post) post["_http_adjust_extension"] = False else: url = post["large_file_url"] @@ -139,6 +136,18 @@ class DanbooruExtractor(BaseExtractor): else: return + def _ugoira_frames(self, post): + data = self.request("{}/posts/{}.json?only=media_metadata".format( + self.root, post["id"]) + ).json()["media_metadata"]["metadata"] + + ext = data["ZIP:ZipFileName"].rpartition(".")[2] + print(post["id"], ext) + fmt = ("{:>06}." + ext).format + delays = data["Ugoira:FrameDelays"] + return [{"file": fmt(index), "delay": delay} + for index, delay in enumerate(delays)] + INSTANCES = { "danbooru": { diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 6897476..cb2aa24 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -72,7 +72,7 @@ class DeviantartExtractor(Extractor): def items(self): self.api = DeviantartOAuthAPI(self) - if self.user: + if self.user and self.config("group", True): profile = self.api.user_profile(self.user) self.group = not profile if self.group: @@ -938,11 +938,11 @@ class DeviantartDeviationExtractor(DeviantartExtractor): def deviations(self): url = "{}/{}/{}/{}".format( self.root, self.user, self.type, self.deviation_id) - appurl = text.extract(self._limited_request(url).text, - 'property="da:appurl" content="', '"')[0] - if not appurl: + uuid = text.extract(self._limited_request(url).text, + '"deviationUuid\\":\\"', '\\')[0] + if not uuid: raise exception.NotFoundError("deviation") - return (self.api.deviation(appurl.rpartition("/")[2]),) + return (self.api.deviation(uuid),) class DeviantartScrapsExtractor(DeviantartExtractor): diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py index 6ddf2ec..8b90250 100644 --- a/gallery_dl/extractor/directlink.py +++ b/gallery_dl/extractor/directlink.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2021 Mike Fährmann +# Copyright 2017-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -19,7 +19,7 @@ class DirectlinkExtractor(Extractor): archive_fmt = filename_fmt pattern = (r"(?i)https?://(?P<domain>[^/?#]+)/(?P<path>[^?#]+\." 
r"(?:jpe?g|jpe|png|gif|web[mp]|mp4|mkv|og[gmv]|opus))" - r"(?:\?(?P<query>[^/?#]*))?(?:#(?P<fragment>.*))?$") + r"(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?$") test = ( (("https://en.wikipedia.org/static/images/project-logos/enwiki.png"), { "url": "18c5d00077332e98e53be9fed2ee4be66154b88d", @@ -31,9 +31,9 @@ class DirectlinkExtractor(Extractor): "keyword": "29dad729c40fb09349f83edafa498dba1297464a", }), # more complex example - ("https://example.org/path/to/file.webm?que=1&ry=2#fragment", { - "url": "114b8f1415cc224b0f26488ccd4c2e7ce9136622", - "keyword": "06014abd503e3b2b58aa286f9bdcefdd2ae336c0", + ("https://example.org/path/to/file.webm?que=1?&ry=2/#fragment", { + "url": "6fb1061390f8aada3db01cb24b51797c7ee42b31", + "keyword": "3d7abc31d45ba324e59bc599c3b4862452d5f29c", }), # percent-encoded characters ("https://example.org/%27%3C%23/%23%3E%27.jpg?key=%3C%26%3E", { diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py index 8481248..f692a90 100644 --- a/gallery_dl/extractor/fanbox.py +++ b/gallery_dl/extractor/fanbox.py @@ -68,6 +68,16 @@ class FanboxExtractor(Extractor): post["html"] = content_body["html"] if post["type"] == "article": post["articleBody"] = content_body.copy() + if "blocks" in content_body: + content = [] + append = content.append + for block in content_body["blocks"]: + if "text" in block: + append(block["text"]) + if "links" in block: + for link in block["links"]: + append(link["url"]) + post["content"] = "\n".join(content) post["date"] = text.parse_datetime(post["publishedDatetime"]) post["text"] = content_body.get("text") if content_body else None @@ -271,6 +281,19 @@ class FanboxPostExtractor(FanboxExtractor): "hasAdultContent": True }, }), + # 'content' metadata (#3020) + ("https://www.fanbox.cc/@official-en/posts/4326303", { + "keyword": { + "content": r"re:(?s)^Greetings from FANBOX.\n \nAs of Monday, " + r"September 5th, 2022, we are happy to announce " + r"the start of the FANBOX hashtag event " + r"#MySetupTour ! \nAbout the event\nTo join this " + r"event .+ \nPlease check this page for further " + r"details regarding the Privacy & Terms.\n" + r"https://fanbox.pixiv.help/.+/10184952456601\n\n\n" + r"Thank you for your continued support of FANBOX.$", + }, + }), ) def __init__(self, match): diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py index bece905..69c07d0 100644 --- a/gallery_dl/extractor/generic.py +++ b/gallery_dl/extractor/generic.py @@ -27,9 +27,9 @@ class GenericExtractor(Extractor): pattern += r""" (?P<scheme>https?://)? # optional http(s) scheme (?P<domain>[-\w\.]+) # required domain - (?P<path>/[^?&#]*)? # optional path - (?:\?(?P<query>[^/?#]*))? # optional query - (?:\#(?P<fragment>.*))?$ # optional fragment + (?P<path>/[^?#]*)? # optional path + (?:\?(?P<query>[^#]*))? # optional query + (?:\#(?P<fragment>.*))? 
# optional fragment """ def __init__(self, match): diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index f8b0c3b..cc110aa 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -115,12 +115,16 @@ class HitomiGalleryExtractor(GalleryExtractor): fmt = self.config("format") or "webp" if fmt == "original": - subdomain, fmt, ext = "b", "images", None + subdomain, fmt, ext, check = "b", "images", None, False else: - subdomain, ext = "a", fmt + subdomain, ext, check = "a", fmt, True result = [] for image in self.info["files"]: + if check: + if not image.get("has" + fmt): + fmt = ext = "webp" + check = False ihash = image["hash"] idata = text.nameext_from_url(image["name"]) if ext: diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index b1c0e9e..2c899eb 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -44,7 +44,9 @@ class ImagefapGalleryExtractor(ImagefapExtractor): ("https://www.imagefap.com/gallery/5486966", { "pattern": r"https://cdnh?\.imagefap\.com" r"/images/full/\d+/\d+/\d+\.jpg", - "keyword": "3e24eace5b09639b881ebd393165862feb46adde", + "keyword": "8d2e562df7a0bc9e8eecb9d1bb68d32b4086bf98", + "archive": False, + "count": 62, }), ("https://www.imagefap.com/gallery.php?gid=7102714"), ("https://beta.imagefap.com/gallery.php?gid=7102714"), @@ -73,32 +75,42 @@ class ImagefapGalleryExtractor(ImagefapExtractor): title, _, descr = descr.partition(" porn picture gallery by ") uploader, _, tags = descr.partition(" to see hottest ") + self._count = text.parse_int(count) return { "gallery_id": text.parse_int(self.gid), "title": text.unescape(title), "uploader": uploader, "tags": tags[:-11].split(", "), - "count": text.parse_int(count), + "count": self._count, } def get_images(self): """Collect image-urls and -metadata""" - num = 0 url = "{}/photo/{}/".format(self.root, self.image_id) params = {"gid": self.gid, "idx": 0, "partial": "true"} + headers = { + "Content-Type": "application/x-www-form-urlencoded", + "X-Requested-With": "XMLHttpRequest", + "Referer": "{}?pgid=&gid={}&page=0".format(url, self.image_id) + } + + num = 0 + total = self._count while True: - pos = 0 - page = self.request(url, params=params).text - for _ in range(24): - imgurl, pos = text.extract(page, '<a href="', '"', pos) - if not imgurl: - return + page = self.request(url, params=params, headers=headers).text + + cnt = 0 + for image_url in text.extract_iter(page, '<a href="', '"'): num += 1 - data = text.nameext_from_url(imgurl) + cnt += 1 + data = text.nameext_from_url(image_url) data["num"] = num data["image_id"] = text.parse_int(data["filename"]) - yield imgurl, data - params["idx"] += 24 + yield image_url, data + + if cnt < 24 and num >= total: + return + params["idx"] += cnt class ImagefapImageExtractor(ImagefapExtractor): @@ -170,40 +182,49 @@ class ImagefapUserExtractor(ImagefapExtractor): self.user, self.user_id = match.groups() def items(self): - for gid, name in self.get_gallery_data(): - url = "{}/gallery/{}".format(self.root, gid) - data = { - "gallery_id": text.parse_int(gid), - "title": text.unescape(name), - "_extractor": ImagefapGalleryExtractor, - } - yield Message.Queue, url, data - - def get_gallery_data(self): - """Yield all gallery_ids of a specific user""" - folders = self.get_gallery_folders() - url = "{}/ajax_usergallery_folder.php".format(self.root) - params = {"userid": self.user_id} - for folder_id in folders: - params["id"] = folder_id - page = self.request(url, 
params=params).text - - pos = 0 - while True: - gid, pos = text.extract(page, '<a href="/gallery/', '"', pos) - if not gid: - break - name, pos = text.extract(page, "<b>", "<", pos) - yield gid, name - - def get_gallery_folders(self): - """Create a list of all folder_ids of a specific user""" + for folder_id in self.folders(): + for gallery_id, name in self.galleries(folder_id): + url = "{}/gallery/{}".format(self.root, gallery_id) + data = { + "gallery_id": text.parse_int(gallery_id), + "title" : text.unescape(name), + "_extractor": ImagefapGalleryExtractor, + } + yield Message.Queue, url, data + + def folders(self): + """Return a list of folder_ids of a specific user""" if self.user: url = "{}/profile/{}/galleries".format(self.root, self.user) else: url = "{}/usergallery.php?userid={}".format( self.root, self.user_id) - page = self.request(url).text - self.user_id, pos = text.extract(page, '?userid=', '"') - folders, pos = text.extract(page, ' id="tgl_all" value="', '"', pos) - return folders.split("|")[:-1] + + response = self.request(url) + self.user = response.url.split("/")[-2] + folders = text.extract(response.text, ' id="tgl_all" value="', '"')[0] + return folders.rstrip("|").split("|") + + def galleries(self, folder_id): + """Yield gallery_ids of a folder""" + if folder_id == "-1": + url = "{}/profile/{}/galleries?folderid=-1".format( + self.root, self.user) + else: + url = "{}/organizer/{}/".format(self.root, folder_id) + params = {"page": 0} + + while True: + extr = text.extract_from(self.request(url, params=params).text) + cnt = 0 + + while True: + gid = extr('<a href="/gallery/', '"') + if not gid: + break + yield gid, extr("<b>", "<") + cnt += 1 + + if cnt < 25: + break + params["page"] += 1 diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 425d541..4775613 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -338,6 +338,14 @@ class InstagramExtractor(Extractor): "username" : user["username"], "full_name": user["full_name"]}) + def _init_cursor(self): + return self.config("cursor") or None + + def _update_cursor(self, cursor): + self.log.debug("Cursor: %s", cursor) + self._cursor = cursor + return cursor + class InstagramUserExtractor(InstagramExtractor): """Extractor for an Instagram user profile""" @@ -409,8 +417,8 @@ class InstagramTaggedExtractor(InstagramExtractor): self.user_id = self.item[3:] return {"tagged_owner_id": self.user_id} + self.user_id = self.api.user_id(self.item) user = self.api.user(self.item) - self.user_id = user["id"] return { "tagged_owner_id" : user["id"], @@ -693,7 +701,15 @@ class InstagramRestAPI(): def user_id(self, screen_name): if screen_name.startswith("id:"): return screen_name[3:] - return self.user(screen_name)["id"] + user = self.user(screen_name) + if user is None: + raise exception.AuthorizationError( + "Login required to access this profile") + if user["is_private"] and not user["followed_by_viewer"]: + name = user["username"] + s = "" if name.endswith("s") else "s" + raise exception.StopExtraction("%s'%s posts are private", name, s) + return user["id"] def user_clips(self, user_id): endpoint = "/v1/clips/user/" @@ -741,6 +757,9 @@ class InstagramRestAPI(): def _pagination(self, endpoint, params=None, media=False): if params is None: params = {} + extr = self.extractor + params["max_id"] = extr._init_cursor() + while True: data = self._call(endpoint, params=params) @@ -752,9 +771,12 @@ class InstagramRestAPI(): if not data.get("more_available"): return - 
params["max_id"] = data["next_max_id"] + params["max_id"] = extr._update_cursor(data["next_max_id"]) def _pagination_post(self, endpoint, params): + extr = self.extractor + params["max_id"] = extr._init_cursor() + while True: data = self._call(endpoint, method="POST", data=params) @@ -764,9 +786,12 @@ class InstagramRestAPI(): info = data["paging_info"] if not info.get("more_available"): return - params["max_id"] = info["max_id"] + params["max_id"] = extr._update_cursor(info["max_id"]) def _pagination_sections(self, endpoint, params): + extr = self.extractor + params["max_id"] = extr._init_cursor() + while True: info = self._call(endpoint, method="POST", data=params) @@ -774,19 +799,22 @@ class InstagramRestAPI(): if not info.get("more_available"): return - params["max_id"] = info["next_max_id"] params["page"] = info["next_page"] + params["max_id"] = extr._update_cursor(info["next_max_id"]) class InstagramGraphqlAPI(): def __init__(self, extractor): self.extractor = extractor - self.user = InstagramRestAPI(extractor).user self.user_collection = self.user_saved = self.reels_media = \ self.highlights_media = self._login_required self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode + api = InstagramRestAPI(extractor) + self.user = api.user + self.user_id = api.user_id + @staticmethod def _login_required(_=None): raise exception.AuthorizationError("Login required") @@ -824,11 +852,6 @@ class InstagramGraphqlAPI(): return self._pagination(query_hash, variables, "hashtag", "edge_hashtag_to_media") - def user_id(self, screen_name): - if screen_name.startswith("id:"): - return screen_name[3:] - return self.user(screen_name)["id"] - def user_clips(self, user_id): query_hash = "bc78b344a68ed16dd5d7f264681c4c76" variables = {"id": user_id, "first": 50} @@ -871,9 +894,8 @@ class InstagramGraphqlAPI(): def _pagination(self, query_hash, variables, key_data="user", key_edge=None): - cursor = self.extractor.config("cursor") - if cursor: - variables["after"] = cursor + extr = self.extractor + variables["after"] = extr._init_cursor() while True: data = self._call(query_hash, variables)[key_data] @@ -890,35 +912,55 @@ class InstagramGraphqlAPI(): raise exception.StopExtraction( "%s'%s posts are private", self.item, s) - variables["after"] = self._cursor = info["end_cursor"] - self.extractor.log.debug("Cursor: %s", self._cursor) + variables["after"] = extr._update_cursor(info["end_cursor"]) -@cache(maxage=360*24*3600, keyarg=1) +@cache(maxage=90*24*3600, keyarg=1) def _login_impl(extr, username, password): extr.log.info("Logging in as %s", username) + user_agent = ("Mozilla/5.0 (Linux; Android 13) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/106.0.5249.79 Mobile " + "Safari/537.36 Instagram 255.1.0.17.102") + + headers = { + "User-Agent" : user_agent, + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "none", + "Sec-Fetch-User": "?1", + } url = extr.root + "/accounts/login/" - page = extr.request(url).text + response = extr.request(url, headers=headers) + + extract = text.extract_from(response.text) + csrf_token = extract('"csrf_token":"', '"') + device_id = extract('"device_id":"', '"') + rollout_hash = extract('"rollout_hash":"', '"') + + cset = extr.session.cookies.set + cset("csrftoken", csrf_token, domain=extr.cookiedomain) + cset("ig_did", device_id, domain=extr.cookiedomain) headers = { - "X-Web-Device-Id" : text.extract(page, '"device_id":"', '"')[0], + "User-Agent" : user_agent, + "Accept" : "*/*", + "X-CSRFToken" : csrf_token, + "X-Instagram-AJAX": 
rollout_hash, "X-IG-App-ID" : "936619743392459", - "X-ASBD-ID" : "437806", + "X-ASBD-ID" : "198387", "X-IG-WWW-Claim" : "0", "X-Requested-With": "XMLHttpRequest", + "Origin" : extr.root, "Referer" : url, + "Sec-Fetch-Dest" : "empty", + "Sec-Fetch-Mode" : "cors", + "Sec-Fetch-Site" : "same-origin", } - url = extr.root + "/data/shared_data/" - data = extr.request(url, headers=headers).json() - - headers["X-CSRFToken"] = data["config"]["csrf_token"] - headers["X-Instagram-AJAX"] = data["rollout_hash"] - headers["Origin"] = extr.root data = { - "username" : username, - "enc_password" : "#PWD_INSTAGRAM_BROWSER:0:{}:{}".format( + "enc_password" : "#PWD_INSTAGRAM_BROWSER:0:{}:{}".format( int(time.time()), password), + "username" : username, "queryParams" : "{}", "optIntoOneTap" : "false", "stopDeletionNonce" : "", @@ -930,11 +972,8 @@ def _login_impl(extr, username, password): if not response.json().get("authenticated"): raise exception.AuthenticationError() - cget = extr.session.cookies.get - return { - name: cget(name) - for name in ("sessionid", "mid", "ig_did") - } + return {cookie.name: cookie.value + for cookie in extr.session.cookies} def id_from_shortcode(shortcode): diff --git a/gallery_dl/extractor/nana.py b/gallery_dl/extractor/nana.py new file mode 100644 index 0000000..6062418 --- /dev/null +++ b/gallery_dl/extractor/nana.py @@ -0,0 +1,115 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://nana.my.id/""" + +from .common import GalleryExtractor, Extractor, Message +from .. import text, exception +import json + + +class NanaGalleryExtractor(GalleryExtractor): + """Extractor for image galleries from nana.my.id""" + category = "nana" + directory_fmt = ("{category}", "{title}") + pattern = r"(?:https?://)?nana\.my\.id/reader/([^/?#]+)" + test = ( + (("https://nana.my.id/reader/" + "059f7de55a4297413bfbd432ce7d6e724dd42bae"), { + "pattern": r"https://nana\.my\.id/reader/" + r"\w+/image/page\?path=.*\.\w+", + "title" : "Everybody Loves Shion", + "artist" : "fuzui", + "tags" : list, + "count" : 29, + }), + (("https://nana.my.id/reader/" + "77c8712b67013e427923573379f5bafcc0c72e46"), { + "pattern": r"https://nana\.my\.id/reader/" + r"\w+/image/page\?path=.*\.\w+", + "title" : "Lovey-Dovey With an Otaku-Friendly Gyaru", + "artist" : "Sueyuu", + "tags" : ["Sueyuu"], + "count" : 58, + }), + ) + + def __init__(self, match): + self.gallery_id = match.group(1) + url = "https://nana.my.id/reader/" + self.gallery_id + GalleryExtractor.__init__(self, match, url) + + def metadata(self, page): + title = text.unescape( + text.extract(page, '</a> ', '</div>')[0]) + artist = text.unescape(text.extract( + page, '<title>', '</title>')[0])[len(title):-10] + tags = text.extract(page, 'Reader.tags = "', '"')[0] + + return { + "gallery_id": self.gallery_id, + "title" : title, + "artist" : artist[4:] if artist.startswith(" by ") else "", + "tags" : tags.split(", ") if tags else (), + "lang" : "en", + "language" : "English", + } + + def images(self, page): + data = json.loads(text.extract(page, "Reader.pages = ", ".pages")[0]) + return [ + ("https://nana.my.id" + image, None) + for image in data["pages"] + ] + + +class NanaSearchExtractor(Extractor): + """Extractor for nana search results""" + category = "nana" + subcategory = "search" + pattern = r"(?:https?://)?nana\.my\.id(?:/?\?([^#]+))" + test = ( + 
('https://nana.my.id/?q=+"elf"&sort=desc', { + "pattern": NanaGalleryExtractor.pattern, + "range": "1-100", + "count": 100, + }), + ("https://nana.my.id/?q=favorites%3A", { + "pattern": NanaGalleryExtractor.pattern, + "count": ">= 2", + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.params = text.parse_query(match.group(1)) + self.params["p"] = text.parse_int(self.params.get("p"), 1) + self.params["q"] = self.params.get("q") or "" + + def items(self): + if "favorites:" in self.params["q"]: + favkey = self.config("favkey") + if not favkey: + raise exception.AuthenticationError( + "'Favorite key' not provided. " + "Please see 'https://nana.my.id/tutorial'") + self.session.cookies.set("favkey", favkey, domain="nana.my.id") + + data = {"_extractor": NanaGalleryExtractor} + while True: + try: + page = self.request( + "https://nana.my.id", params=self.params).text + except exception.HttpError: + return + + for gallery in text.extract_iter( + page, '<div class="id3">', '</div>'): + url = "https://nana.my.id" + text.extract( + gallery, '<a href="', '"')[0] + yield Message.Queue, url, data + + self.params["p"] += 1 diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 2c8e72c..73911b2 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -326,6 +326,55 @@ class NijieNuitaExtractor(NijieExtractor): page, "<title>", "さんの抜いた")[0] or "") +class NijieFeedExtractor(NijieExtractor): + """Extractor for nijie liked user feed""" + subcategory = "feed" + pattern = BASE_PATTERN + r"/like_user_view\.php" + test = ( + ("https://nijie.info/like_user_view.php", { + "range": "1-10", + "count": 10, + }), + ("https://horne.red/like_user_view.php"), + ) + + def image_ids(self): + return self._pagination("like_user_view") + + @staticmethod + def _extract_user_name(page): + return "" + + +class NijiefollowedExtractor(NijieExtractor): + """Extractor for followed nijie users""" + subcategory = "followed" + pattern = BASE_PATTERN + r"/like_my\.php" + test = ( + ("https://nijie.info/like_my.php"), + ("https://horne.red/like_my.php"), + ) + + def items(self): + self.login() + + url = self.root + "/like_my.php" + params = {"p": 1} + data = {"_extractor": NijieUserExtractor} + + while True: + page = self.request(url, params=params).text + + for user_id in text.extract_iter( + page, '"><a href="/members.php?id=', '"'): + user_url = "{}/members.php?id={}".format(self.root, user_id) + yield Message.Queue, user_url, data + + if '<a rel="next"' not in page: + return + params["p"] += 1 + + class NijieImageExtractor(NijieExtractor): """Extractor for a nijie work/image""" subcategory = "image" diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py index 713330d..f381f12 100644 --- a/gallery_dl/extractor/nozomi.py +++ b/gallery_dl/extractor/nozomi.py @@ -62,10 +62,11 @@ class NozomiExtractor(Extractor): yield Message.Directory, post for post["num"], image in enumerate(images, 1): - post["url"] = url = text.urljoin(self.root, image["imageurl"]) - text.nameext_from_url(url, post) - post["is_video"] = bool(image.get("is_video")) - post["dataid"] = post["filename"] + post["filename"] = post["dataid"] = did = image["dataid"] + post["extension"] = ext = image["type"] + post["is_video"] = video = bool(image.get("is_video")) + post["url"] = url = "https://{}.nozomi.la/{}/{}/{}.{}".format( + "v" if video else "i", did[-1], did[-3:-1], did, ext) yield Message.Url, url, post def posts(self): @@ -109,7 +110,6 @@ class 
NozomiPostExtractor(NozomiExtractor): "height" : 768, "is_video" : False, "postid" : 3649262, - "source" : "danbooru", "tags" : list, "type" : "jpg", "url" : str, @@ -119,7 +119,7 @@ class NozomiPostExtractor(NozomiExtractor): # multiple images per post ("https://nozomi.la/post/25588032.html", { "url": "6aa3b7db385abcc9d374bdffd19187bccbf8f228", - "keyword": "f60e048df36308b6b25dfaac419b586895d360bc", + "keyword": "2a2998af93c6438863c4077bd386b613b8bc2957", "count": 7, }), # empty 'date' (#1163) @@ -160,7 +160,7 @@ class NozomiTagExtractor(NozomiExtractor): archive_fmt = "t_{search_tags}_{dataid}" pattern = r"(?:https?://)?nozomi\.la/tag/([^/?#]+)-(\d+)\." test = ("https://nozomi.la/tag/3:1_aspect_ratio-1.html", { - "pattern": r"^https://i.nozomi.la/\w/\w\w/\w+\.\w+$", + "pattern": r"^https://[iv]\.nozomi\.la/\w/\w\w/\w+\.\w+$", "count": ">= 25", "range": "1-25", }) diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py index 3a4fb0e..1111c3a 100644 --- a/gallery_dl/extractor/redgifs.py +++ b/gallery_dl/extractor/redgifs.py @@ -10,6 +10,7 @@ from .common import Extractor, Message from .. import text +from ..cache import cache class RedgifsExtractor(Extractor): @@ -88,7 +89,7 @@ class RedgifsSearchExtractor(RedgifsExtractor): pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/browse/?\?([^#]+)" test = ( ("https://www.redgifs.com/browse?tags=JAV", { - "pattern": r"https://\w+\.redgifs\.com/[A-Za-z]+\.mp4", + "pattern": r"https://\w+\.redgifs\.com/[A-Za-z-]+\.mp4", "range": "1-10", "count": 10, }), @@ -131,6 +132,12 @@ class RedgifsAPI(): def __init__(self, extractor): self.extractor = extractor + self.headers = { + "Referer" : extractor.root + "/", + "authorization": "Bearer " + self._fetch_bearer_token(extractor), + "content-type" : "application/json", + "Origin" : extractor.root, + } def gif(self, gif_id): endpoint = "/v2/gifs/" + gif_id.lower() @@ -149,7 +156,8 @@ class RedgifsAPI(): def _call(self, endpoint, params=None): url = self.API_ROOT + endpoint - return self.extractor.request(url, params=params).json() + return self.extractor.request( + url, params=params, headers=self.headers).json() def _pagination(self, endpoint, params): params["page"] = 1 @@ -161,3 +169,17 @@ class RedgifsAPI(): if params["page"] >= data["pages"]: return params["page"] += 1 + + @cache(maxage=3600) + def _fetch_bearer_token(self, extr): + extr.log.debug("Retrieving Bearer token") + + page = extr.request(extr.root + "/").text + index = text.extract(page, "/assets/js/index", ".js")[0] + + url = extr.root + "/assets/js/index" + index + ".js" + page = extr.request(url, encoding="utf-8").text + token = "ey" + text.extract(page, '="ey', '"')[0] + + extr.log.debug("Token: '%s'", token) + return token diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 447ce00..324a3c6 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -17,7 +17,7 @@ import re BASE_PATTERN = ( r"(?:tumblr:(?:https?://)?([^/]+)|" r"(?:https?://)?" 
- r"(?:www\.tumblr\.com/blog/(?:view/)?([\w-]+)|" + r"(?:www\.tumblr\.com/(?:blog/(?:view/)?)?([\w-]+)|" r"([\w-]+\.tumblr\.com)))" ) @@ -250,9 +250,9 @@ class TumblrExtractor(Extractor): return updated, (resized == updated) def _original_image_fallback(self, url, post_id): - yield self._update_image_token(url)[0] - yield self._update_image_token(url)[0] - yield self._update_image_token(url)[0] + for _ in range(3): + self.sleep(120, "image token") + yield self._update_image_token(url)[0] self.log.warning("Unable to fetch higher-resolution " "version of %s (%s)", url, post_id) @@ -298,6 +298,7 @@ class TumblrUserExtractor(TumblrExtractor): ("tumblr:www.b-authentique.com"), ("https://www.tumblr.com/blog/view/smarties-art"), ("https://www.tumblr.com/blog/smarties-art"), + ("https://www.tumblr.com/smarties-art"), ) def posts(self): @@ -354,6 +355,8 @@ class TumblrPostExtractor(TumblrExtractor): }), ("http://demo.tumblr.com/image/459265350"), ("https://www.tumblr.com/blog/view/smarties-art/686047436641353728"), + ("https://www.tumblr.com/blog/smarties-art/686047436641353728"), + ("https://www.tumblr.com/smarties-art/686047436641353728"), ) def __init__(self, match): @@ -381,6 +384,8 @@ class TumblrTagExtractor(TumblrExtractor): "count": 1, }), ("https://www.tumblr.com/blog/view/smarties-art/tagged/undertale"), + ("https://www.tumblr.com/blog/smarties-art/tagged/undertale"), + ("https://www.tumblr.com/smarties-art/tagged/undertale"), ) def __init__(self, match): @@ -402,6 +407,8 @@ class TumblrLikesExtractor(TumblrExtractor): "count": 1, }), ("https://www.tumblr.com/blog/view/mikf123/likes"), + ("https://www.tumblr.com/blog/mikf123/likes"), + ("https://www.tumblr.com/mikf123/likes"), ) def posts(self): @@ -435,11 +442,15 @@ class TumblrAPI(oauth.OAuth1API): def posts(self, blog, params): """Retrieve published posts""" - params.update({"offset": 0, "limit": 50, "reblog_info": "true"}) + params["offset"] = self.extractor.config("offset") or 0 + params["limit"] = 50 + params["reblog_info"] = "true" + if self.posts_type: params["type"] = self.posts_type if self.before: params["before"] = self.before + while True: data = self._call(blog, "posts", params) self.BLOG_CACHE[blog] = data["blog"] diff --git a/gallery_dl/extractor/unsplash.py b/gallery_dl/extractor/unsplash.py index 623ed94..8bea18c 100644 --- a/gallery_dl/extractor/unsplash.py +++ b/gallery_dl/extractor/unsplash.py @@ -210,7 +210,7 @@ class UnsplashCollectionExtractor(UnsplashExtractor): class UnsplashSearchExtractor(UnsplashExtractor): """Extractor for unsplash search results""" subcategory = "search" - pattern = BASE_PATTERN + r"/s/photos/([^/?#]+)(?:\?([^/?#]+))?" + pattern = BASE_PATTERN + r"/s/photos/([^/?#]+)(?:\?([^#]+))?" 
test = ("https://unsplash.com/s/photos/hair-style", { "pattern": r"https://images\.unsplash\.com/((flagged/)?photo-\d+-\w+" r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$", diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py index 25b00fe..9b6831b 100644 --- a/gallery_dl/extractor/vk.py +++ b/gallery_dl/extractor/vk.py @@ -79,7 +79,8 @@ class VkExtractor(Extractor): if len(payload) < 4: self.log.debug(payload) - raise exception.AuthorizationError(payload[0]) + raise exception.AuthorizationError( + text.unescape(payload[0]) if payload[0] else None) total = payload[1] photos = payload[3] @@ -103,7 +104,7 @@ class VkPhotosExtractor(VkExtractor): subcategory = "photos" pattern = (BASE_PATTERN + r"/(?:" r"(?:albums|photos|id)(-?\d+)" - r"|(?!album-?\d+_)([^/?#]+))") + r"|(?!(?:album|tag)-?\d+_?)([^/?#]+))") test = ( ("https://vk.com/id398982326", { "pattern": r"https://sun\d+-\d+\.userapi\.com/s/v1/if1" @@ -182,9 +183,6 @@ class VkAlbumExtractor(VkExtractor): directory_fmt = ("{category}", "{user[id]}", "{album[id]}") pattern = BASE_PATTERN + r"/album(-?\d+)_(\d+)$" test = ( - ("https://vk.com/album232175027_00", { - "count": 8, - }), ("https://vk.com/album-165740836_281339889", { "count": 12, }), @@ -192,6 +190,9 @@ class VkAlbumExtractor(VkExtractor): ("https://vk.com/album-53775183_00", { "exception": exception.AuthorizationError, }), + ("https://vk.com/album232175027_00", { + "exception": exception.AuthorizationError, + }), ) def __init__(self, match): @@ -207,3 +208,25 @@ class VkAlbumExtractor(VkExtractor): "user": {"id": self.user_id}, "album": {"id": self.album_id}, } + + +class VkTaggedExtractor(VkExtractor): + """Extractor for a vk tagged photos""" + subcategory = "tagged" + directory_fmt = ("{category}", "{user[id]}", "tags") + pattern = BASE_PATTERN + r"/tag(-?\d+)$" + test = ( + ("https://vk.com/tag304303884", { + "count": 44, + }), + ) + + def __init__(self, match): + VkExtractor.__init__(self, match) + self.user_id = match.group(1) + + def photos(self): + return self._pagination("tag{}".format(self.user_id)) + + def metadata(self): + return {"user": {"id": self.user_id}} diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py index 0ad8523..47451bd 100644 --- a/gallery_dl/extractor/wallhaven.py +++ b/gallery_dl/extractor/wallhaven.py @@ -52,7 +52,7 @@ class WallhavenSearchExtractor(WallhavenExtractor): subcategory = "search" directory_fmt = ("{category}", "{search[q]}") archive_fmt = "s_{search[q]}_{id}" - pattern = r"(?:https?://)?wallhaven\.cc/search(?:/?\?([^/?#]+))?" + pattern = r"(?:https?://)?wallhaven\.cc/search(?:/?\?([^#]+))?" 
test = ( ("https://wallhaven.cc/search?q=touhou"), (("https://wallhaven.cc/search?q=id%3A87" diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 7b22b1d..2f48ffd 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -8,7 +8,6 @@ import sys import json -import time import errno import logging import functools @@ -74,9 +73,10 @@ class Job(): log = extractor.log msg = None - sleep = util.build_duration_func(extractor.config("sleep-extractor")) + sleep = util.build_duration_func( + extractor.config("sleep-extractor")) if sleep: - time.sleep(sleep()) + extractor.sleep(sleep(), "extractor") try: for msg in extractor: @@ -238,7 +238,7 @@ class DownloadJob(Job): return if self.sleep: - time.sleep(self.sleep()) + self.extractor.sleep(self.sleep(), "download") # download from URL if not self.download(url): @@ -527,11 +527,11 @@ class SimulationJob(DownloadJob): if not kwdict["extension"]: kwdict["extension"] = "jpg" self.pathfmt.set_filename(kwdict) - self.out.skip(self.pathfmt.path) if self.sleep: - time.sleep(self.sleep()) + self.extractor.sleep(self.sleep(), "download") if self.archive: self.archive.add(kwdict) + self.out.skip(self.pathfmt.path) def handle_directory(self, kwdict): if not self.pathfmt: @@ -697,17 +697,18 @@ class DataJob(Job): self.ascii = config.get(("output",), "ascii", ensure_ascii) private = config.get(("output",), "private") - self.filter = util.identity if private else util.filter_dict + self.filter = dict.copy if private else util.filter_dict def run(self): + extractor = self.extractor sleep = util.build_duration_func( - self.extractor.config("sleep-extractor")) + extractor.config("sleep-extractor")) if sleep: - time.sleep(sleep()) + extractor.sleep(sleep(), "extractor") # collect data try: - for msg in self.extractor: + for msg in extractor: self.dispatch(msg) except exception.StopExtraction: pass diff --git a/gallery_dl/path.py b/gallery_dl/path.py index 84ee7af..28c07c3 100644 --- a/gallery_dl/path.py +++ b/gallery_dl/path.py @@ -105,6 +105,9 @@ class PathFormat(): strip = ". 
" self.strip = strip + if WINDOWS: + self.extended = config("path-extended", True) + basedir = extractor._parentdir if not basedir: basedir = config("base-directory") @@ -178,7 +181,7 @@ class PathFormat(): else: self.directory = directory = self.basedirectory - if WINDOWS: + if WINDOWS and self.extended: # Enable longer-than-260-character paths directory = os.path.abspath(directory) if directory.startswith("\\\\"): diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py index d9baed3..b21e483 100644 --- a/gallery_dl/postprocessor/metadata.py +++ b/gallery_dl/postprocessor/metadata.py @@ -19,15 +19,9 @@ class MetadataPP(PostProcessor): def __init__(self, job, options): PostProcessor.__init__(self, job) - mode = options.get("mode", "json") - if mode == "custom": - self.write = self._write_custom - cfmt = options.get("content-format") or options.get("format") - if isinstance(cfmt, list): - cfmt = "\n".join(cfmt) + "\n" - self._content_fmt = formatter.parse(cfmt).format_map - ext = "txt" - elif mode == "tags": + mode = options.get("mode") + cfmt = options.get("content-format") or options.get("format") + if mode == "tags": self.write = self._write_tags ext = "txt" elif mode == "modify": @@ -41,6 +35,12 @@ class MetadataPP(PostProcessor): self.run = self._run_delete self.fields = options.get("fields") ext = None + elif mode == "custom" or not mode and cfmt: + self.write = self._write_custom + if isinstance(cfmt, list): + cfmt = "\n".join(cfmt) + "\n" + self._content_fmt = formatter.parse(cfmt).format_map + ext = "txt" else: self.write = self._write_json self.indent = options.get("indent", 4) diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 4ba1cba..1650b0a 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -418,6 +418,82 @@ CODES = { } +def parse_inputfile(file, log): + """Filter and process strings from an input file. + + Lines starting with '#' and empty lines will be ignored. + Lines starting with '-' will be interpreted as a key-value pair separated + by an '='. where 'key' is a dot-separated option name and 'value' is a + JSON-parsable value. These configuration options will be applied while + processing the next URL. + Lines starting with '-G' are the same as above, except these options will + be applied for *all* following URLs, i.e. they are Global. + Everything else will be used as a potential URL. + + Example input file: + + # settings global options + -G base-directory = "/tmp/" + -G skip = false + + # setting local options for the next URL + -filename="spaces_are_optional.jpg" + -skip = true + + https://example.org/ + + # next URL uses default filename and 'skip' is false. 
+ https://example.com/index.htm # comment1 + https://example.com/404.htm # comment2 + """ + gconf = [] + lconf = [] + strip_comment = None + + for line in file: + line = line.strip() + + if not line or line[0] == "#": + # empty line or comment + continue + + elif line[0] == "-": + # config spec + if len(line) >= 2 and line[1] == "G": + conf = gconf + line = line[2:] + else: + conf = lconf + line = line[1:] + + key, sep, value = line.partition("=") + if not sep: + log.warning("input file: invalid <key>=<value> pair: %s", line) + continue + + try: + value = json.loads(value.strip()) + except ValueError as exc: + log.warning("input file: unable to parse '%s': %s", value, exc) + continue + + key = key.strip().split(".") + conf.append((key[:-1], key[-1], value)) + + else: + # url + if " #" in line or "\t#" in line: + if strip_comment is None: + strip_comment = re.compile(r"\s+#.*").sub + line = strip_comment("", line) + if gconf or lconf: + yield ExtendedUrl(line, gconf, lconf) + gconf = [] + lconf = [] + else: + yield line + + class UniversalNone(): """None-style object that supports more operations than None itself""" __slots__ = () diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 13cb9a0..f758857 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.23.2" +__version__ = "1.23.3"
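
For illustration, a minimal sketch of what the lazily-compiled comment regex in the relocated util.parse_inputfile() does to trailing URL comments; the URLs are the ones from its own docstring example:

import re

# compiled on first use inside parse_inputfile(); shown standalone here
strip_comment = re.compile(r"\s+#.*").sub

print(strip_comment("", "https://example.com/index.htm  # comment1"))
# https://example.com/index.htm
print(strip_comment("", "https://example.com/404.htm\t# comment2"))
# https://example.com/404.htm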
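
The nozomi.la change above builds media URLs directly from a post's dataid instead of the API's imageurl field. A sketch of the resulting URL layout, assuming a made-up dataid value:

# dataid values are hex digests taken from the API; this one is made up
dataid, ext, is_video = "49d69d5a7d8e32cb", "jpg", False
url = "https://{}.nozomi.la/{}/{}/{}.{}".format(
    "v" if is_video else "i",    # videos are served from v.nozomi.la
    dataid[-1],                  # first path segment: last hex digit
    dataid[-3:-1],               # second segment: the two digits before it
    dataid, ext)
print(url)  # https://i.nozomi.la/b/2c/49d69d5a7d8e32cb.jpg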
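
Likewise, the new danbooru _ugoira_frames() helper maps the frame-delay list from the media_metadata endpoint onto zero-padded frame filenames. A sketch with made-up delay values:

ext = "jpg"                     # from "ZIP:ZipFileName" in practice
delays = [100, 100, 150]        # "Ugoira:FrameDelays"; values made up
fmt = ("{:>06}." + ext).format  # six-digit, zero-padded frame names
frames = [{"file": fmt(index), "delay": delay}
          for index, delay in enumerate(delays)]
print(frames)
# [{'file': '000000.jpg', 'delay': 100}, {'file': '000001.jpg', 'delay': 100},
#  {'file': '000002.jpg', 'delay': 150}]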
