Diffstat (limited to 'gallery_dl')
49 files changed, 1297 insertions, 396 deletions
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 1d4215e..4b39c15 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -105,6 +105,11 @@ def main(): output.ANSI = True + # filter environment + filterenv = config.get((), "filters-environment", True) + if not filterenv: + util.compile_expression = util.compile_expression_raw + # format string separator separator = config.get((), "format-separator") if separator: @@ -145,6 +150,10 @@ def main(): log.debug("Configuration Files %s", config._files) + if args.print_traffic: + import requests + requests.packages.urllib3.connection.HTTPConnection.debuglevel = 1 + # extractor modules modules = config.get(("extractor",), "modules") if modules is not None: @@ -240,6 +249,9 @@ def main(): if config.get(("output",), "fallback", True): jobtype.handle_url = \ staticmethod(jobtype.handle_url_fallback) + elif args.dump_json: + jobtype = job.DataJob + jobtype.resolve = args.dump_json - 1 else: jobtype = args.jobtype or job.DownloadJob @@ -299,6 +311,8 @@ def main(): else: input_manager.success() + except exception.StopExtraction: + pass except exception.TerminateExtraction: pass except exception.RestartExtraction: diff --git a/gallery_dl/actions.py b/gallery_dl/actions.py index 883e38b..668032d 100644 --- a/gallery_dl/actions.py +++ b/gallery_dl/actions.py @@ -9,8 +9,10 @@ """ """ import re +import time import logging import operator +import functools from . import util, exception @@ -19,29 +21,100 @@ def parse(actionspec): actionspec = actionspec.items() actions = {} - actions[logging.DEBUG] = actions_d = [] - actions[logging.INFO] = actions_i = [] - actions[logging.WARNING] = actions_w = [] - actions[logging.ERROR] = actions_e = [] + actions[-logging.DEBUG] = actions_bd = [] + actions[-logging.INFO] = actions_bi = [] + actions[-logging.WARNING] = actions_bw = [] + actions[-logging.ERROR] = actions_be = [] + actions[logging.DEBUG] = actions_ad = [] + actions[logging.INFO] = actions_ai = [] + actions[logging.WARNING] = actions_aw = [] + actions[logging.ERROR] = actions_ae = [] for event, spec in actionspec: level, _, pattern = event.partition(":") - type, _, args = spec.partition(" ") - action = (re.compile(pattern).search, ACTIONS[type](args)) + search = re.compile(pattern).search if pattern else util.true + + if isinstance(spec, str): + type, _, args = spec.partition(" ") + before, after = ACTIONS[type](args) + else: + actions_before = [] + actions_after = [] + for s in spec: + type, _, args = s.partition(" ") + before, after = ACTIONS[type](args) + if before: + actions_before.append(before) + if after: + actions_after.append(after) + before = _chain_actions(actions_before) + after = _chain_actions(actions_after) level = level.strip() if not level or level == "*": - actions_d.append(action) - actions_i.append(action) - actions_w.append(action) - actions_e.append(action) + if before: + action = (search, before) + actions_bd.append(action) + actions_bi.append(action) + actions_bw.append(action) + actions_be.append(action) + if after: + action = (search, after) + actions_ad.append(action) + actions_ai.append(action) + actions_aw.append(action) + actions_ae.append(action) else: - - actions[_level_to_int(level)].append(action) + level = _level_to_int(level) + if before: + actions[-level].append((search, before)) + if after: + actions[level].append((search, after)) return actions +class LoggerAdapter(): + + def __init__(self, logger, job): + self.logger = logger + self.extra = job._logger_extra + self.actions = 
job._logger_actions + + self.debug = functools.partial(self.log, logging.DEBUG) + self.info = functools.partial(self.log, logging.INFO) + self.warning = functools.partial(self.log, logging.WARNING) + self.error = functools.partial(self.log, logging.ERROR) + + def log(self, level, msg, *args, **kwargs): + msg = str(msg) + if args: + msg = msg % args + + before = self.actions[-level] + after = self.actions[level] + + if before: + args = self.extra.copy() + args["level"] = level + + for cond, action in before: + if cond(msg): + action(args) + + level = args["level"] + + if self.logger.isEnabledFor(level): + kwargs["extra"] = self.extra + self.logger._log(level, msg, (), **kwargs) + + if after: + args = self.extra.copy() + for cond, action in after: + if cond(msg): + action(args) + + def _level_to_int(level): try: return logging._nameToLevel[level] @@ -49,10 +122,19 @@ def _level_to_int(level): return int(level) +def _chain_actions(actions): + def _chain(args): + for action in actions: + action(args) + return _chain + + +# -------------------------------------------------------------------- + def action_print(opts): def _print(_): print(opts) - return _print + return None, _print def action_status(opts): @@ -69,7 +151,7 @@ def action_status(opts): def _status(args): args["job"].status = op(args["job"].status, value) - return _status + return _status, None def action_level(opts): @@ -77,17 +159,38 @@ def action_level(opts): def _level(args): args["level"] = level - return _level + return _level, None + + +def action_exec(opts): + def _exec(_): + util.Popen(opts, shell=True).wait() + return None, _exec def action_wait(opts): - def _wait(args): - input("Press Enter to continue") - return _wait + if opts: + seconds = util.build_duration_func(opts) + + def _wait(args): + time.sleep(seconds()) + else: + def _wait(args): + input("Press Enter to continue") + + return None, _wait + + +def action_abort(opts): + return None, util.raises(exception.StopExtraction) + + +def action_terminate(opts): + return None, util.raises(exception.TerminateExtraction) def action_restart(opts): - return util.raises(exception.RestartExtraction) + return None, util.raises(exception.RestartExtraction) def action_exit(opts): @@ -98,14 +201,17 @@ def action_exit(opts): def _exit(args): raise SystemExit(opts) - return _exit + return None, _exit ACTIONS = { - "print" : action_print, - "status" : action_status, - "level" : action_level, - "restart": action_restart, - "wait" : action_wait, - "exit" : action_exit, + "abort" : action_abort, + "exec" : action_exec, + "exit" : action_exit, + "level" : action_level, + "print" : action_print, + "restart" : action_restart, + "status" : action_status, + "terminate": action_terminate, + "wait" : action_wait, } diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index 47f78a7..f017929 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -25,7 +25,7 @@ from . 
import aes, text, util SUPPORTED_BROWSERS_CHROMIUM = { - "brave", "chrome", "chromium", "edge", "opera", "vivaldi"} + "brave", "chrome", "chromium", "edge", "opera", "thorium", "vivaldi"} SUPPORTED_BROWSERS = SUPPORTED_BROWSERS_CHROMIUM | {"firefox", "safari"} logger = logging.getLogger("cookies") @@ -354,6 +354,7 @@ def _get_chromium_based_browser_settings(browser_name): "chromium": join(appdata_local, R"Chromium\User Data"), "edge" : join(appdata_local, R"Microsoft\Edge\User Data"), "opera" : join(appdata_roaming, R"Opera Software\Opera Stable"), + "thorium" : join(appdata_local, R"Thorium\User Data"), "vivaldi" : join(appdata_local, R"Vivaldi\User Data"), }[browser_name] @@ -365,6 +366,7 @@ def _get_chromium_based_browser_settings(browser_name): "chromium": join(appdata, "Chromium"), "edge" : join(appdata, "Microsoft Edge"), "opera" : join(appdata, "com.operasoftware.Opera"), + "thorium" : join(appdata, "Thorium"), "vivaldi" : join(appdata, "Vivaldi"), }[browser_name] @@ -377,6 +379,7 @@ def _get_chromium_based_browser_settings(browser_name): "chromium": join(config, "chromium"), "edge" : join(config, "microsoft-edge"), "opera" : join(config, "opera"), + "thorium" : join(config, "Thorium"), "vivaldi" : join(config, "vivaldi"), }[browser_name] @@ -390,6 +393,7 @@ def _get_chromium_based_browser_settings(browser_name): "edge" : "Microsoft Edge" if sys.platform == "darwin" else "Chromium", "opera" : "Opera" if sys.platform == "darwin" else "Chromium", + "thorium" : "Thorium", "vivaldi" : "Vivaldi" if sys.platform == "darwin" else "Chrome", }[browser_name] diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py index a4b0997..a5e8b27 100644 --- a/gallery_dl/extractor/8chan.py +++ b/gallery_dl/extractor/8chan.py @@ -27,7 +27,8 @@ class _8chanExtractor(Extractor): Extractor.__init__(self, match) def _init(self): - self.cookies.set("TOS", "1", domain=self.root.rpartition("/")[2]) + self.cookies.set( + "TOS20240718", "1", domain=self.root.rpartition("/")[2]) @memcache() def cookies_prepare(self): diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 6aff1f3..e103cb1 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -22,6 +22,7 @@ modules = [ "8chan", "8muses", "adultempire", + "agnph", "architizer", "artstation", "aryion", @@ -33,6 +34,7 @@ modules = [ "bunkr", "catbox", "chevereto", + "cien", "comicvine", "cyberdrop", "danbooru", @@ -42,7 +44,6 @@ modules = [ "e621", "erome", "exhentai", - "fallenangels", "fanbox", "fanleaks", "fantia", @@ -84,6 +85,7 @@ modules = [ "keenspot", "kemonoparty", "khinsider", + "koharu", "komikcast", "lensdump", "lexica", diff --git a/gallery_dl/extractor/agnph.py b/gallery_dl/extractor/agnph.py new file mode 100644 index 0000000..653b73f --- /dev/null +++ b/gallery_dl/extractor/agnph.py @@ -0,0 +1,113 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://agn.ph/""" + +from . import booru +from .. 
import text + +from xml.etree import ElementTree +import collections +import re + +BASE_PATTERN = r"(?:https?://)?agn\.ph" + + +class AgnphExtractor(booru.BooruExtractor): + category = "agnph" + root = "https://agn.ph" + page_start = 1 + per_page = 45 + + TAG_TYPES = { + "a": "artist", + "b": "copyright", + "c": "character", + "d": "species", + "m": "general", + } + + def _init(self): + self.cookies.set("confirmed_age", "true", domain="agn.ph") + + def _prepare(self, post): + post["date"] = text.parse_timestamp(post["created_at"]) + post["status"] = post["status"].strip() + post["has_children"] = ("true" in post["has_children"]) + + def _xml_to_dict(self, xml): + return {element.tag: element.text for element in xml} + + def _pagination(self, url, params): + params["api"] = "xml" + if "page" in params: + params["page"] = \ + self.page_start + text.parse_int(params["page"]) - 1 + else: + params["page"] = self.page_start + + while True: + data = self.request(url, params=params).text + root = ElementTree.fromstring(data) + + yield from map(self._xml_to_dict, root) + + attrib = root.attrib + if int(attrib["offset"]) + len(root) >= int(attrib["count"]): + return + + params["page"] += 1 + + def _html(self, post): + url = "{}/gallery/post/show/{}/".format(self.root, post["id"]) + return self.request(url).text + + def _tags(self, post, page): + tag_container = text.extr( + page, '<ul class="taglist">', '<h3>Statistics</h3>') + if not tag_container: + return + + tags = collections.defaultdict(list) + pattern = re.compile(r'class="(.)typetag">([^<]+)') + for tag_type, tag_name in pattern.findall(tag_container): + tags[tag_type].append(text.unquote(tag_name).replace(" ", "_")) + for key, value in tags.items(): + post["tags_" + self.TAG_TYPES[key]] = " ".join(value) + + +class AgnphTagExtractor(AgnphExtractor): + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + archive_fmt = "t_{search_tags}_{id}" + pattern = BASE_PATTERN + r"/gallery/post/(?:\?([^#]+))?$" + example = "https://agn.ph/gallery/post/?search=TAG" + + def __init__(self, match): + AgnphExtractor.__init__(self, match) + self.params = text.parse_query(self.groups[0]) + + def metadata(self): + return {"search_tags": self.params.get("search") or ""} + + def posts(self): + url = self.root + "/gallery/post/" + return self._pagination(url, self.params.copy()) + + +class AgnphPostExtractor(AgnphExtractor): + subcategory = "post" + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"/gallery/post/show/(\d+)" + example = "https://agn.ph/gallery/post/show/12345/" + + def posts(self): + url = "{}/gallery/post/show/{}/?api=xml".format( + self.root, self.groups[0]) + post = ElementTree.fromstring(self.request(url).text) + return (self._xml_to_dict(post),) diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py index ec86263..17b780e 100644 --- a/gallery_dl/extractor/aryion.py +++ b/gallery_dl/extractor/aryion.py @@ -79,18 +79,20 @@ class AryionExtractor(Extractor): def metadata(self): """Return general metadata""" - def _pagination_params(self, url, params=None): + def _pagination_params(self, url, params=None, needle=None): if params is None: params = {"p": 1} else: params["p"] = text.parse_int(params.get("p"), 1) + if needle is None: + needle = "class='gallery-item' id='" + while True: page = self.request(url, params=params).text cnt = 0 - for post_id in text.extract_iter( - page, "class='gallery-item' id='", "'"): + for post_id in text.extract_iter(page, needle, "'"): cnt += 1 yield post_id @@ -200,6 +202,21 @@ 
class AryionGalleryExtractor(AryionExtractor): return util.advance(self._pagination_next(url), self.offset) +class AryionFavoriteExtractor(AryionExtractor): + """Extractor for a user's favorites gallery""" + subcategory = "favorite" + directory_fmt = ("{category}", "{user!l}", "favorites") + archive_fmt = "f_{user}_{id}" + categorytransfer = True + pattern = BASE_PATTERN + r"/favorites/([^/?#]+)" + example = "https://aryion.com/g4/favorites/USER" + + def posts(self): + url = "{}/g4/favorites/{}".format(self.root, self.user) + return self._pagination_params( + url, None, "class='gallery-item favorite' id='") + + class AryionTagExtractor(AryionExtractor): """Extractor for tag searches on eka's portal""" subcategory = "tag" diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py index ad0caf9..f24059f 100644 --- a/gallery_dl/extractor/behance.py +++ b/gallery_dl/extractor/behance.py @@ -152,8 +152,16 @@ class BehanceGalleryExtractor(BehanceExtractor): continue if mtype == "image": - url = module["imageSizes"]["size_original"]["url"] - append((url, module)) + sizes = { + size["url"].rsplit("/", 2)[1]: size + for size in module["imageSizes"]["allAvailable"] + } + size = (sizes.get("source") or + sizes.get("max_3840") or + sizes.get("fs") or + sizes.get("hd") or + sizes.get("disp")) + append((size["url"], module)) elif mtype == "video": try: diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index cbd0e07..7e26f38 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -29,16 +29,21 @@ class BooruExtractor(BaseExtractor): url_key = self.config("url") if url_key: - self._file_url = operator.itemgetter(url_key) + if isinstance(url_key, (list, tuple)): + self._file_url = self._file_url_list + self._file_url_keys = url_key + else: + self._file_url = operator.itemgetter(url_key) for post in self.posts(): try: url = self._file_url(post) if url[0] == "/": url = self.root + url - except (KeyError, TypeError): - self.log.debug("Unable to fetch download URL for post %s " - "(md5: %s)", post.get("id"), post.get("md5")) + except Exception as exc: + self.log.debug("%s: %s", exc.__class__.__name__, exc) + self.log.warning("Unable to fetch download URL for post %s " + "(md5: %s)", post.get("id"), post.get("md5")) continue if fetch_html: @@ -73,6 +78,11 @@ class BooruExtractor(BaseExtractor): _file_url = operator.itemgetter("file_url") + def _file_url_list(self, post): + urls = (post[key] for key in self._file_url_keys if post.get(key)) + post["_fallback"] = it = iter(urls) + return next(it) + def _prepare(self, post): """Prepare a 'post's metadata""" diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index a093347..77f0de6 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -13,7 +13,7 @@ from .. import text BASE_PATTERN = ( r"(?:https?://)?(?:app\.)?(bunkr+" - r"\.(?:s[kiu]|ru|la|is|to|ac|black|cat|media|red|site|ws))" + r"\.(?:s[kiu]|fi|ru|la|is|to|ac|black|cat|media|red|site|ws))" ) LEGACY_DOMAINS = { diff --git a/gallery_dl/extractor/cien.py b/gallery_dl/extractor/cien.py new file mode 100644 index 0000000..bae86d0 --- /dev/null +++ b/gallery_dl/extractor/cien.py @@ -0,0 +1,199 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +"""Extractors for https://ci-en.net/""" + +from .common import Extractor, Message +from .. import text, util + +BASE_PATTERN = r"(?:https?://)?ci-en\.(?:net|dlsite\.com)" + + +class CienExtractor(Extractor): + category = "cien" + root = "https://ci-en.net" + request_interval = (1.0, 2.0) + + def __init__(self, match): + self.root = text.root_from_url(match.group(0)) + Extractor.__init__(self, match) + + def _init(self): + self.cookies.set("accepted_rating", "r18g", domain="ci-en.dlsite.com") + + def _pagination_articles(self, url, params): + data = {"_extractor": CienArticleExtractor} + params["page"] = text.parse_int(params.get("page"), 1) + + while True: + page = self.request(url, params=params).text + + for card in text.extract_iter( + page, ' class="c-cardCase-item', '</div>'): + article_url = text.extr(card, ' href="', '"') + yield Message.Queue, article_url, data + + if ' rel="next"' not in page: + return + params["page"] += 1 + + +class CienArticleExtractor(CienExtractor): + subcategory = "article" + filename_fmt = "{num:>02} {filename}.{extension}" + directory_fmt = ("{category}", "{author[name]}", "{post_id} {name}") + archive_fmt = "{post_id}_{num}" + pattern = BASE_PATTERN + r"/creator/(\d+)/article/(\d+)" + example = "https://ci-en.net/creator/123/article/12345" + + def items(self): + url = "{}/creator/{}/article/{}".format( + self.root, self.groups[0], self.groups[1]) + page = self.request(url, notfound="article").text + + post = util.json_loads(text.extr( + page, '<script type="application/ld+json">', '</script>'))[0] + + files = self._extract_files(post.get("articleBody") or page) + + post["post_url"] = url + post["post_id"] = text.parse_int(self.groups[1]) + post["count"] = len(files) + post["date"] = text.parse_datetime(post["datePublished"]) + + try: + del post["publisher"] + del post["sameAs"] + except Exception: + pass + + yield Message.Directory, post + for post["num"], file in enumerate(files, 1): + post.update(file) + if "extension" not in file: + text.nameext_from_url(file["url"], post) + yield Message.Url, file["url"], post + + def _extract_files(self, page): + files = [] + + filetypes = self.config("files") + if filetypes is None: + self._extract_files_image(page, files) + self._extract_files_video(page, files) + self._extract_files_download(page, files) + self._extract_files_gallery(page, files) + else: + generators = { + "image" : self._extract_files_image, + "video" : self._extract_files_video, + "download": self._extract_files_download, + "gallery" : self._extract_files_gallery, + "gallerie": self._extract_files_gallery, + } + if isinstance(filetypes, str): + filetypes = filetypes.split(",") + for ft in filetypes: + generators[ft.rstrip("s")](page, files) + + return files + + def _extract_files_image(self, page, files): + for image in text.extract_iter( + page, 'class="file-player-image"', "</figure>"): + size = text.extr(image, ' data-size="', '"') + w, _, h = size.partition("x") + + files.append({ + "url" : text.extr(image, ' data-raw="', '"'), + "width" : text.parse_int(w), + "height": text.parse_int(h), + "type" : "image", + }) + + def _extract_files_video(self, page, files): + for video in text.extract_iter( + page, "<vue-file-player", "</vue-file-player>"): + path = text.extr(video, ' base-path="', '"') + name = text.extr(video, ' file-name="', '"') + auth = text.extr(video, ' auth-key="', '"') + + file = text.nameext_from_url(name) + file["url"] = "{}video-web.mp4?{}".format(path, auth) + file["type"] = "video" + files.append(file) + + def 
_extract_files_download(self, page, files): + for download in text.extract_iter( + page, 'class="downloadBlock', "</div>"): + name = text.extr(download, "<p>", "<") + + file = text.nameext_from_url(name.rpartition(" ")[0]) + file["url"] = text.extr(download, ' href="', '"') + file["type"] = "download" + files.append(file) + + def _extract_files_gallery(self, page, files): + for gallery in text.extract_iter( + page, "<vue-image-gallery", "</vue-image-gallery>"): + + url = self.root + "/api/creator/gallery/images" + params = { + "hash" : text.extr(gallery, ' hash="', '"'), + "gallery_id": text.extr(gallery, ' gallery-id="', '"'), + "time" : text.extr(gallery, ' time="', '"'), + } + data = self.request(url, params=params).json() + url = self.root + "/api/creator/gallery/imagePath" + + for params["page"], params["file_id"] in enumerate( + data["imgList"]): + path = self.request(url, params=params).json()["path"] + + file = params.copy() + file["url"] = path + files.append(file) + + +class CienCreatorExtractor(CienExtractor): + subcategory = "creator" + pattern = BASE_PATTERN + r"/creator/(\d+)(?:/article(?:\?([^#]+))?)?/?$" + example = "https://ci-en.net/creator/123" + + def items(self): + url = "{}/creator/{}/article".format(self.root, self.groups[0]) + params = text.parse_query(self.groups[1]) + params["mode"] = "list" + return self._pagination_articles(url, params) + + +class CienRecentExtractor(CienExtractor): + subcategory = "recent" + pattern = BASE_PATTERN + r"/mypage/recent(?:\?([^#]+))?" + example = "https://ci-en.net/mypage/recent" + + def items(self): + url = self.root + "/mypage/recent" + params = text.parse_query(self.groups[0]) + return self._pagination_articles(url, params) + + +class CienFollowingExtractor(CienExtractor): + subcategory = "following" + pattern = BASE_PATTERN + r"/mypage/subscription(/following)?" 
+ example = "https://ci-en.net/mypage/subscription" + + def items(self): + url = self.root + "/mypage/subscription" + (self.groups[0] or "") + page = self.request(url).text + data = {"_extractor": CienCreatorExtractor} + + for subscription in text.extract_iter( + page, 'class="c-grid-subscriptionInfo', '</figure>'): + url = text.extr(subscription, ' href="', '"') + yield Message.Queue, url, data diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index d7a41bc..df70571 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -378,7 +378,7 @@ class Extractor(): useragent = self.config("user-agent") if useragent is None: useragent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; " - "rv:109.0) Gecko/20100101 Firefox/115.0") + "rv:128.0) Gecko/20100101 Firefox/128.0") elif useragent == "browser": useragent = _browser_useragent() headers["User-Agent"] = useragent @@ -390,6 +390,8 @@ class Extractor(): headers["Accept-Encoding"] = "gzip, deflate, br" else: headers["Accept-Encoding"] = "gzip, deflate" + if ZSTD: + headers["Accept-Encoding"] += ", zstd" referer = self.config("referer", self.referer) if referer: @@ -789,10 +791,11 @@ class BaseExtractor(Extractor): instances = () def __init__(self, match): - Extractor.__init__(self, match) if not self.category: + self.groups = match.groups() + self.match = match self._init_category() - self._cfgpath = ("extractor", self.category, self.subcategory) + Extractor.__init__(self, match) def _init_category(self): for index, group in enumerate(self.groups): @@ -911,13 +914,12 @@ _browser_cookies = {} HTTP_HEADERS = { "firefox": ( ("User-Agent", "Mozilla/5.0 ({}; " - "rv:109.0) Gecko/20100101 Firefox/115.0"), + "rv:128.0) Gecko/20100101 Firefox/128.0"), ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9," - "image/avif,image/webp,*/*;q=0.8"), + "image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8"), ("Accept-Language", "en-US,en;q=0.5"), ("Accept-Encoding", None), ("Referer", None), - ("DNT", "1"), ("Connection", "keep-alive"), ("Upgrade-Insecure-Requests", "1"), ("Cookie", None), @@ -991,6 +993,12 @@ try: except AttributeError: BROTLI = False +# detect zstandard support +try: + ZSTD = urllib3.response.HAS_ZSTD +except AttributeError: + ZSTD = False + # set (urllib3) warnings filter action = config.get((), "warnings", "default") if action: diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 2199cc8..a70710c 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -846,55 +846,6 @@ class DeviantartStatusExtractor(DeviantartExtractor): ) -class DeviantartPopularExtractor(DeviantartExtractor): - """Extractor for popular deviations""" - subcategory = "popular" - directory_fmt = ("{category}", "Popular", - "{popular[range]}", "{popular[search]}") - archive_fmt = "P_{popular[range]}_{popular[search]}_{index}.{extension}" - pattern = (r"(?:https?://)?www\.deviantart\.com/(?:" - r"(?:deviations/?)?\?order=(popular-[^/?#]+)" - r"|((?:[\w-]+/)*)(popular-[^/?#]+)" - r")/?(?:\?([^#]*))?") - example = "https://www.deviantart.com/popular-24-hours/" - - def __init__(self, match): - DeviantartExtractor.__init__(self, match) - self.user = "" - - trange1, path, trange2, query = match.groups() - query = text.parse_query(query) - self.search_term = query.get("q") - - trange = trange1 or trange2 or query.get("order", "") - if trange.startswith("popular-"): - trange = trange[8:] - self.time_range = { - "newest" : "now", - "most-recent" : 
"now", - "this-week" : "1week", - "this-month" : "1month", - "this-century": "alltime", - "all-time" : "alltime", - }.get(trange, "alltime") - - self.popular = { - "search": self.search_term or "", - "range" : trange or "all-time", - "path" : path.strip("/") if path else "", - } - - def deviations(self): - if self.time_range == "now": - return self.api.browse_newest(self.search_term, self.offset) - return self.api.browse_popular( - self.search_term, self.time_range, self.offset) - - def prepare(self, deviation): - DeviantartExtractor.prepare(self, deviation) - deviation["popular"] = self.popular - - class DeviantartTagExtractor(DeviantartExtractor): """Extractor for deviations from tag searches""" subcategory = "tag" @@ -1077,14 +1028,14 @@ class DeviantartGallerySearchExtractor(DeviantartExtractor): class DeviantartFollowingExtractor(DeviantartExtractor): """Extractor for user's watched users""" subcategory = "following" - pattern = BASE_PATTERN + "/about#watching$" + pattern = BASE_PATTERN + "/(?:about#)?watching" example = "https://www.deviantart.com/USER/about#watching" def items(self): - eclipse_api = DeviantartEclipseAPI(self) + api = DeviantartOAuthAPI(self) - for user in eclipse_api.user_watching(self.user, self.offset): - url = "{}/{}".format(self.root, user["username"]) + for user in api.user_friends(self.user): + url = "{}/{}".format(self.root, user["user"]["username"]) user["_extractor"] = DeviantartUserExtractor yield Message.Queue, url, user @@ -1095,7 +1046,7 @@ class DeviantartFollowingExtractor(DeviantartExtractor): class DeviantartOAuthAPI(): """Interface for the DeviantArt OAuth API - Ref: https://www.deviantart.com/developers/http/v1/20160316 + https://www.deviantart.com/developers/http/v1/20160316 """ CLIENT_ID = "5388" CLIENT_SECRET = "76b08c69cfb27f26d6161f9ab6d061a1" @@ -1188,29 +1139,6 @@ class DeviantartOAuthAPI(): "mature_content": self.mature} return self._pagination(endpoint, params, public=False, unpack=True) - def browse_newest(self, query=None, offset=0): - """Browse newest deviations""" - endpoint = "/browse/newest" - params = { - "q" : query, - "limit" : 120, - "offset" : offset, - "mature_content": self.mature, - } - return self._pagination(endpoint, params) - - def browse_popular(self, query=None, timerange=None, offset=0): - """Yield popular deviations""" - endpoint = "/browse/popular" - params = { - "q" : query, - "limit" : 120, - "timerange" : timerange, - "offset" : offset, - "mature_content": self.mature, - } - return self._pagination(endpoint, params) - def browse_tags(self, tag, offset=0): """ Browse a tag """ endpoint = "/browse/tags" @@ -1223,11 +1151,12 @@ class DeviantartOAuthAPI(): return self._pagination(endpoint, params) def browse_user_journals(self, username, offset=0): - """Yield all journal entries of a specific user""" - endpoint = "/browse/user/journals" - params = {"username": username, "offset": offset, "limit": 50, - "mature_content": self.mature, "featured": "false"} - return self._pagination(endpoint, params) + journals = filter( + lambda post: "/journal/" in post["url"], + self.user_profile_posts(username)) + if offset: + journals = util.advance(journals, offset) + return journals def collections(self, username, folder_id, offset=0): """Yield all Deviation-objects contained in a collection folder""" @@ -1339,16 +1268,10 @@ class DeviantartOAuthAPI(): "mature_content": self.mature} return self._pagination_list(endpoint, params) - @memcache(keyarg=1) - def user_profile(self, username): - """Get user profile information""" - 
endpoint = "/user/profile/" + username - return self._call(endpoint, fatal=False) - - def user_statuses(self, username, offset=0): - """Yield status updates of a specific user""" - endpoint = "/user/statuses/" - params = {"username": username, "offset": offset, "limit": 50} + def user_friends(self, username, offset=0): + """Get the users list of friends""" + endpoint = "/user/friends/" + username + params = {"limit": 50, "offset": offset, "mature_content": self.mature} return self._pagination(endpoint, params) def user_friends_watch(self, username): @@ -1376,6 +1299,27 @@ class DeviantartOAuthAPI(): endpoint, method="POST", public=False, fatal=False, ).get("success") + @memcache(keyarg=1) + def user_profile(self, username): + """Get user profile information""" + endpoint = "/user/profile/" + username + return self._call(endpoint, fatal=False) + + def user_profile_posts(self, username): + endpoint = "/user/profile/posts" + params = {"username": username, "limit": 50, + "mature_content": self.mature} + return self._pagination(endpoint, params) + + def user_statuses(self, username, offset=0): + """Yield status updates of a specific user""" + statuses = filter( + lambda post: "/status-update/" in post["url"], + self.user_profile_posts(username)) + if offset: + statuses = util.advance(statuses, offset) + return statuses + def authenticate(self, refresh_token_key): """Authenticate the application by requesting an access token""" self.headers["Authorization"] = \ @@ -1464,7 +1408,7 @@ class DeviantartOAuthAPI(): self.log.error(msg) return data - def _switch_tokens(self, results, params): + def _should_switch_tokens(self, results, params): if len(results) < params["limit"]: return True @@ -1496,7 +1440,7 @@ class DeviantartOAuthAPI(): results = [item["journal"] for item in results if "journal" in item] if extend: - if public and self._switch_tokens(results, params): + if public and self._should_switch_tokens(results, params): if self.refresh_token_key: self.log.debug("Switching to private access token") public = False @@ -1540,6 +1484,11 @@ class DeviantartOAuthAPI(): return params["offset"] = int(params["offset"]) + len(results) + def _pagination_list(self, endpoint, params, key="results"): + result = [] + result.extend(self._pagination(endpoint, params, False, key=key)) + return result + @staticmethod def _shared_content(results): """Return an iterable of shared deviations in 'results'""" @@ -1548,11 +1497,6 @@ class DeviantartOAuthAPI(): if "deviation" in item: yield item["deviation"] - def _pagination_list(self, endpoint, params, key="results"): - result = [] - result.extend(self._pagination(endpoint, params, False, key=key)) - return result - def _metadata(self, deviations): """Add extended metadata to each deviation object""" if len(deviations) <= self.limit: diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py index 26f2184..2f0230a 100644 --- a/gallery_dl/extractor/directlink.py +++ b/gallery_dl/extractor/directlink.py @@ -18,7 +18,8 @@ class DirectlinkExtractor(Extractor): filename_fmt = "{domain}/{path}/{filename}.{extension}" archive_fmt = filename_fmt pattern = (r"(?i)https?://(?P<domain>[^/?#]+)/(?P<path>[^?#]+\." 
- r"(?:jpe?g|jpe|png|gif|web[mp]|mp4|mkv|og[gmv]|opus))" + r"(?:jpe?g|jpe|png|gif|bmp|svg|web[mp]|avif|heic|psd" + r"|mp4|m4v|mov|mkv|og[gmv]|wav|mp3|opus|zip|rar|7z|pdf|swf))" r"(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?$") example = "https://en.wikipedia.org/static/images/project-logos/enwiki.png" diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py index 733d0d8..583869f 100644 --- a/gallery_dl/extractor/dynastyscans.py +++ b/gallery_dl/extractor/dynastyscans.py @@ -66,6 +66,8 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor): text.extr(group, ' alt="', '"')), "date" : text.parse_datetime(extr( '"icon-calendar"></i> ', '<'), "%b %d, %Y"), + "tags" : text.split_html(extr( + "class='tags'>", "<div id='chapter-actions'")), "lang" : "en", "language": "English", } diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py index 8c9da2f..e6d136f 100644 --- a/gallery_dl/extractor/erome.py +++ b/gallery_dl/extractor/erome.py @@ -46,18 +46,24 @@ class EromeExtractor(Extractor): page, 'href="https://www.erome.com/', '"', pos) urls = [] + date = None groups = page.split('<div class="media-group"') for group in util.advance(groups, 1): url = (text.extr(group, '<source src="', '"') or text.extr(group, 'data-src="', '"')) if url: urls.append(url) + if not date: + ts = text.extr(group, '?v=', '"') + if len(ts) > 1: + date = text.parse_timestamp(ts) data = { "album_id" : album_id, "title" : text.unescape(title), "user" : text.unquote(user), "count" : len(urls), + "date" : date, "_http_headers": {"Referer": url}, } diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 1805403..1b4f995 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -394,6 +394,9 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self.original = False return self.data["_url_1280"] + if " temporarily banned " in page: + raise exception.AuthorizationError("Temporarily Banned") + self._report_limits() return True diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py deleted file mode 100644 index 650a707..0000000 --- a/gallery_dl/extractor/fallenangels.py +++ /dev/null @@ -1,84 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2017-2023 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://www.fascans.com/""" - -from .common import ChapterExtractor, MangaExtractor -from .. 
import text, util - - -class FallenangelsChapterExtractor(ChapterExtractor): - """Extractor for manga chapters from fascans.com""" - category = "fallenangels" - pattern = (r"(?:https?://)?(manga|truyen)\.fascans\.com" - r"/manga/([^/?#]+)/([^/?#]+)") - example = "https://manga.fascans.com/manga/NAME/CHAPTER/" - - def __init__(self, match): - self.version, self.manga, self.chapter = match.groups() - url = "https://{}.fascans.com/manga/{}/{}/1".format( - self.version, self.manga, self.chapter) - ChapterExtractor.__init__(self, match, url) - - def metadata(self, page): - extr = text.extract_from(page) - lang = "vi" if self.version == "truyen" else "en" - chapter, sep, minor = self.chapter.partition(".") - return { - "manga" : extr('name="description" content="', ' Chapter '), - "title" : extr(': ', ' - Page 1'), - "chapter" : chapter, - "chapter_minor": sep + minor, - "lang" : lang, - "language": util.code_to_language(lang), - } - - @staticmethod - def images(page): - return [ - (img["page_image"], None) - for img in util.json_loads( - text.extr(page, "var pages = ", ";") - ) - ] - - -class FallenangelsMangaExtractor(MangaExtractor): - """Extractor for manga from fascans.com""" - chapterclass = FallenangelsChapterExtractor - category = "fallenangels" - pattern = r"(?:https?://)?((manga|truyen)\.fascans\.com/manga/[^/]+)/?$" - example = "https://manga.fascans.com/manga/NAME" - - def __init__(self, match): - url = "https://" + match.group(1) - self.lang = "vi" if match.group(2) == "truyen" else "en" - MangaExtractor.__init__(self, match, url) - - def chapters(self, page): - extr = text.extract_from(page) - results = [] - language = util.code_to_language(self.lang) - while extr('<li style="', '"'): - vol = extr('class="volume-', '"') - url = extr('href="', '"') - cha = extr('>', '<') - title = extr('<em>', '</em>') - - manga, _, chapter = cha.rpartition(" ") - chapter, dot, minor = chapter.partition(".") - results.append((url, { - "manga" : manga, - "title" : text.unescape(title), - "volume" : text.parse_int(vol), - "chapter" : text.parse_int(chapter), - "chapter_minor": dot + minor, - "lang" : self.lang, - "language": language, - })) - return results diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index 6040187..f48a984 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -113,6 +113,12 @@ class FuraffinityExtractor(Extractor): data["gender"] = rh(extr('>Gender</strong>', '</div>')) data["width"] = pi(extr("<span>", "x")) data["height"] = pi(extr("", "p")) + data["folders"] = folders = [] + for folder in extr( + "<h3>Listed in Folders</h3>", "</section>").split("</a>"): + folder = rh(folder) + if folder: + folders.append(folder) else: # old site layout data["title"] = text.unescape(extr("<h2>", "</h2>")) @@ -132,11 +138,14 @@ class FuraffinityExtractor(Extractor): data["_description"] = extr( '<td valign="top" align="left" width="70%" class="alt1" ' 'style="padding:8px">', ' </td>') + data["folders"] = () # folders not present in old layout data["artist_url"] = data["artist"].replace("_", "").lower() data["user"] = self.user or data["artist_url"] data["date"] = text.parse_timestamp(data["filename"].partition(".")[0]) data["description"] = self._process_description(data["_description"]) + data["thumbnail"] = "https://t.furaffinity.net/{}@600-{}.jpg".format( + post_id, path.rsplit("/", 2)[1]) return data diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index 8d8b8ad..fbbd26c 
100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -36,7 +36,9 @@ class GelbooruV02Extractor(booru.BooruExtractor): params["pid"] = self.page_start params["limit"] = self.per_page - post = None + post = total = None + count = 0 + while True: try: root = self._api_request(params) @@ -50,12 +52,29 @@ class GelbooruV02Extractor(booru.BooruExtractor): params["pid"] = 0 continue + if total is None: + try: + total = int(root.attrib["count"]) + self.log.debug("%s posts in total", total) + except Exception as exc: + total = 0 + self.log.debug( + "Failed to get total number of posts (%s: %s)", + exc.__class__.__name__, exc) + post = None for post in root: yield post.attrib - if len(root) < self.per_page: - return + num = len(root) + count += num + if num < self.per_page: + if not total or count >= total: + return + if not num: + self.log.debug("Empty response - Retrying") + continue + params["pid"] += 1 def _pagination_html(self, params): diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py index 97b7844..286ee38 100644 --- a/gallery_dl/extractor/hentainexus.py +++ b/gallery_dl/extractor/hentainexus.py @@ -70,10 +70,13 @@ class HentainexusGalleryExtractor(GalleryExtractor): for img in imgs: img["_http_headers"] = headers - return [ - (img["image"], img) - for img in imgs - ] + results = [] + for img in imgs: + try: + results.append((img["image"], img)) + except KeyError: + pass + return results @staticmethod def _decode(data): diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py index a2b51be..34fbabd 100644 --- a/gallery_dl/extractor/hotleak.py +++ b/gallery_dl/extractor/hotleak.py @@ -23,6 +23,12 @@ class HotleakExtractor(Extractor): def items(self): for post in self.posts(): + if self.type == "photo": + post["url"] = ( + post["url"] + .replace("/storage/storage/", "/storage/") + .replace("_thumb.", ".") + ) post["_http_expected_status"] = (404,) yield Message.Directory, post yield Message.Url, post["url"], post diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index 85446c0..345f51d 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -19,7 +19,7 @@ class ImagefapExtractor(Extractor): category = "imagefap" root = "https://www.imagefap.com" directory_fmt = ("{category}", "{gallery_id} {title}") - filename_fmt = "{category}_{gallery_id}_{filename}.{extension}" + filename_fmt = "{category}_{gallery_id}_{num:04}_{filename}.{extension}" archive_fmt = "{gallery_id}_{image_id}" request_interval = (2.0, 4.0) diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py index 2ae8cbe..f3098f1 100644 --- a/gallery_dl/extractor/inkbunny.py +++ b/gallery_dl/extractor/inkbunny.py @@ -246,14 +246,12 @@ class InkbunnyFollowingExtractor(InkbunnyExtractor): data = {"_extractor": InkbunnyUserExtractor} while True: - cnt = 0 for user in text.extract_iter( page, '<a class="widget_userNameSmall" href="', '"', page.index('id="changethumboriginal_form"')): - cnt += 1 yield Message.Queue, self.root + user, data - if cnt < 20: + if "<a title='next page' " not in page: return params["page"] += 1 page = self.request(url, params=params).text diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index f7a5cc7..dbe2df3 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -596,6 +596,22 @@ class InstagramTagExtractor(InstagramExtractor): return self.api.tags_media(self.item) 
+class InstagramInfoExtractor(InstagramExtractor): + """Extractor for an Instagram user's profile data""" + subcategory = "info" + pattern = USER_PATTERN + r"/info" + example = "https://www.instagram.com/USER/info/" + + def items(self): + screen_name = self.item + if screen_name.startswith("id:"): + user = self.api.user_by_id(screen_name[3:]) + else: + user = self.api.user_by_name(screen_name) + + return iter(((Message.Directory, user),)) + + class InstagramAvatarExtractor(InstagramExtractor): """Extractor for an Instagram user's avatar""" subcategory = "avatar" @@ -975,9 +991,9 @@ class InstagramGraphqlAPI(): if not info["has_next_page"]: return extr._update_cursor(None) elif not data["edges"]: - s = "" if self.item.endswith("s") else "s" + s = "" if self.extractor.item.endswith("s") else "s" raise exception.StopExtraction( - "%s'%s posts are private", self.item, s) + "%s'%s posts are private", self.extractor.item, s) variables["after"] = extr._update_cursor(info["end_cursor"]) diff --git a/gallery_dl/extractor/koharu.py b/gallery_dl/extractor/koharu.py new file mode 100644 index 0000000..979b1a2 --- /dev/null +++ b/gallery_dl/extractor/koharu.py @@ -0,0 +1,221 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://koharu.to/""" + +from .common import GalleryExtractor, Extractor, Message +from .. import text, exception +from ..cache import cache + +BASE_PATTERN = r"(?i)(?:https?://)?(?:koharu|anchira)\.to" + + +class KoharuExtractor(Extractor): + """Base class for koharu extractors""" + category = "koharu" + root = "https://koharu.to" + root_api = "https://api.koharu.to" + request_interval = (0.5, 1.5) + + def _init(self): + self.headers = { + "Accept" : "*/*", + "Referer": self.root + "/", + "Origin" : self.root, + } + + def _pagination(self, endpoint, params): + url_api = self.root_api + endpoint + + while True: + data = self.request( + url_api, params=params, headers=self.headers).json() + + try: + entries = data["entries"] + except KeyError: + return + + for entry in entries: + url = "{}/g/{}/{}".format( + self.root, entry["id"], entry["public_key"]) + entry["_extractor"] = KoharuGalleryExtractor + yield Message.Queue, url, entry + + try: + if data["limit"] * data["page"] >= data["total"]: + return + except Exception: + pass + params["page"] += 1 + + +class KoharuGalleryExtractor(KoharuExtractor, GalleryExtractor): + """Extractor for koharu galleries""" + filename_fmt = "{num:>03}.{extension}" + directory_fmt = ("{category}", "{id} {title}") + archive_fmt = "{id}_{num}" + request_interval = 0.0 + pattern = BASE_PATTERN + r"/(?:g|reader)/(\d+)/(\w+)" + example = "https://koharu.to/g/12345/67890abcde/" + + TAG_TYPES = { + 0 : "general", + 1 : "artist", + 2 : "circle", + 3 : "parody", + 4 : "magazine", + 5 : "character", + 6 : "", + 7 : "uploader", + 8 : "male", + 9 : "female", + 10: "mixed", + 11: "language", + 12: "other", + } + + def __init__(self, match): + GalleryExtractor.__init__(self, match) + self.gallery_url = None + + def _init(self): + self.headers = { + "Accept" : "*/*", + "Referer": self.root + "/", + "Origin" : self.root, + } + + self.fmt = self.config("format") + self.cbz = self.config("cbz", True) + + if self.cbz: + self.filename_fmt = "{id} {title}.{extension}" + self.directory_fmt = ("{category}",) + + def metadata(self, _): + url = 
"{}/books/detail/{}/{}".format( + self.root_api, self.groups[0], self.groups[1]) + self.data = data = self.request(url, headers=self.headers).json() + + tags = [] + for tag in data["tags"]: + name = tag["name"] + namespace = tag.get("namespace", 0) + tags.append(self.TAG_TYPES[namespace] + ":" + name) + data["tags"] = tags + data["date"] = text.parse_timestamp(data["created_at"] // 1000) + + try: + if self.cbz: + data["count"] = len(data["thumbnails"]["entries"]) + del data["thumbnails"] + del data["rels"] + except Exception: + pass + + return data + + def images(self, _): + data = self.data + fmt = self._select_format(data["data"]) + + url = "{}/books/data/{}/{}/{}/{}".format( + self.root_api, + data["id"], data["public_key"], + fmt["id"], fmt["public_key"], + ) + params = { + "v": data["updated_at"], + "w": fmt["w"], + } + + if self.cbz: + params["action"] = "dl" + base = self.request( + url, method="POST", params=params, headers=self.headers, + ).json()["base"] + url = "{}?v={}&w={}".format(base, data["updated_at"], fmt["w"]) + info = text.nameext_from_url(base) + if not info["extension"]: + info["extension"] = "cbz" + return ((url, info),) + + data = self.request(url, params=params, headers=self.headers).json() + base = data["base"] + + results = [] + for entry in data["entries"]: + dimensions = entry["dimensions"] + info = { + "w": dimensions[0], + "h": dimensions[1], + "_http_headers": self.headers, + } + results.append((base + entry["path"], info)) + return results + + def _select_format(self, formats): + if not self.fmt or self.fmt == "original": + fmtid = "0" + else: + fmtid = str(self.fmt) + + try: + fmt = formats[fmtid] + except KeyError: + raise exception.NotFoundError("format") + + fmt["w"] = fmtid + return fmt + + +class KoharuSearchExtractor(KoharuExtractor): + """Extractor for koharu search results""" + subcategory = "search" + pattern = BASE_PATTERN + r"/\?([^#]*)" + example = "https://koharu.to/?s=QUERY" + + def items(self): + params = text.parse_query(self.groups[0]) + params["page"] = text.parse_int(params.get("page"), 1) + return self._pagination("/books", params) + + +class KoharuFavoriteExtractor(KoharuExtractor): + """Extractor for koharu favorites""" + subcategory = "favorite" + pattern = BASE_PATTERN + r"/favorites(?:\?([^#]*))?" 
+ example = "https://koharu.to/favorites" + + def items(self): + self.login() + + params = text.parse_query(self.groups[0]) + params["page"] = text.parse_int(params.get("page"), 1) + return self._pagination("/favorites", params) + + def login(self): + username, password = self._get_auth_info() + if username: + self.headers["Authorization"] = \ + "Bearer " + self._login_impl(username, password) + return + + raise exception.AuthenticationError("Username and password required") + + @cache(maxage=86400, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + + url = "https://auth.koharu.to/login" + data = {"uname": username, "passwd": password} + response = self.request( + url, method="POST", headers=self.headers, data=data) + + return response.json()["session"] diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 60cca22..b01c591 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -120,7 +120,8 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): ] else: pos = page.find('id="view-center"') + 1 - return (text.extr(page, 'itemprop="image" src="', '"', pos),) + # do NOT use text.extr() here, as it doesn't support a pos argument + return (text.extract(page, 'itemprop="image" src="', '"', pos)[0],) @staticmethod def _extract_user_name(page): diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py index b21e1eb..2330b08 100644 --- a/gallery_dl/extractor/paheal.py +++ b/gallery_dl/extractor/paheal.py @@ -77,6 +77,7 @@ class PahealTagExtractor(PahealExtractor): pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net" r"/post/list/([^/?#]+)") example = "https://rule34.paheal.net/post/list/TAG/1" + page_start = 1 per_page = 70 def __init__(self, match): @@ -87,11 +88,16 @@ class PahealTagExtractor(PahealExtractor): if self.config("metadata"): self._extract_data = self._extract_data_ex + def skip(self, num): + pages = num // self.per_page + self.page_start += pages + return pages * self.per_page + def get_metadata(self): return {"search_tags": self.tags} def get_posts(self): - pnum = 1 + pnum = self.page_start while True: url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum) page = self.request(url).text diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py index 115de9a..271fa50 100644 --- a/gallery_dl/extractor/readcomiconline.py +++ b/gallery_dl/extractor/readcomiconline.py @@ -78,12 +78,16 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor): } def images(self, page): - return [ - (beau(url), None) - for url in text.extract_iter( - page, "lstImages.push('", "'", - ) - ] + results = [] + + for block in page.split(" pth = '")[1:]: + pth = text.extr(block, "", "'") + for needle, repl in re.findall( + r"pth = pth\.replace\(/([^/]+)/g, [\"']([^\"']*)", block): + pth = pth.replace(needle, repl) + results.append((beau(pth), None)) + + return results class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor): @@ -116,9 +120,9 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor): def beau(url): - """https://readcomiconline.li/Scripts/rguard.min.js""" - url = url.replace("_x236", "d") - url = url.replace("_x945", "g") + """https://readcomiconline.li/Scripts/rguard.min.js?v=1.5.1""" + url = url.replace("pw_.g28x", "b") + url = url.replace("d2pr.x_27", "h") if url.startswith("https"): return url @@ -126,8 +130,8 @@ def beau(url): url, sep, rest = 
url.partition("?") containsS0 = "=s0" in url url = url[:-3 if containsS0 else -6] - url = url[4:22] + url[25:] - url = url[0:-6] + url[-2:] + url = url[15:33] + url[50:] + url = url[0:-11] + url[-2:] url = binascii.a2b_base64(url).decode() url = url[0:13] + url[17:] url = url[0:-2] + ("=s0" if containsS0 else "=s1600") diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py index 327bcd1..506f6ac 100644 --- a/gallery_dl/extractor/redgifs.py +++ b/gallery_dl/extractor/redgifs.py @@ -190,7 +190,7 @@ class RedgifsImageExtractor(RedgifsExtractor): r"(?:\w+\.)?redgifs\.com/(?:watch|ifr)|" r"(?:\w+\.)?gfycat\.com(?:/gifs/detail|/\w+)?|" r"(?:www\.)?gifdeliverynetwork\.com|" - r"i\.redgifs\.com/i)/([A-Za-z]+)") + r"i\.redgifs\.com/i)/([A-Za-z0-9]+)") example = "https://redgifs.com/watch/ID" def gifs(self): diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index caf3e16..ad3efa7 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -16,7 +16,7 @@ import collections import re BASE_PATTERN = r"(?:https?://)?" \ - r"(?:(?:chan|beta|black|white)\.sankakucomplex\.com|sankaku\.app)" \ + r"(?:(?:chan|www|beta|black|white)\.sankakucomplex\.com|sankaku\.app)" \ r"(?:/[a-z]{2})?" @@ -45,6 +45,9 @@ class SankakuExtractor(BooruExtractor): def skip(self, num): return 0 + def _init(self): + self.api = SankakuAPI(self) + def _file_url(self, post): url = post["file_url"] if not url: @@ -81,6 +84,15 @@ class SankakuExtractor(BooruExtractor): post["tags_" + key] = value post["tag_string_" + key] = " ".join(value) + def _notes(self, post, page): + if post.get("has_notes"): + post["notes"] = self.api.notes(post["id"]) + for note in post["notes"]: + note["created_at"] = note["created_at"]["s"] + note["updated_at"] = note["updated_at"]["s"] + else: + post["notes"] = () + class SankakuTagExtractor(SankakuExtractor): """Extractor for images from sankaku.app by search-tags""" @@ -109,7 +121,7 @@ class SankakuTagExtractor(SankakuExtractor): def posts(self): params = {"tags": self.tags} - return SankakuAPI(self).posts_keyset(params) + return self.api.posts_keyset(params) class SankakuPoolExtractor(SankakuExtractor): @@ -125,7 +137,7 @@ class SankakuPoolExtractor(SankakuExtractor): self.pool_id = match.group(1) def metadata(self): - pool = SankakuAPI(self).pools(self.pool_id) + pool = self.api.pools(self.pool_id) pool["tags"] = [tag["name"] for tag in pool["tags"]] pool["artist_tags"] = [tag["name"] for tag in pool["artist_tags"]] @@ -151,7 +163,7 @@ class SankakuPostExtractor(SankakuExtractor): self.post_id = match.group(1) def posts(self): - return SankakuAPI(self).posts(self.post_id) + return self.api.posts(self.post_id) class SankakuBooksExtractor(SankakuExtractor): @@ -167,7 +179,7 @@ class SankakuBooksExtractor(SankakuExtractor): def items(self): params = {"tags": self.tags, "pool_type": "0"} - for pool in SankakuAPI(self).pools_keyset(params): + for pool in self.api.pools_keyset(params): pool["_extractor"] = SankakuPoolExtractor url = "https://sankaku.app/books/{}".format(pool["id"]) yield Message.Queue, url, pool @@ -192,6 +204,10 @@ class SankakuAPI(): if not self.username: self.authenticate = util.noop + def notes(self, post_id): + params = {"lang": "en"} + return self._call("/posts/{}/notes".format(post_id), params) + def pools(self, pool_id): params = {"lang": "en"} return self._call("/pools/" + pool_id, params) diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py index e1d4153..50c21e3 
100644 --- a/gallery_dl/extractor/sankakucomplex.py +++ b/gallery_dl/extractor/sankakucomplex.py @@ -6,7 +6,7 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://www.sankakucomplex.com/""" +"""Extractors for https://news.sankakucomplex.com/""" from .common import Extractor, Message from .. import text, util @@ -16,7 +16,7 @@ import re class SankakucomplexExtractor(Extractor): """Base class for sankakucomplex extractors""" category = "sankakucomplex" - root = "https://www.sankakucomplex.com" + root = "https://news.sankakucomplex.com" def __init__(self, match): Extractor.__init__(self, match) @@ -24,14 +24,14 @@ class SankakucomplexExtractor(Extractor): class SankakucomplexArticleExtractor(SankakucomplexExtractor): - """Extractor for articles on www.sankakucomplex.com""" + """Extractor for articles on news.sankakucomplex.com""" subcategory = "article" directory_fmt = ("{category}", "{date:%Y-%m-%d} {title}") filename_fmt = "{filename}.{extension}" archive_fmt = "{date:%Y%m%d}_{filename}" - pattern = (r"(?:https?://)?www\.sankakucomplex\.com" + pattern = (r"(?:https?://)?(?:news|www)\.sankakucomplex\.com" r"/(\d\d\d\d/\d\d/\d\d/[^/?#]+)") - example = "https://www.sankakucomplex.com/1970/01/01/TITLE" + example = "https://news.sankakucomplex.com/1970/01/01/TITLE" def items(self): url = "{}/{}/?pg=X".format(self.root, self.path) @@ -87,9 +87,9 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor): class SankakucomplexTagExtractor(SankakucomplexExtractor): """Extractor for sankakucomplex blog articles by tag or author""" subcategory = "tag" - pattern = (r"(?:https?://)?www\.sankakucomplex\.com" + pattern = (r"(?:https?://)?(?:news|www)\.sankakucomplex\.com" r"/((?:tag|category|author)/[^/?#]+)") - example = "https://www.sankakucomplex.com/tag/TAG/" + example = "https://news.sankakucomplex.com/tag/TAG/" def items(self): pnum = 1 diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index 0abb3ab..7c760ac 100644 --- a/gallery_dl/extractor/subscribestar.py +++ b/gallery_dl/extractor/subscribestar.py @@ -127,6 +127,8 @@ class SubscribestarExtractor(Extractor): } def _parse_datetime(self, dt): + if dt.startswith("Updated on "): + dt = dt[11:] date = text.parse_datetime(dt, "%b %d, %Y %I:%M %p") if date is dt: date = text.parse_datetime(dt, "%B %d, %Y %I:%M %p") diff --git a/gallery_dl/extractor/toyhouse.py b/gallery_dl/extractor/toyhouse.py index 78ff265..64fa951 100644 --- a/gallery_dl/extractor/toyhouse.py +++ b/gallery_dl/extractor/toyhouse.py @@ -77,23 +77,27 @@ class ToyhouseExtractor(Extractor): cnt += 1 yield self._parse_post(post) - if cnt == 0 and params["page"] == 1: - token, pos = text.extract( - page, '<input name="_token" type="hidden" value="', '"') - if not token: - return - data = { - "_token": token, - "user" : text.extract(page, 'value="', '"', pos)[0], - } - self.request(self.root + "/~account/warnings/accept", - method="POST", data=data, allow_redirects=False) - continue + if not cnt and params["page"] == 1: + if self._accept_content_warning(page): + continue + return if cnt < 18: return params["page"] += 1 + def _accept_content_warning(self, page): + pos = page.find(' name="_token"') + 1 + token, pos = text.extract(page, ' value="', '"', pos) + user , pos = text.extract(page, ' value="', '"', pos) + if not token or not user: + return False + + data = {"_token": token, "user": user} + self.request(self.root + "/~account/warnings/accept", + 
diff --git a/gallery_dl/extractor/toyhouse.py b/gallery_dl/extractor/toyhouse.py
index 78ff265..64fa951 100644
--- a/gallery_dl/extractor/toyhouse.py
+++ b/gallery_dl/extractor/toyhouse.py
@@ -77,23 +77,27 @@ class ToyhouseExtractor(Extractor):
                 cnt += 1
                 yield self._parse_post(post)
 
-            if cnt == 0 and params["page"] == 1:
-                token, pos = text.extract(
-                    page, '<input name="_token" type="hidden" value="', '"')
-                if not token:
-                    return
-                data = {
-                    "_token": token,
-                    "user"  : text.extract(page, 'value="', '"', pos)[0],
-                }
-                self.request(self.root + "/~account/warnings/accept",
-                             method="POST", data=data, allow_redirects=False)
-                continue
+            if not cnt and params["page"] == 1:
+                if self._accept_content_warning(page):
+                    continue
+                return
 
             if cnt < 18:
                 return
             params["page"] += 1
 
+    def _accept_content_warning(self, page):
+        pos = page.find(' name="_token"') + 1
+        token, pos = text.extract(page, ' value="', '"', pos)
+        user , pos = text.extract(page, ' value="', '"', pos)
+        if not token or not user:
+            return False
+
+        data = {"_token": token, "user": user}
+        self.request(self.root + "/~account/warnings/accept",
+                     method="POST", data=data, allow_redirects=False)
+        return True
+
 
 class ToyhouseArtExtractor(ToyhouseExtractor):
     """Extractor for artworks of a toyhouse user"""
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index c34910f..ff29c04 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -386,7 +386,7 @@ class TumblrAPI(oauth.OAuth1API):
     def posts(self, blog, params):
         """Retrieve published posts"""
         params["offset"] = self.extractor.config("offset")
-        params["limit"] = "50"
+        params["limit"] = 50
         params["reblog_info"] = "true"
         params["type"] = self.posts_type
         params["before"] = self.before
@@ -398,8 +398,14 @@ class TumblrAPI(oauth.OAuth1API):
 
     def likes(self, blog):
         """Retrieve liked posts"""
+        endpoint = "/v2/blog/{}/likes".format(blog)
         params = {"limit": "50", "before": self.before}
-        return self._pagination(blog, "/likes", params, key="liked_posts")
+        while True:
+            posts = self._call(endpoint, params)["liked_posts"]
+            if not posts:
+                return
+            yield from posts
+            params["before"] = posts[-1]["liked_timestamp"]
 
     def _call(self, endpoint, params, **kwargs):
         url = self.ROOT + endpoint
@@ -474,6 +480,7 @@ class TumblrAPI(oauth.OAuth1API):
         if self.api_key:
             params["api_key"] = self.api_key
 
+        strategy = self.extractor.config("pagination")
         while True:
             data = self._call(endpoint, params)
 
@@ -481,13 +488,31 @@ class TumblrAPI(oauth.OAuth1API):
                 self.BLOG_CACHE[blog] = data["blog"]
                 cache = False
 
-            yield from data[key]
-
-            try:
-                endpoint = data["_links"]["next"]["href"]
-            except KeyError:
-                return
+            posts = data[key]
+            yield from posts
 
-            params = None
-            if self.api_key:
-                endpoint += "&api_key=" + self.api_key
+            if strategy == "api":
+                try:
+                    endpoint = data["_links"]["next"]["href"]
+                except KeyError:
+                    return
+
+                params = None
+                if self.api_key:
+                    endpoint += "&api_key=" + self.api_key
+
+            elif strategy == "before":
+                if not posts:
+                    return
+                timestamp = posts[-1]["timestamp"] + 1
+                if params["before"] and timestamp >= params["before"]:
+                    return
+                params["before"] = timestamp
+                params["offset"] = None
+
+            else:  # offset
+                params["offset"] = \
+                    text.parse_int(params["offset"]) + params["limit"]
+                params["before"] = None
+                if params["offset"] >= data["total_posts"]:
+                    return
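likes() no longer goes through _pagination(); each request is re-anchored at the previous batch's last liked_timestamp, and _pagination() itself now honors a "pagination" option with "api", "before", and offset strategies. A minimal sketch of the timestamp-anchored loop, with call standing in for a function that performs one likes request and returns its "liked_posts" list:

def iterate_likes(call, before=None):
    params = {"limit": "50", "before": before}
    while True:
        posts = call(params)   # one /v2/blog/{blog}/likes request
        if not posts:
            return             # empty batch: nothing older remains
        yield from posts
        params["before"] = posts[-1]["liked_timestamp"]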
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index ec098aa..9fa5b3f 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -51,6 +51,8 @@ class TwitterExtractor(Extractor):
         if not self.config("transform", True):
             self._transform_user = util.identity
             self._transform_tweet = util.identity
+
+        self._cursor = None
         self._user = None
         self._user_obj = None
         self._user_cache = {}
@@ -321,8 +323,17 @@ class TwitterExtractor(Extractor):
             "quote_count"   : tget("quote_count"),
             "reply_count"   : tget("reply_count"),
             "retweet_count" : tget("retweet_count"),
+            "bookmark_count": tget("bookmark_count"),
         }
 
+        if "views" in tweet:
+            try:
+                tdata["view_count"] = int(tweet["views"]["count"])
+            except Exception:
+                tdata["view_count"] = 0
+        else:
+            tdata["view_count"] = 0
+
         if "note_tweet" in tweet:
             note = tweet["note_tweet"]["note_tweet_results"]["result"]
             content = note["text"]
@@ -492,6 +503,14 @@ class TwitterExtractor(Extractor):
             },
         }
 
+    def _init_cursor(self):
+        return self.config("cursor") or None
+
+    def _update_cursor(self, cursor):
+        self.log.debug("Cursor: %s", cursor)
+        self._cursor = cursor
+        return cursor
+
     def metadata(self):
         """Return general metadata"""
         return {}
@@ -499,6 +518,11 @@ class TwitterExtractor(Extractor):
     def tweets(self):
         """Yield all relevant tweet objects"""
 
+    def finalize(self):
+        if self._cursor:
+            self.log.info("Use '-o cursor=%s' to continue downloading "
+                          "from the current position", self._cursor)
+
     def login(self):
         if self.cookies_check(self.cookies_names):
             return
@@ -530,6 +554,9 @@ class TwitterUserExtractor(TwitterExtractor):
     def initialize(self):
         pass
 
+    def finalize(self):
+        pass
+
     def items(self):
         base = "{}/{}/".format(self.root, self.user)
         return self._dispatch_extractors((
@@ -549,30 +576,73 @@ class TwitterTimelineExtractor(TwitterExtractor):
     pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/timeline(?!\w)"
     example = "https://x.com/USER/timeline"
 
+    def _init_cursor(self):
+        if self._cursor:
+            return self._cursor.partition("/")[2] or None
+        return None
+
+    def _update_cursor(self, cursor):
+        if cursor:
+            self._cursor = self._cursor_prefix + cursor
+            self.log.debug("Cursor: %s", self._cursor)
+        else:
+            self._cursor = None
+        return cursor
+
     def tweets(self):
-        # yield initial batch of (media) tweets
-        tweet = None
-        for tweet in self._select_tweet_source()(self.user):
-            yield tweet
-        if tweet is None:
-            return
+        self._cursor = cursor = self.config("cursor") or None
+        reset = False
 
-        # build search query
-        query = "from:{} max_id:{}".format(
-            self._user["name"], tweet["rest_id"])
-        if self.retweets:
-            query += " include:retweets include:nativeretweets"
+        if cursor:
+            state = cursor.partition("/")[0]
+            state, _, tweet_id = state.partition("_")
+            state = text.parse_int(state, 1)
+        else:
+            state = 1
+
+        if state <= 1:
+            self._cursor_prefix = "1/"
 
-        if not self.textonly:
-            # try to search for media-only tweets
+            # yield initial batch of (media) tweets
             tweet = None
-            for tweet in self.api.search_timeline(query + " filter:links"):
+            for tweet in self._select_tweet_source()(self.user):
                 yield tweet
-            if tweet is not None:
+            if tweet is None and not cursor:
                 return
+            tweet_id = tweet["rest_id"]
+
+            state = reset = 2
+        else:
+            self.api._user_id_by_screen_name(self.user)
+
+        # build search query
+        query = "from:{} max_id:{}".format(self._user["name"], tweet_id)
+        if self.retweets:
+            query += " include:retweets include:nativeretweets"
 
-        # yield unfiltered search results
-        yield from self.api.search_timeline(query)
+        if state <= 2:
+            self._cursor_prefix = "2_{}/".format(tweet_id)
+            if reset:
+                self._cursor = self._cursor_prefix
+
+            if not self.textonly:
+                # try to search for media-only tweets
+                tweet = None
+                for tweet in self.api.search_timeline(query + " filter:links"):
+                    yield tweet
+                if tweet is not None:
+                    return self._update_cursor(None)
+
+            state = reset = 3
+
+        if state <= 3:
+            # yield unfiltered search results
+            self._cursor_prefix = "3_{}/".format(tweet_id)
+            if reset:
+                self._cursor = self._cursor_prefix
+
+            yield from self.api.search_timeline(query)
+            return self._update_cursor(None)
 
     def _select_tweet_source(self):
         strategy = self.config("strategy")
@@ -854,6 +924,24 @@ class TwitterQuotesExtractor(TwitterExtractor):
         yield Message.Queue, url, data
 
 
+class TwitterInfoExtractor(TwitterExtractor):
+    """Extractor for a user's profile data"""
+    subcategory = "info"
+    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/info"
+    example = "https://x.com/USER/info"
+
+    def items(self):
+        api = TwitterAPI(self)
+
+        screen_name = self.user
+        if screen_name.startswith("id:"):
+            user = api.user_by_rest_id(screen_name[3:])
+        else:
+            user = api.user_by_screen_name(screen_name)
+
+        return iter(((Message.Directory, self._transform_user(user)),))
+
+
 class TwitterAvatarExtractor(TwitterExtractor):
     subcategory = "avatar"
     filename_fmt = "avatar {date}.{extension}"
@@ -1388,7 +1476,11 @@ class TwitterAPI():
             "%s %s (%s)", response.status_code, response.reason, errors)
 
     def _pagination_legacy(self, endpoint, params):
-        original_retweets = (self.extractor.retweets == "original")
+        extr = self.extractor
+        cursor = extr._init_cursor()
+        if cursor:
+            params["cursor"] = cursor
+        original_retweets = (extr.retweets == "original")
         bottom = ("cursor-bottom-", "sq-cursor-bottom")
 
         while True:
@@ -1396,7 +1488,7 @@ class TwitterAPI():
             instructions = data["timeline"]["instructions"]
             if not instructions:
-                return
+                return extr._update_cursor(None)
 
             tweets = data["globalObjects"]["tweets"]
             users = data["globalObjects"]["users"]
@@ -1477,8 +1569,8 @@ class TwitterAPI():
             # stop on empty response
             if not cursor or (not tweets and not tweet_id):
-                return
-            params["cursor"] = cursor
+                return extr._update_cursor(None)
+            params["cursor"] = extr._update_cursor(cursor)
 
     def _pagination_tweets(self, endpoint, variables,
                            path=None, stop_tweets=True, features=None):
         extr = self.extractor
@@ -1487,6 +1579,9 @@ class TwitterAPI():
         pinned_tweet = extr.pinned
 
         params = {"variables": None}
+        cursor = extr._init_cursor()
+        if cursor:
+            variables["cursor"] = cursor
         if features is None:
             features = self.features_pagination
         if features:
@@ -1523,7 +1618,7 @@ class TwitterAPI():
                     cursor = entry["content"]["value"]
             if entries is None:
                 if not cursor:
-                    return
+                    return extr._update_cursor(None)
                 entries = ()
 
         except LookupError:
@@ -1672,12 +1767,16 @@ class TwitterAPI():
                     continue
 
             if stop_tweets and not tweet:
-                return
+                return extr._update_cursor(None)
             if not cursor or cursor == variables.get("cursor"):
-                return
-            variables["cursor"] = cursor
+                return extr._update_cursor(None)
+            variables["cursor"] = extr._update_cursor(cursor)
 
     def _pagination_users(self, endpoint, variables, path=None):
+        extr = self.extractor
+        cursor = extr._init_cursor()
+        if cursor:
+            variables["cursor"] = cursor
         params = {
             "variables": None,
             "features" : self._json_dumps(self.features_pagination),
@@ -1697,7 +1796,7 @@ class TwitterAPI():
                 data = data[key]
             instructions = data["instructions"]
         except KeyError:
-            return
+            return extr._update_cursor(None)
 
         for instr in instructions:
             if instr["type"] == "TimelineAddEntries":
@@ -1715,8 +1814,8 @@ class TwitterAPI():
                         cursor = entry["content"]["value"]
 
             if not cursor or cursor.startswith(("-1|", "0|")) or not entry:
-                return
-            variables["cursor"] = cursor
+                return extr._update_cursor(None)
+            variables["cursor"] = extr._update_cursor(cursor)
 
     def _handle_ratelimit(self, response):
         rl = self.extractor.config("ratelimit")
@@ -1864,7 +1963,7 @@ def _login_impl(extr, username, password):
                 },
             }
         elif subtask == "LoginEnterAlternateIdentifierSubtask":
-            alt = extr.config("username_alt") or extr.input(
+            alt = extr.config("username-alt") or extr.input(
                 "Alternate Identifier (username, email, phone number): ")
             data = {
                 "enter_text": {
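Timeline extraction is now resumable: finalize() prints a '-o cursor=...' value, and on restart tweets() decodes the stage (1-3), the anchoring tweet id, and the raw API cursor back out of it. A sketch of that decoding, mirroring the partition() calls in the diff (the sample cursor value is made up):

def split_timeline_cursor(cursor):
    state, _, api_cursor = cursor.partition("/")
    stage, _, tweet_id = state.partition("_")
    return int(stage or 1), tweet_id or None, api_cursor or None

print(split_timeline_cursor("2_1280574316/DAABCgABXYZ"))
# (2, '1280574316', 'DAABCgABXYZ')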
diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py
index 6dfb23c..5cde0d6 100644
--- a/gallery_dl/extractor/vipergirls.py
+++ b/gallery_dl/extractor/vipergirls.py
@@ -101,7 +101,8 @@ class VipergirlsExtractor(Extractor):
 class VipergirlsThreadExtractor(VipergirlsExtractor):
     """Extractor for vipergirls threads"""
     subcategory = "thread"
-    pattern = BASE_PATTERN + r"/threads/(\d+)(?:-[^/?#]+)?(/page\d+)?$"
+    pattern = (BASE_PATTERN +
+               r"/threads/(\d+)(?:-[^/?#]+)?(/page\d+)?(?:$|#|\?(?!p=))")
     example = "https://vipergirls.to/threads/12345-TITLE"
 
     def __init__(self, match):
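The thread pattern now ends in (?:$|#|\?(?!p=)), so thread URLs may carry a fragment or query string, but '?p=' post links no longer match this extractor. A quick check with Python's re module, assuming BASE_PATTERN expands to the usual host pattern (an assumption for this example):

import re

PATTERN = (r"(?:https?://)?(?:www\.)?vipergirls\.to"  # assumed BASE_PATTERN
           r"/threads/(\d+)(?:-[^/?#]+)?(/page\d+)?(?:$|#|\?(?!p=))")

print(bool(re.match(PATTERN, "https://vipergirls.to/threads/12345-TITLE")))        # True
print(bool(re.match(PATTERN, "https://vipergirls.to/threads/12345?p=99#post99")))  # False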
diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py
index c112f4a..922a591 100644
--- a/gallery_dl/extractor/vsco.py
+++ b/gallery_dl/extractor/vsco.py
@@ -115,9 +115,28 @@ class VscoExtractor(Extractor):
 
 
 class VscoUserExtractor(VscoExtractor):
-    """Extractor for images from a user on vsco.co"""
+    """Extractor for a vsco user profile"""
     subcategory = "user"
-    pattern = USER_PATTERN + r"(?:/gallery|/images(?:/\d+)?)?/?(?:$|[?#])"
+    pattern = USER_PATTERN + r"/?$"
+    example = "https://vsco.co/USER"
+
+    def initialize(self):
+        pass
+
+    def items(self):
+        base = "{}/{}/".format(self.root, self.user)
+        return self._dispatch_extractors((
+            (VscoAvatarExtractor    , base + "avatar"),
+            (VscoGalleryExtractor   , base + "gallery"),
+            (VscoSpacesExtractor    , base + "spaces"),
+            (VscoCollectionExtractor, base + "collection"),
+        ), ("gallery",))
+
+
+class VscoGalleryExtractor(VscoExtractor):
+    """Extractor for a vsco user's gallery"""
+    subcategory = "gallery"
+    pattern = USER_PATTERN + r"/(?:gallery|images)"
     example = "https://vsco.co/USER/gallery"
 
     def images(self):
diff --git a/gallery_dl/extractor/wallpapercave.py b/gallery_dl/extractor/wallpapercave.py
index faf3b0d..796f3f8 100644
--- a/gallery_dl/extractor/wallpapercave.py
+++ b/gallery_dl/extractor/wallpapercave.py
@@ -18,7 +18,7 @@ class WallpapercaveImageExtractor(Extractor):
     category = "wallpapercave"
     subcategory = "image"
     root = "https://wallpapercave.com"
-    pattern = r"(?:https?://)?(?:www\.)?wallpapercave\.com"
+    pattern = r"(?:https?://)?(?:www\.)?wallpapercave\.com/"
     example = "https://wallpapercave.com/w/wp12345"
 
     def items(self):
@@ -40,3 +40,12 @@ class WallpapercaveImageExtractor(Extractor):
             image = text.nameext_from_url(path)
             yield Message.Directory, image
             yield Message.Url, self.root + path, image
+
+        if path is None:
+            for wp in text.extract_iter(
+                    page, 'class="wallpaper" id="wp', '</picture>'):
+                path = text.rextract(wp, ' src="', '"')[0]
+                if path:
+                    image = text.nameext_from_url(path)
+                    yield Message.Directory, image
+                    yield Message.Url, self.root + path, image
diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py
index e91f45f..61a36d5 100644
--- a/gallery_dl/extractor/warosu.py
+++ b/gallery_dl/extractor/warosu.py
@@ -64,7 +64,7 @@ class WarosuThreadExtractor(Extractor):
     def parse(self, post):
         """Build post object by extracting data from an HTML post"""
         data = self._extract_post(post)
-        if "<span> File:" in post and self._extract_image(post, data):
+        if "<span class=fileinfo>" in post and self._extract_image(post, data):
             part = data["image"].rpartition("/")[2]
             data["tim"], _, data["extension"] = part.partition(".")
             data["ext"] = "." + data["extension"]
@@ -83,7 +83,7 @@ class WarosuThreadExtractor(Extractor):
 
     def _extract_image(self, post, data):
         extr = text.extract_from(post)
-        data["fsize"] = extr("<span> File: ", ", ")
+        data["fsize"] = extr("<span class=fileinfo> File: ", ", ")
         data["w"] = extr("", "x")
         data["h"] = extr("", ", ")
         data["filename"] = text.unquote(extr(
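The wallpapercave extractor gains a fallback that scans full <picture> blocks when no direct image path was found. A rough stdlib-only equivalent of that loop (gallery-dl itself uses text.extract_iter and text.rextract):

import re

def wallpaper_paths(page):
    # iterate over <picture> blocks marked class="wallpaper" id="wp..."
    for wp in re.findall(r'class="wallpaper" id="wp(.*?)</picture>', page, re.S):
        srcs = re.findall(r' src="([^"]+)"', wp)
        if srcs:
            yield srcs[-1]  # text.rextract() likewise takes the last src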
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index fc61dff..126ef49 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -11,6 +11,8 @@
 from .booru import BooruExtractor
 from ..cache import cache
 from .. import text, util, exception
+import collections
+import re
 
 BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net"
 
@@ -76,22 +78,29 @@ class ZerochanExtractor(BooruExtractor):
                 'class="breadcrumbs', '</nav>'))[2:],
             "uploader": extr('href="/user/', '"'),
             "tags"    : extr('<ul id="tags"', '</ul>'),
-            "source"  : extr('<h2>Source</h2>', '</p><h2>').rpartition(
-                ">")[2] or None,
+            "source"  : text.unescape(text.extr(
+                extr('id="source-url"', '</a>'), 'href="', '"')),
         }
 
         html = data["tags"]
         tags = data["tags"] = []
         for tag in html.split("<li class=")[1:]:
-            category = text.extr(tag, 'data-type="', '"')
+            category = text.extr(tag, '"', '"')
             name = text.extr(tag, 'data-tag="', '"')
-            tags.append(category.capitalize() + ":" + name)
+            tags.append(category.partition(" ")[0].capitalize() + ":" + name)
 
         return data
 
     def _parse_entry_api(self, entry_id):
         url = "{}/{}?json".format(self.root, entry_id)
-        item = self.request(url).json()
+        text = self.request(url).text
+        try:
+            item = util.json_loads(text)
+        except ValueError as exc:
+            if " control character " not in str(exc):
+                raise
+            text = re.sub(r"[\x00-\x1f\x7f]", "", text)
+            item = util.json_loads(text)
 
         data = {
             "id"      : item["id"],
@@ -109,6 +118,14 @@ class ZerochanExtractor(BooruExtractor):
 
         return data
 
+    def _tags(self, post, page):
+        tags = collections.defaultdict(list)
+        for tag in post["tags"]:
+            category, _, name = tag.partition(":")
+            tags[category].append(name)
+        for key, value in tags.items():
+            post["tags_" + key.lower()] = value
+
 
 class ZerochanTagExtractor(ZerochanExtractor):
     subcategory = "tag"
@@ -180,10 +197,16 @@ class ZerochanTagExtractor(ZerochanExtractor):
         static = "https://static.zerochan.net/.full."
 
         while True:
-            data = self.request(url, params=params).json()
+            response = self.request(url, params=params, allow_redirects=False)
+            if response.status_code >= 300:
+                url = text.urljoin(self.root, response.headers["location"])
+                response = self.request(url, params=params)
+            data = response.json()
+
             try:
                 posts = data["items"]
-            except ValueError:
+            except Exception:
+                self.log.debug("Server response: %s", data)
                 return
 
             if metadata:
@@ -191,13 +214,13 @@ class ZerochanTagExtractor(ZerochanExtractor):
                     post_id = post["id"]
                     post.update(self._parse_entry_html(post_id))
                     post.update(self._parse_entry_api(post_id))
+                    yield post
 
             else:
                 for post in posts:
                     base = static + str(post["id"])
                     post["file_url"] = base + ".jpg"
                     post["_fallback"] = (base + ".png",)
-
-            yield from posts
+                    yield post
 
             if not data.get("next"):
                 return
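Zerochan's ?json endpoint can return raw control characters inside JSON strings, which strict parsers reject; the new fallback strips them and retries once. The same logic as a standalone helper, using the stdlib json module in place of util.json_loads:

import json
import re

def loads_lenient(payload):
    try:
        return json.loads(payload)
    except ValueError as exc:
        # only retry for the specific "control character" parse error
        if " control character " not in str(exc):
            raise
        return json.loads(re.sub(r"[\x00-\x1f\x7f]", "", payload))

print(loads_lenient('{"name": "Tag\x07name"}'))  # {'name': 'Tagname'}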
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index 4562b05..0e0916d 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -33,6 +33,7 @@ stdout_write = output.stdout_write
 class Job():
     """Base class for Job types"""
     ulog = None
+    _logger_adapter = output.LoggerAdapter
 
     def __init__(self, extr, parent=None):
         if isinstance(extr, str):
@@ -77,9 +78,9 @@ class Job():
 
         actions = extr.config("actions")
         if actions:
-            from .actions import parse
+            from .actions import LoggerAdapter, parse
+            self._logger_adapter = LoggerAdapter
             self._logger_actions = parse(actions)
-            self._wrap_logger = self._wrap_logger_actions
 
         path_proxy = output.PathfmtProxy(self)
         self._logger_extra = {
@@ -267,10 +268,7 @@ class Job():
         return self._wrap_logger(logging.getLogger(name))
 
     def _wrap_logger(self, logger):
-        return output.LoggerAdapter(logger, self)
-
-    def _wrap_logger_actions(self, logger):
-        return output.LoggerAdapterActions(logger, self)
+        return self._logger_adapter(logger, self)
 
     def _write_unsupported(self, url):
         if self.ulog:
@@ -315,7 +313,7 @@ class DownloadJob(Job):
             pathfmt.build_path()
 
             if pathfmt.exists():
-                if archive:
+                if archive and self._archive_write_skip:
                     archive.add(kwdict)
                 self.handle_skip()
                 return
@@ -345,7 +343,7 @@ class DownloadJob(Job):
                 return
 
         if not pathfmt.temppath:
-            if archive:
+            if archive and self._archive_write_skip:
                 archive.add(kwdict)
             self.handle_skip()
             return
@@ -359,7 +357,7 @@ class DownloadJob(Job):
         pathfmt.finalize()
         self.out.success(pathfmt.path)
         self._skipcnt = 0
-        if archive:
+        if archive and self._archive_write_file:
             archive.add(kwdict)
         if "after" in hooks:
             for callback in hooks["after"]:
@@ -561,6 +559,16 @@ class DownloadJob(Job):
             else:
                 extr.log.debug("Using download archive '%s'", archive_path)
 
+            events = cfg("archive-event")
+            if events is None:
+                self._archive_write_file = True
+                self._archive_write_skip = False
+            else:
+                if isinstance(events, str):
+                    events = events.split(",")
+                self._archive_write_file = ("file" in events)
+                self._archive_write_skip = ("skip" in events)
+
         skip = cfg("skip", True)
         if skip:
             self._skipexc = None
@@ -676,7 +684,7 @@ class SimulationJob(DownloadJob):
             kwdict["extension"] = "jpg"
         if self.sleep:
             self.extractor.sleep(self.sleep(), "download")
-        if self.archive:
+        if self.archive and self._archive_write_skip:
             self.archive.add(kwdict)
         self.out.skip(self.pathfmt.build_filename(kwdict))
@@ -848,16 +856,22 @@ class InfoJob(Job):
 
 class DataJob(Job):
     """Collect extractor results and dump them"""
+    resolve = False
 
-    def __init__(self, url, parent=None, file=sys.stdout, ensure_ascii=True):
+    def __init__(self, url, parent=None, file=sys.stdout, ensure_ascii=True,
+                 resolve=False):
         Job.__init__(self, url, parent)
         self.file = file
         self.data = []
         self.ascii = config.get(("output",), "ascii", ensure_ascii)
+        self.resolve = 128 if resolve is True else (resolve or self.resolve)
 
         private = config.get(("output",), "private")
         self.filter = dict.copy if private else util.filter_dict
 
+        if self.resolve > 0:
+            self.handle_queue = self.handle_queue_resolve
+
     def run(self):
         self._init()
@@ -883,12 +897,13 @@ class DataJob(Job):
         for msg in self.data:
             util.transform_dict(msg[-1], util.number_to_string)
 
-        # dump to 'file'
-        try:
-            util.dump_json(self.data, self.file, self.ascii, 2)
-            self.file.flush()
-        except Exception:
-            pass
+        if self.file:
+            # dump to 'file'
+            try:
+                util.dump_json(self.data, self.file, self.ascii, 2)
+                self.file.flush()
+            except Exception:
+                pass
 
         return 0
 
@@ -900,3 +915,17 @@ class DataJob(Job):
 
     def handle_queue(self, url, kwdict):
         self.data.append((Message.Queue, url, self.filter(kwdict)))
+
+    def handle_queue_resolve(self, url, kwdict):
+        cls = kwdict.get("_extractor")
+        if cls:
+            extr = cls.from_url(url)
+        else:
+            extr = extractor.find(url)
+
+        if not extr:
+            return self.data.append((Message.Queue, url, self.filter(kwdict)))
+
+        job = self.__class__(extr, self, None, self.ascii, self.resolve-1)
+        job.data = self.data
+        job.run()
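The new "archive-event" option decides which events write archive entries: by default only completed downloads ("file") do, while adding "skip" also records files that were skipped as already present. A standalone sketch of the flag parsing added above:

def archive_flags(events=None):
    # returns (write_on_file, write_on_skip)
    if events is None:
        return True, False
    if isinstance(events, str):
        events = events.split(",")
    return ("file" in events), ("skip" in events)

print(archive_flags())             # (True, False)
print(archive_flags("file,skip"))  # (True, True)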
diff --git a/gallery_dl/option.py b/gallery_dl/option.py
index f31d5ac..155cbd9 100644
--- a/gallery_dl/option.py
+++ b/gallery_dl/option.py
@@ -293,10 +293,15 @@ def build_parser():
     )
     output.add_argument(
         "-j", "--dump-json",
-        dest="jobtype", action="store_const", const=job.DataJob,
+        dest="dump_json", action="count",
         help="Print JSON information",
     )
     output.add_argument(
+        "-J", "--resolve-json",
+        dest="dump_json", action="store_const", const=128,
+        help="Print JSON information; resolve intermediary URLs",
+    )
+    output.add_argument(
         "-s", "--simulate",
         dest="jobtype", action="store_const", const=job.SimulationJob,
         help="Simulate data extraction; do not download anything",
@@ -346,6 +351,11 @@ def build_parser():
                "in the current directory to debug problems"),
     )
     output.add_argument(
+        "--print-traffic",
+        dest="print_traffic", action="store_true",
+        help=("Display sent and read HTTP traffic"),
+    )
+    output.add_argument(
         "--no-colors",
         dest="colors", action="store_false",
         help=("Do not emit ANSI color codes in output"),
diff --git a/gallery_dl/output.py b/gallery_dl/output.py
index bd5d959..13b6a8a 100644
--- a/gallery_dl/output.py
+++ b/gallery_dl/output.py
@@ -10,7 +10,6 @@ import os
 import sys
 import shutil
 import logging
-import functools
 import unicodedata
 from . import config, util, formatter
 
@@ -92,39 +91,6 @@ class LoggerAdapter():
             self.logger._log(logging.ERROR, msg, args, **kwargs)
 
 
-class LoggerAdapterActions():
-
-    def __init__(self, logger, job):
-        self.logger = logger
-        self.extra = job._logger_extra
-        self.actions = job._logger_actions
-
-        self.debug = functools.partial(self.log, logging.DEBUG)
-        self.info = functools.partial(self.log, logging.INFO)
-        self.warning = functools.partial(self.log, logging.WARNING)
-        self.error = functools.partial(self.log, logging.ERROR)
-
-    def log(self, level, msg, *args, **kwargs):
-        msg = str(msg)
-        if args:
-            msg = msg % args
-
-        actions = self.actions[level]
-        if actions:
-            args = self.extra.copy()
-            args["level"] = level
-
-            for cond, action in actions:
-                if cond(msg):
-                    action(args)
-
-            level = args["level"]
-
-        if self.logger.isEnabledFor(level):
-            kwargs["extra"] = self.extra
-            self.logger._log(level, msg, (), **kwargs)
-
-
 class PathfmtProxy():
     __slots__ = ("job",)
diff --git a/gallery_dl/path.py b/gallery_dl/path.py
index 1616bbd..7892776 100644
--- a/gallery_dl/path.py
+++ b/gallery_dl/path.py
@@ -51,6 +51,7 @@ class PathFormat():
             raise exception.FilenameFormatError(exc)
 
         directory_fmt = config("directory")
+        self.directory_conditions = ()
         try:
             if directory_fmt is None:
                 directory_fmt = extractor.directory_fmt
@@ -266,7 +267,7 @@ class PathFormat():
         try:
             for fmt in self.directory_formatters:
                 segment = fmt(kwdict).strip()
-                if strip:
+                if strip and segment != "..":
                     # remove trailing dots and spaces (#647)
                     segment = segment.rstrip(strip)
                 if segment:
@@ -288,7 +289,7 @@ class PathFormat():
             formatters = self.directory_formatters
         for fmt in formatters:
             segment = fmt(kwdict).strip()
-            if strip:
+            if strip and segment != "..":
                 segment = segment.rstrip(strip)
             if segment:
                 append(self.clean_segment(segment))
@@ -344,7 +345,11 @@ class PathFormat():
                     continue
                 except OSError:
                     # move across different filesystems
-                    shutil.copyfile(self.temppath, self.realpath)
+                    try:
+                        shutil.copyfile(self.temppath, self.realpath)
+                    except FileNotFoundError:
+                        os.makedirs(self.realdirectory)
+                        shutil.copyfile(self.temppath, self.realpath)
                     os.unlink(self.temppath)
                     break
diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py
index 18d00e1..a520a34 100644
--- a/gallery_dl/postprocessor/metadata.py
+++ b/gallery_dl/postprocessor/metadata.py
@@ -56,7 +56,13 @@ class MetadataPP(PostProcessor):
             ext = "json"
 
         directory = options.get("directory")
-        if directory:
+        if isinstance(directory, list):
+            self._directory = self._directory_format
+            self._directory_formatters = [
+                formatter.parse(dirfmt, util.NONE).format_map
+                for dirfmt in directory
+            ]
+        elif directory:
             self._directory = self._directory_custom
             sep = os.sep + (os.altsep or "")
             self._metadir = util.expand_path(directory).rstrip(sep) + os.sep
@@ -147,6 +153,19 @@ class MetadataPP(PostProcessor):
     def _directory_custom(self, pathfmt):
         return os.path.join(pathfmt.realdirectory, self._metadir)
 
+    def _directory_format(self, pathfmt):
+        formatters = pathfmt.directory_formatters
+        conditions = pathfmt.directory_conditions
+        try:
+            pathfmt.directory_formatters = self._directory_formatters
+            pathfmt.directory_conditions = ()
+            segments = pathfmt.build_directory(pathfmt.kwdict)
+            directory = pathfmt.clean_path(os.sep.join(segments) + os.sep)
+            return os.path.join(pathfmt.realdirectory, directory)
+        finally:
+            pathfmt.directory_conditions = conditions
+            pathfmt.directory_formatters = formatters
+
     def _filename(self, pathfmt):
         return (pathfmt.filename or "metadata") + "." + self.extension
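With a list-valued "directory" option, the metadata post processor temporarily swaps in its own directory format strings and builds the metadata sub-path with the same machinery (and path cleaning) as regular download paths. A hypothetical config fragment, expressed as a Python dict; the option values are made up for illustration:

# Each list element is a directory format string, evaluated like an
# extractor's directory_fmt and joined beneath the download directory.
metadata_pp = {
    "name"     : "metadata",
    "directory": ["metadata", "{category}"],  # -> <dir>/metadata/<category>/
}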
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index e76ddf3..5744ef3 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -540,10 +540,14 @@ class CustomNone():
     def __bool__():
         return False
 
+    def __eq__(self, other):
+        return self is other
+
+    def __ne__(self, other):
+        return self is not other
+
     __lt__ = true
     __le__ = true
-    __eq__ = false
-    __ne__ = true
     __gt__ = false
     __ge__ = false
 
@@ -616,11 +620,28 @@ else:
     Popen = subprocess.Popen
 
 
-def compile_expression(expr, name="<expr>", globals=None):
+def compile_expression_raw(expr, name="<expr>", globals=None):
     code_object = compile(expr, name, "eval")
     return functools.partial(eval, code_object, globals or GLOBALS)
 
 
+def compile_expression_tryexcept(expr, name="<expr>", globals=None):
+    code_object = compile(expr, name, "eval")
+
+    def _eval(locals=None, globals=(globals or GLOBALS), co=code_object):
+        try:
+            return eval(co, globals, locals)
+        except exception.GalleryDLException:
+            raise
+        except Exception:
+            return False
+
+    return _eval
+
+
+compile_expression = compile_expression_tryexcept
+
+
 def import_file(path):
     """Import a Python module from a filesystem path"""
     path, name = os.path.split(path)
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index a8ff38e..f234af1 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,5 +6,5 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.27.1"
+__version__ = "1.27.2"
 __variant__ = None
diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py
index 0a0bf86..d4fdedc 100644
--- a/gallery_dl/ytdl.py
+++ b/gallery_dl/ytdl.py
@@ -249,6 +249,22 @@ def parse_command_line(module, argv):
         None if opts.match_filter is None
         else module.match_filter_func(opts.match_filter))
 
+    cookiesfrombrowser = getattr(opts, "cookiesfrombrowser", None)
+    if cookiesfrombrowser:
+        match = re.fullmatch(r"""(?x)
+            (?P<name>[^+:]+)
+            (?:\s*\+\s*(?P<keyring>[^:]+))?
+            (?:\s*:\s*(?!:)(?P<profile>.+?))?
+            (?:\s*::\s*(?P<container>.+))?
+        """, cookiesfrombrowser)
+        if match:
+            browser, keyring, profile, container = match.groups()
+            if keyring is not None:
+                keyring = keyring.upper()
+            cookiesfrombrowser = (browser.lower(), profile, keyring, container)
+        else:
+            cookiesfrombrowser = None
+
     return {
         "usenetrc": opts.usenetrc,
         "netrc_location": getattr(opts, "netrc_location", None),
@@ -364,7 +380,7 @@ def parse_command_line(module, argv):
         "skip_playlist_after_errors": getattr(
             opts, "skip_playlist_after_errors", None),
         "cookiefile": opts.cookiefile,
-        "cookiesfrombrowser": getattr(opts, "cookiesfrombrowser", None),
+        "cookiesfrombrowser": cookiesfrombrowser,
         "nocheckcertificate": opts.no_check_certificate,
         "prefer_insecure": opts.prefer_insecure,
         "proxy": opts.proxy,
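The ytdl wrapper now pre-parses --cookies-from-browser values of the form BROWSER[+KEYRING][:PROFILE][::CONTAINER] into the tuple shape yt-dlp expects. The same regex as a standalone function, for reference:

import re

def parse_cookiesfrombrowser(value):
    match = re.fullmatch(r"""(?x)
        (?P<name>[^+:]+)
        (?:\s*\+\s*(?P<keyring>[^:]+))?
        (?:\s*:\s*(?!:)(?P<profile>.+?))?
        (?:\s*::\s*(?P<container>.+))?
    """, value)
    if not match:
        return None
    browser, keyring, profile, container = match.groups()
    return (browser.lower(), profile,
            keyring.upper() if keyring is not None else None, container)

print(parse_cookiesfrombrowser("firefox:default::personal"))
# ('firefox', 'default', None, 'personal')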
