Diffstat (limited to 'gallery_dl/extractor')
38 files changed, 1019 insertions, 308 deletions
diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py
index a4b0997..a5e8b27 100644
--- a/gallery_dl/extractor/8chan.py
+++ b/gallery_dl/extractor/8chan.py
@@ -27,7 +27,8 @@ class _8chanExtractor(Extractor):
         Extractor.__init__(self, match)
 
     def _init(self):
-        self.cookies.set("TOS", "1", domain=self.root.rpartition("/")[2])
+        self.cookies.set(
+            "TOS20240718", "1", domain=self.root.rpartition("/")[2])
 
     @memcache()
     def cookies_prepare(self):
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 6aff1f3..e103cb1 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -22,6 +22,7 @@ modules = [
     "8chan",
     "8muses",
     "adultempire",
+    "agnph",
     "architizer",
     "artstation",
     "aryion",
@@ -33,6 +34,7 @@ modules = [
     "bunkr",
     "catbox",
     "chevereto",
+    "cien",
     "comicvine",
     "cyberdrop",
     "danbooru",
@@ -42,7 +44,6 @@ modules = [
     "e621",
     "erome",
     "exhentai",
-    "fallenangels",
     "fanbox",
     "fanleaks",
    "fantia",
@@ -84,6 +85,7 @@ modules = [
     "keenspot",
     "kemonoparty",
     "khinsider",
+    "koharu",
     "komikcast",
     "lensdump",
     "lexica",
diff --git a/gallery_dl/extractor/agnph.py b/gallery_dl/extractor/agnph.py
new file mode 100644
index 0000000..653b73f
--- /dev/null
+++ b/gallery_dl/extractor/agnph.py
@@ -0,0 +1,113 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://agn.ph/"""
+
+from . import booru
+from .. import text
+
+from xml.etree import ElementTree
+import collections
+import re
+
+BASE_PATTERN = r"(?:https?://)?agn\.ph"
+
+
+class AgnphExtractor(booru.BooruExtractor):
+    category = "agnph"
+    root = "https://agn.ph"
+    page_start = 1
+    per_page = 45
+
+    TAG_TYPES = {
+        "a": "artist",
+        "b": "copyright",
+        "c": "character",
+        "d": "species",
+        "m": "general",
+    }
+
+    def _init(self):
+        self.cookies.set("confirmed_age", "true", domain="agn.ph")
+
+    def _prepare(self, post):
+        post["date"] = text.parse_timestamp(post["created_at"])
+        post["status"] = post["status"].strip()
+        post["has_children"] = ("true" in post["has_children"])
+
+    def _xml_to_dict(self, xml):
+        return {element.tag: element.text for element in xml}
+
+    def _pagination(self, url, params):
+        params["api"] = "xml"
+        if "page" in params:
+            params["page"] = \
+                self.page_start + text.parse_int(params["page"]) - 1
+        else:
+            params["page"] = self.page_start
+
+        while True:
+            data = self.request(url, params=params).text
+            root = ElementTree.fromstring(data)
+
+            yield from map(self._xml_to_dict, root)
+
+            attrib = root.attrib
+            if int(attrib["offset"]) + len(root) >= int(attrib["count"]):
+                return
+
+            params["page"] += 1
+
+    def _html(self, post):
+        url = "{}/gallery/post/show/{}/".format(self.root, post["id"])
+        return self.request(url).text
+
+    def _tags(self, post, page):
+        tag_container = text.extr(
+            page, '<ul class="taglist">', '<h3>Statistics</h3>')
+        if not tag_container:
+            return
+
+        tags = collections.defaultdict(list)
+        pattern = re.compile(r'class="(.)typetag">([^<]+)')
+        for tag_type, tag_name in pattern.findall(tag_container):
+            tags[tag_type].append(text.unquote(tag_name).replace(" ", "_"))
+
+        for key, value in tags.items():
+            post["tags_" + self.TAG_TYPES[key]] = " ".join(value)
+
+
+class AgnphTagExtractor(AgnphExtractor):
+    subcategory = "tag"
+    directory_fmt = ("{category}", "{search_tags}")
+    archive_fmt = "t_{search_tags}_{id}"
+    pattern = BASE_PATTERN + r"/gallery/post/(?:\?([^#]+))?$"
+    example = "https://agn.ph/gallery/post/?search=TAG"
+
+    def __init__(self, match):
+        AgnphExtractor.__init__(self, match)
+        self.params = text.parse_query(self.groups[0])
+
+    def metadata(self):
+        return {"search_tags": self.params.get("search") or ""}
+
+    def posts(self):
+        url = self.root + "/gallery/post/"
+        return self._pagination(url, self.params.copy())
+
+
+class AgnphPostExtractor(AgnphExtractor):
+    subcategory = "post"
+    archive_fmt = "{id}"
+    pattern = BASE_PATTERN + r"/gallery/post/show/(\d+)"
+    example = "https://agn.ph/gallery/post/show/12345/"
+
+    def posts(self):
+        url = "{}/gallery/post/show/{}/?api=xml".format(
+            self.root, self.groups[0])
+        post = ElementTree.fromstring(self.request(url).text)
+        return (self._xml_to_dict(post),)
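agn.ph answers API requests with XML rather than JSON, so the new extractor converts each post element into a plain dict and stops paginating once offset plus the current page size reaches the advertised total. A minimal standalone sketch of both steps (the sample XML is invented):

```python
from xml.etree import ElementTree

SAMPLE = """<posts count="2" offset="0">
<post><id>1</id><file_url>https://agn.ph/a.png</file_url></post>
<post><id>2</id><file_url>https://agn.ph/b.png</file_url></post>
</posts>"""

root = ElementTree.fromstring(SAMPLE)

# same conversion as AgnphExtractor._xml_to_dict()
posts = [{elem.tag: elem.text for elem in post} for post in root]

# same stop condition as AgnphExtractor._pagination()
done = int(root.attrib["offset"]) + len(root) >= int(root.attrib["count"])

print(posts[0]["file_url"], done)  # https://agn.ph/a.png True
```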
pattern = BASE_PATTERN + r"/gallery/post/(?:\?([^#]+))?$" + example = "https://agn.ph/gallery/post/?search=TAG" + + def __init__(self, match): + AgnphExtractor.__init__(self, match) + self.params = text.parse_query(self.groups[0]) + + def metadata(self): + return {"search_tags": self.params.get("search") or ""} + + def posts(self): + url = self.root + "/gallery/post/" + return self._pagination(url, self.params.copy()) + + +class AgnphPostExtractor(AgnphExtractor): + subcategory = "post" + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"/gallery/post/show/(\d+)" + example = "https://agn.ph/gallery/post/show/12345/" + + def posts(self): + url = "{}/gallery/post/show/{}/?api=xml".format( + self.root, self.groups[0]) + post = ElementTree.fromstring(self.request(url).text) + return (self._xml_to_dict(post),) diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py index ec86263..17b780e 100644 --- a/gallery_dl/extractor/aryion.py +++ b/gallery_dl/extractor/aryion.py @@ -79,18 +79,20 @@ class AryionExtractor(Extractor): def metadata(self): """Return general metadata""" - def _pagination_params(self, url, params=None): + def _pagination_params(self, url, params=None, needle=None): if params is None: params = {"p": 1} else: params["p"] = text.parse_int(params.get("p"), 1) + if needle is None: + needle = "class='gallery-item' id='" + while True: page = self.request(url, params=params).text cnt = 0 - for post_id in text.extract_iter( - page, "class='gallery-item' id='", "'"): + for post_id in text.extract_iter(page, needle, "'"): cnt += 1 yield post_id @@ -200,6 +202,21 @@ class AryionGalleryExtractor(AryionExtractor): return util.advance(self._pagination_next(url), self.offset) +class AryionFavoriteExtractor(AryionExtractor): + """Extractor for a user's favorites gallery""" + subcategory = "favorite" + directory_fmt = ("{category}", "{user!l}", "favorites") + archive_fmt = "f_{user}_{id}" + categorytransfer = True + pattern = BASE_PATTERN + r"/favorites/([^/?#]+)" + example = "https://aryion.com/g4/favorites/USER" + + def posts(self): + url = "{}/g4/favorites/{}".format(self.root, self.user) + return self._pagination_params( + url, None, "class='gallery-item favorite' id='") + + class AryionTagExtractor(AryionExtractor): """Extractor for tag searches on eka's portal""" subcategory = "tag" diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py index ad0caf9..f24059f 100644 --- a/gallery_dl/extractor/behance.py +++ b/gallery_dl/extractor/behance.py @@ -152,8 +152,16 @@ class BehanceGalleryExtractor(BehanceExtractor): continue if mtype == "image": - url = module["imageSizes"]["size_original"]["url"] - append((url, module)) + sizes = { + size["url"].rsplit("/", 2)[1]: size + for size in module["imageSizes"]["allAvailable"] + } + size = (sizes.get("source") or + sizes.get("max_3840") or + sizes.get("fs") or + sizes.get("hd") or + sizes.get("disp")) + append((size["url"], module)) elif mtype == "video": try: diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index cbd0e07..7e26f38 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -29,16 +29,21 @@ class BooruExtractor(BaseExtractor): url_key = self.config("url") if url_key: - self._file_url = operator.itemgetter(url_key) + if isinstance(url_key, (list, tuple)): + self._file_url = self._file_url_list + self._file_url_keys = url_key + else: + self._file_url = operator.itemgetter(url_key) for post in self.posts(): try: url = self._file_url(post) if url[0] == 
"/": url = self.root + url - except (KeyError, TypeError): - self.log.debug("Unable to fetch download URL for post %s " - "(md5: %s)", post.get("id"), post.get("md5")) + except Exception as exc: + self.log.debug("%s: %s", exc.__class__.__name__, exc) + self.log.warning("Unable to fetch download URL for post %s " + "(md5: %s)", post.get("id"), post.get("md5")) continue if fetch_html: @@ -73,6 +78,11 @@ class BooruExtractor(BaseExtractor): _file_url = operator.itemgetter("file_url") + def _file_url_list(self, post): + urls = (post[key] for key in self._file_url_keys if post.get(key)) + post["_fallback"] = it = iter(urls) + return next(it) + def _prepare(self, post): """Prepare a 'post's metadata""" diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index a093347..77f0de6 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -13,7 +13,7 @@ from .. import text BASE_PATTERN = ( r"(?:https?://)?(?:app\.)?(bunkr+" - r"\.(?:s[kiu]|ru|la|is|to|ac|black|cat|media|red|site|ws))" + r"\.(?:s[kiu]|fi|ru|la|is|to|ac|black|cat|media|red|site|ws))" ) LEGACY_DOMAINS = { diff --git a/gallery_dl/extractor/cien.py b/gallery_dl/extractor/cien.py new file mode 100644 index 0000000..bae86d0 --- /dev/null +++ b/gallery_dl/extractor/cien.py @@ -0,0 +1,199 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://ci-en.net/""" + +from .common import Extractor, Message +from .. import text, util + +BASE_PATTERN = r"(?:https?://)?ci-en\.(?:net|dlsite\.com)" + + +class CienExtractor(Extractor): + category = "cien" + root = "https://ci-en.net" + request_interval = (1.0, 2.0) + + def __init__(self, match): + self.root = text.root_from_url(match.group(0)) + Extractor.__init__(self, match) + + def _init(self): + self.cookies.set("accepted_rating", "r18g", domain="ci-en.dlsite.com") + + def _pagination_articles(self, url, params): + data = {"_extractor": CienArticleExtractor} + params["page"] = text.parse_int(params.get("page"), 1) + + while True: + page = self.request(url, params=params).text + + for card in text.extract_iter( + page, ' class="c-cardCase-item', '</div>'): + article_url = text.extr(card, ' href="', '"') + yield Message.Queue, article_url, data + + if ' rel="next"' not in page: + return + params["page"] += 1 + + +class CienArticleExtractor(CienExtractor): + subcategory = "article" + filename_fmt = "{num:>02} {filename}.{extension}" + directory_fmt = ("{category}", "{author[name]}", "{post_id} {name}") + archive_fmt = "{post_id}_{num}" + pattern = BASE_PATTERN + r"/creator/(\d+)/article/(\d+)" + example = "https://ci-en.net/creator/123/article/12345" + + def items(self): + url = "{}/creator/{}/article/{}".format( + self.root, self.groups[0], self.groups[1]) + page = self.request(url, notfound="article").text + + post = util.json_loads(text.extr( + page, '<script type="application/ld+json">', '</script>'))[0] + + files = self._extract_files(post.get("articleBody") or page) + + post["post_url"] = url + post["post_id"] = text.parse_int(self.groups[1]) + post["count"] = len(files) + post["date"] = text.parse_datetime(post["datePublished"]) + + try: + del post["publisher"] + del post["sameAs"] + except Exception: + pass + + yield Message.Directory, post + for post["num"], file in enumerate(files, 1): + post.update(file) + if "extension" not in 
+    def _extract_files(self, page):
+        files = []
+
+        filetypes = self.config("files")
+        if filetypes is None:
+            self._extract_files_image(page, files)
+            self._extract_files_video(page, files)
+            self._extract_files_download(page, files)
+            self._extract_files_gallery(page, files)
+        else:
+            generators = {
+                "image"   : self._extract_files_image,
+                "video"   : self._extract_files_video,
+                "download": self._extract_files_download,
+                "gallery" : self._extract_files_gallery,
+                "gallerie": self._extract_files_gallery,
+            }
+            if isinstance(filetypes, str):
+                filetypes = filetypes.split(",")
+            for ft in filetypes:
+                generators[ft.rstrip("s")](page, files)
+
+        return files
+
+    def _extract_files_image(self, page, files):
+        for image in text.extract_iter(
+                page, 'class="file-player-image"', "</figure>"):
+            size = text.extr(image, ' data-size="', '"')
+            w, _, h = size.partition("x")
+
+            files.append({
+                "url"   : text.extr(image, ' data-raw="', '"'),
+                "width" : text.parse_int(w),
+                "height": text.parse_int(h),
+                "type"  : "image",
+            })
+
+    def _extract_files_video(self, page, files):
+        for video in text.extract_iter(
+                page, "<vue-file-player", "</vue-file-player>"):
+            path = text.extr(video, ' base-path="', '"')
+            name = text.extr(video, ' file-name="', '"')
+            auth = text.extr(video, ' auth-key="', '"')
+
+            file = text.nameext_from_url(name)
+            file["url"] = "{}video-web.mp4?{}".format(path, auth)
+            file["type"] = "video"
+            files.append(file)
+
+    def _extract_files_download(self, page, files):
+        for download in text.extract_iter(
+                page, 'class="downloadBlock', "</div>"):
+            name = text.extr(download, "<p>", "<")
+
+            file = text.nameext_from_url(name.rpartition(" ")[0])
+            file["url"] = text.extr(download, ' href="', '"')
+            file["type"] = "download"
+            files.append(file)
+
+    def _extract_files_gallery(self, page, files):
+        for gallery in text.extract_iter(
+                page, "<vue-image-gallery", "</vue-image-gallery>"):
+
+            url = self.root + "/api/creator/gallery/images"
+            params = {
+                "hash"      : text.extr(gallery, ' hash="', '"'),
+                "gallery_id": text.extr(gallery, ' gallery-id="', '"'),
+                "time"      : text.extr(gallery, ' time="', '"'),
+            }
+            data = self.request(url, params=params).json()
+
+            url = self.root + "/api/creator/gallery/imagePath"
+            for params["page"], params["file_id"] in enumerate(
+                    data["imgList"]):
+                path = self.request(url, params=params).json()["path"]
+
+                file = params.copy()
+                file["url"] = path
+                files.append(file)
+
+
+class CienCreatorExtractor(CienExtractor):
+    subcategory = "creator"
+    pattern = BASE_PATTERN + r"/creator/(\d+)(?:/article(?:\?([^#]+))?)?/?$"
+    example = "https://ci-en.net/creator/123"
+
+    def items(self):
+        url = "{}/creator/{}/article".format(self.root, self.groups[0])
+        params = text.parse_query(self.groups[1])
+        params["mode"] = "list"
+        return self._pagination_articles(url, params)
+
+
+class CienRecentExtractor(CienExtractor):
+    subcategory = "recent"
+    pattern = BASE_PATTERN + r"/mypage/recent(?:\?([^#]+))?"
+    example = "https://ci-en.net/mypage/recent"
+
+    def items(self):
+        url = self.root + "/mypage/recent"
+        params = text.parse_query(self.groups[0])
+        return self._pagination_articles(url, params)
+
+
+class CienFollowingExtractor(CienExtractor):
+    subcategory = "following"
+    pattern = BASE_PATTERN + r"/mypage/subscription(/following)?"
+    example = "https://ci-en.net/mypage/subscription"
+
+    def items(self):
+        url = self.root + "/mypage/subscription" + (self.groups[0] or "")
+        page = self.request(url).text
+        data = {"_extractor": CienCreatorExtractor}
+
+        for subscription in text.extract_iter(
+                page, 'class="c-grid-subscriptionInfo', '</figure>'):
+            url = text.extr(subscription, ' href="', '"')
+            yield Message.Queue, url, data
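CienArticleExtractor gathers four kinds of files, and the `files` option selects among them; it accepts either a list or a comma-separated string, and a trailing "s" is stripped so that "images" or "galleries" also resolve (the "gallerie" key exists exactly for that plural). A standalone sketch of the dispatch, with placeholder handler names standing in for the bound methods:

```python
GENERATORS = {
    "image"   : "extract_images",     # placeholders, not the real methods
    "video"   : "extract_videos",
    "download": "extract_downloads",
    "gallery" : "extract_galleries",
    "gallerie": "extract_galleries",  # "galleries".rstrip("s") -> "gallerie"
}

def select(filetypes):
    if isinstance(filetypes, str):        # "image,video" and ["image", ...]
        filetypes = filetypes.split(",")  # are both accepted
    return [GENERATORS[ft.rstrip("s")] for ft in filetypes]

print(select("images,galleries"))
# ['extract_images', 'extract_galleries']
```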
+ example = "https://ci-en.net/mypage/subscription" + + def items(self): + url = self.root + "/mypage/subscription" + (self.groups[0] or "") + page = self.request(url).text + data = {"_extractor": CienCreatorExtractor} + + for subscription in text.extract_iter( + page, 'class="c-grid-subscriptionInfo', '</figure>'): + url = text.extr(subscription, ' href="', '"') + yield Message.Queue, url, data diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index d7a41bc..df70571 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -378,7 +378,7 @@ class Extractor(): useragent = self.config("user-agent") if useragent is None: useragent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; " - "rv:109.0) Gecko/20100101 Firefox/115.0") + "rv:128.0) Gecko/20100101 Firefox/128.0") elif useragent == "browser": useragent = _browser_useragent() headers["User-Agent"] = useragent @@ -390,6 +390,8 @@ class Extractor(): headers["Accept-Encoding"] = "gzip, deflate, br" else: headers["Accept-Encoding"] = "gzip, deflate" + if ZSTD: + headers["Accept-Encoding"] += ", zstd" referer = self.config("referer", self.referer) if referer: @@ -789,10 +791,11 @@ class BaseExtractor(Extractor): instances = () def __init__(self, match): - Extractor.__init__(self, match) if not self.category: + self.groups = match.groups() + self.match = match self._init_category() - self._cfgpath = ("extractor", self.category, self.subcategory) + Extractor.__init__(self, match) def _init_category(self): for index, group in enumerate(self.groups): @@ -911,13 +914,12 @@ _browser_cookies = {} HTTP_HEADERS = { "firefox": ( ("User-Agent", "Mozilla/5.0 ({}; " - "rv:109.0) Gecko/20100101 Firefox/115.0"), + "rv:128.0) Gecko/20100101 Firefox/128.0"), ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9," - "image/avif,image/webp,*/*;q=0.8"), + "image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8"), ("Accept-Language", "en-US,en;q=0.5"), ("Accept-Encoding", None), ("Referer", None), - ("DNT", "1"), ("Connection", "keep-alive"), ("Upgrade-Insecure-Requests", "1"), ("Cookie", None), @@ -991,6 +993,12 @@ try: except AttributeError: BROTLI = False +# detect zstandard support +try: + ZSTD = urllib3.response.HAS_ZSTD +except AttributeError: + ZSTD = False + # set (urllib3) warnings filter action = config.get((), "warnings", "default") if action: diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 2199cc8..a70710c 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -846,55 +846,6 @@ class DeviantartStatusExtractor(DeviantartExtractor): ) -class DeviantartPopularExtractor(DeviantartExtractor): - """Extractor for popular deviations""" - subcategory = "popular" - directory_fmt = ("{category}", "Popular", - "{popular[range]}", "{popular[search]}") - archive_fmt = "P_{popular[range]}_{popular[search]}_{index}.{extension}" - pattern = (r"(?:https?://)?www\.deviantart\.com/(?:" - r"(?:deviations/?)?\?order=(popular-[^/?#]+)" - r"|((?:[\w-]+/)*)(popular-[^/?#]+)" - r")/?(?:\?([^#]*))?") - example = "https://www.deviantart.com/popular-24-hours/" - - def __init__(self, match): - DeviantartExtractor.__init__(self, match) - self.user = "" - - trange1, path, trange2, query = match.groups() - query = text.parse_query(query) - self.search_term = query.get("q") - - trange = trange1 or trange2 or query.get("order", "") - if trange.startswith("popular-"): - trange = trange[8:] - self.time_range = { - "newest" : "now", - "most-recent" : 
"now", - "this-week" : "1week", - "this-month" : "1month", - "this-century": "alltime", - "all-time" : "alltime", - }.get(trange, "alltime") - - self.popular = { - "search": self.search_term or "", - "range" : trange or "all-time", - "path" : path.strip("/") if path else "", - } - - def deviations(self): - if self.time_range == "now": - return self.api.browse_newest(self.search_term, self.offset) - return self.api.browse_popular( - self.search_term, self.time_range, self.offset) - - def prepare(self, deviation): - DeviantartExtractor.prepare(self, deviation) - deviation["popular"] = self.popular - - class DeviantartTagExtractor(DeviantartExtractor): """Extractor for deviations from tag searches""" subcategory = "tag" @@ -1077,14 +1028,14 @@ class DeviantartGallerySearchExtractor(DeviantartExtractor): class DeviantartFollowingExtractor(DeviantartExtractor): """Extractor for user's watched users""" subcategory = "following" - pattern = BASE_PATTERN + "/about#watching$" + pattern = BASE_PATTERN + "/(?:about#)?watching" example = "https://www.deviantart.com/USER/about#watching" def items(self): - eclipse_api = DeviantartEclipseAPI(self) + api = DeviantartOAuthAPI(self) - for user in eclipse_api.user_watching(self.user, self.offset): - url = "{}/{}".format(self.root, user["username"]) + for user in api.user_friends(self.user): + url = "{}/{}".format(self.root, user["user"]["username"]) user["_extractor"] = DeviantartUserExtractor yield Message.Queue, url, user @@ -1095,7 +1046,7 @@ class DeviantartFollowingExtractor(DeviantartExtractor): class DeviantartOAuthAPI(): """Interface for the DeviantArt OAuth API - Ref: https://www.deviantart.com/developers/http/v1/20160316 + https://www.deviantart.com/developers/http/v1/20160316 """ CLIENT_ID = "5388" CLIENT_SECRET = "76b08c69cfb27f26d6161f9ab6d061a1" @@ -1188,29 +1139,6 @@ class DeviantartOAuthAPI(): "mature_content": self.mature} return self._pagination(endpoint, params, public=False, unpack=True) - def browse_newest(self, query=None, offset=0): - """Browse newest deviations""" - endpoint = "/browse/newest" - params = { - "q" : query, - "limit" : 120, - "offset" : offset, - "mature_content": self.mature, - } - return self._pagination(endpoint, params) - - def browse_popular(self, query=None, timerange=None, offset=0): - """Yield popular deviations""" - endpoint = "/browse/popular" - params = { - "q" : query, - "limit" : 120, - "timerange" : timerange, - "offset" : offset, - "mature_content": self.mature, - } - return self._pagination(endpoint, params) - def browse_tags(self, tag, offset=0): """ Browse a tag """ endpoint = "/browse/tags" @@ -1223,11 +1151,12 @@ class DeviantartOAuthAPI(): return self._pagination(endpoint, params) def browse_user_journals(self, username, offset=0): - """Yield all journal entries of a specific user""" - endpoint = "/browse/user/journals" - params = {"username": username, "offset": offset, "limit": 50, - "mature_content": self.mature, "featured": "false"} - return self._pagination(endpoint, params) + journals = filter( + lambda post: "/journal/" in post["url"], + self.user_profile_posts(username)) + if offset: + journals = util.advance(journals, offset) + return journals def collections(self, username, folder_id, offset=0): """Yield all Deviation-objects contained in a collection folder""" @@ -1339,16 +1268,10 @@ class DeviantartOAuthAPI(): "mature_content": self.mature} return self._pagination_list(endpoint, params) - @memcache(keyarg=1) - def user_profile(self, username): - """Get user profile information""" - 
endpoint = "/user/profile/" + username - return self._call(endpoint, fatal=False) - - def user_statuses(self, username, offset=0): - """Yield status updates of a specific user""" - endpoint = "/user/statuses/" - params = {"username": username, "offset": offset, "limit": 50} + def user_friends(self, username, offset=0): + """Get the users list of friends""" + endpoint = "/user/friends/" + username + params = {"limit": 50, "offset": offset, "mature_content": self.mature} return self._pagination(endpoint, params) def user_friends_watch(self, username): @@ -1376,6 +1299,27 @@ class DeviantartOAuthAPI(): endpoint, method="POST", public=False, fatal=False, ).get("success") + @memcache(keyarg=1) + def user_profile(self, username): + """Get user profile information""" + endpoint = "/user/profile/" + username + return self._call(endpoint, fatal=False) + + def user_profile_posts(self, username): + endpoint = "/user/profile/posts" + params = {"username": username, "limit": 50, + "mature_content": self.mature} + return self._pagination(endpoint, params) + + def user_statuses(self, username, offset=0): + """Yield status updates of a specific user""" + statuses = filter( + lambda post: "/status-update/" in post["url"], + self.user_profile_posts(username)) + if offset: + statuses = util.advance(statuses, offset) + return statuses + def authenticate(self, refresh_token_key): """Authenticate the application by requesting an access token""" self.headers["Authorization"] = \ @@ -1464,7 +1408,7 @@ class DeviantartOAuthAPI(): self.log.error(msg) return data - def _switch_tokens(self, results, params): + def _should_switch_tokens(self, results, params): if len(results) < params["limit"]: return True @@ -1496,7 +1440,7 @@ class DeviantartOAuthAPI(): results = [item["journal"] for item in results if "journal" in item] if extend: - if public and self._switch_tokens(results, params): + if public and self._should_switch_tokens(results, params): if self.refresh_token_key: self.log.debug("Switching to private access token") public = False @@ -1540,6 +1484,11 @@ class DeviantartOAuthAPI(): return params["offset"] = int(params["offset"]) + len(results) + def _pagination_list(self, endpoint, params, key="results"): + result = [] + result.extend(self._pagination(endpoint, params, False, key=key)) + return result + @staticmethod def _shared_content(results): """Return an iterable of shared deviations in 'results'""" @@ -1548,11 +1497,6 @@ class DeviantartOAuthAPI(): if "deviation" in item: yield item["deviation"] - def _pagination_list(self, endpoint, params, key="results"): - result = [] - result.extend(self._pagination(endpoint, params, False, key=key)) - return result - def _metadata(self, deviations): """Add extended metadata to each deviation object""" if len(deviations) <= self.limit: diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py index 26f2184..2f0230a 100644 --- a/gallery_dl/extractor/directlink.py +++ b/gallery_dl/extractor/directlink.py @@ -18,7 +18,8 @@ class DirectlinkExtractor(Extractor): filename_fmt = "{domain}/{path}/{filename}.{extension}" archive_fmt = filename_fmt pattern = (r"(?i)https?://(?P<domain>[^/?#]+)/(?P<path>[^?#]+\." 
- r"(?:jpe?g|jpe|png|gif|web[mp]|mp4|mkv|og[gmv]|opus))" + r"(?:jpe?g|jpe|png|gif|bmp|svg|web[mp]|avif|heic|psd" + r"|mp4|m4v|mov|mkv|og[gmv]|wav|mp3|opus|zip|rar|7z|pdf|swf))" r"(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?$") example = "https://en.wikipedia.org/static/images/project-logos/enwiki.png" diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py index 733d0d8..583869f 100644 --- a/gallery_dl/extractor/dynastyscans.py +++ b/gallery_dl/extractor/dynastyscans.py @@ -66,6 +66,8 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor): text.extr(group, ' alt="', '"')), "date" : text.parse_datetime(extr( '"icon-calendar"></i> ', '<'), "%b %d, %Y"), + "tags" : text.split_html(extr( + "class='tags'>", "<div id='chapter-actions'")), "lang" : "en", "language": "English", } diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py index 8c9da2f..e6d136f 100644 --- a/gallery_dl/extractor/erome.py +++ b/gallery_dl/extractor/erome.py @@ -46,18 +46,24 @@ class EromeExtractor(Extractor): page, 'href="https://www.erome.com/', '"', pos) urls = [] + date = None groups = page.split('<div class="media-group"') for group in util.advance(groups, 1): url = (text.extr(group, '<source src="', '"') or text.extr(group, 'data-src="', '"')) if url: urls.append(url) + if not date: + ts = text.extr(group, '?v=', '"') + if len(ts) > 1: + date = text.parse_timestamp(ts) data = { "album_id" : album_id, "title" : text.unescape(title), "user" : text.unquote(user), "count" : len(urls), + "date" : date, "_http_headers": {"Referer": url}, } diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 1805403..1b4f995 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -394,6 +394,9 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self.original = False return self.data["_url_1280"] + if " temporarily banned " in page: + raise exception.AuthorizationError("Temporarily Banned") + self._report_limits() return True diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py deleted file mode 100644 index 650a707..0000000 --- a/gallery_dl/extractor/fallenangels.py +++ /dev/null @@ -1,84 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2017-2023 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://www.fascans.com/""" - -from .common import ChapterExtractor, MangaExtractor -from .. 
diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py
deleted file mode 100644
index 650a707..0000000
--- a/gallery_dl/extractor/fallenangels.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2017-2023 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://www.fascans.com/"""
-
-from .common import ChapterExtractor, MangaExtractor
-from .. import text, util
-
-
-class FallenangelsChapterExtractor(ChapterExtractor):
-    """Extractor for manga chapters from fascans.com"""
-    category = "fallenangels"
-    pattern = (r"(?:https?://)?(manga|truyen)\.fascans\.com"
-               r"/manga/([^/?#]+)/([^/?#]+)")
-    example = "https://manga.fascans.com/manga/NAME/CHAPTER/"
-
-    def __init__(self, match):
-        self.version, self.manga, self.chapter = match.groups()
-        url = "https://{}.fascans.com/manga/{}/{}/1".format(
-            self.version, self.manga, self.chapter)
-        ChapterExtractor.__init__(self, match, url)
-
-    def metadata(self, page):
-        extr = text.extract_from(page)
-        lang = "vi" if self.version == "truyen" else "en"
-        chapter, sep, minor = self.chapter.partition(".")
-        return {
-            "manga"   : extr('name="description" content="', ' Chapter '),
-            "title"   : extr(': ', ' - Page 1'),
-            "chapter" : chapter,
-            "chapter_minor": sep + minor,
-            "lang"    : lang,
-            "language": util.code_to_language(lang),
-        }
-
-    @staticmethod
-    def images(page):
-        return [
-            (img["page_image"], None)
-            for img in util.json_loads(
-                text.extr(page, "var pages = ", ";")
-            )
-        ]
-
-
-class FallenangelsMangaExtractor(MangaExtractor):
-    """Extractor for manga from fascans.com"""
-    chapterclass = FallenangelsChapterExtractor
-    category = "fallenangels"
-    pattern = r"(?:https?://)?((manga|truyen)\.fascans\.com/manga/[^/]+)/?$"
-    example = "https://manga.fascans.com/manga/NAME"
-
-    def __init__(self, match):
-        url = "https://" + match.group(1)
-        self.lang = "vi" if match.group(2) == "truyen" else "en"
-        MangaExtractor.__init__(self, match, url)
-
-    def chapters(self, page):
-        extr = text.extract_from(page)
-        results = []
-        language = util.code_to_language(self.lang)
-        while extr('<li style="', '"'):
-            vol = extr('class="volume-', '"')
-            url = extr('href="', '"')
-            cha = extr('>', '<')
-            title = extr('<em>', '</em>')
-
-            manga, _, chapter = cha.rpartition(" ")
-            chapter, dot, minor = chapter.partition(".")
-            results.append((url, {
-                "manga"   : manga,
-                "title"   : text.unescape(title),
-                "volume"  : text.parse_int(vol),
-                "chapter" : text.parse_int(chapter),
-                "chapter_minor": dot + minor,
-                "lang"    : self.lang,
-                "language": language,
-            }))
-        return results
diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py
index 6040187..f48a984 100644
--- a/gallery_dl/extractor/furaffinity.py
+++ b/gallery_dl/extractor/furaffinity.py
@@ -113,6 +113,12 @@ class FuraffinityExtractor(Extractor):
             data["gender"] = rh(extr('>Gender</strong>', '</div>'))
             data["width"] = pi(extr("<span>", "x"))
             data["height"] = pi(extr("", "p"))
+            data["folders"] = folders = []
+            for folder in extr(
+                    "<h3>Listed in Folders</h3>", "</section>").split("</a>"):
+                folder = rh(folder)
+                if folder:
+                    folders.append(folder)
         else:
             # old site layout
             data["title"] = text.unescape(extr("<h2>", "</h2>"))
@@ -132,11 +138,14 @@ class FuraffinityExtractor(Extractor):
             data["_description"] = extr(
                 '<td valign="top" align="left" width="70%" class="alt1" '
                 'style="padding:8px">', ' </td>')
+            data["folders"] = ()  # folders not present in old layout
 
         data["artist_url"] = data["artist"].replace("_", "").lower()
         data["user"] = self.user or data["artist_url"]
         data["date"] = text.parse_timestamp(data["filename"].partition(".")[0])
         data["description"] = self._process_description(data["_description"])
+        data["thumbnail"] = "https://t.furaffinity.net/{}@600-{}.jpg".format(
+            post_id, path.rsplit("/", 2)[1])
 
         return data
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index 8d8b8ad..fbbd26c 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -36,7 +36,9 @@ class GelbooruV02Extractor(booru.BooruExtractor):
         params["pid"] = self.page_start
         params["limit"] = self.per_page
 
-        post = None
+        post = total = None
+        count = 0
+
         while True:
             try:
                 root = self._api_request(params)
@@ -50,12 +52,29 @@ class GelbooruV02Extractor(booru.BooruExtractor):
                     params["pid"] = 0
                     continue
 
+            if total is None:
+                try:
+                    total = int(root.attrib["count"])
+                    self.log.debug("%s posts in total", total)
+                except Exception as exc:
+                    total = 0
+                    self.log.debug(
+                        "Failed to get total number of posts (%s: %s)",
+                        exc.__class__.__name__, exc)
+
             post = None
             for post in root:
                 yield post.attrib
 
-            if len(root) < self.per_page:
-                return
+            num = len(root)
+            count += num
+
+            if num < self.per_page:
+                if not total or count >= total:
+                    return
+                if not num:
+                    self.log.debug("Empty response - Retrying")
+                    continue
+
             params["pid"] += 1
 
     def _pagination_html(self, params):
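Instead of stopping on the first short page, the rewritten gelbooru_v02 pagination remembers the `count` attribute of the first API response and keeps requesting until that many posts were actually delivered, retrying otherwise-empty pages. A compressed sketch of the decision after each response (standalone; `num` is the size of the current page, `seen` the running total):

```python
def after_page(num, seen, total, per_page):
    """Return 'stop', 'retry', or 'next' after one API response."""
    if num < per_page:
        if not total or seen >= total:
            return "stop"    # no known total, or everything delivered
        if not num:
            return "retry"   # empty response although posts are missing
    return "next"            # request the next pid

print(after_page(num=0,  seen=90,  total=200, per_page=100))  # retry
print(after_page(num=40, seen=200, total=200, per_page=100))  # stop
```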
diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py
index 97b7844..286ee38 100644
--- a/gallery_dl/extractor/hentainexus.py
+++ b/gallery_dl/extractor/hentainexus.py
@@ -70,10 +70,13 @@ class HentainexusGalleryExtractor(GalleryExtractor):
         for img in imgs:
             img["_http_headers"] = headers
 
-        return [
-            (img["image"], img)
-            for img in imgs
-        ]
+        results = []
+        for img in imgs:
+            try:
+                results.append((img["image"], img))
+            except KeyError:
+                pass
+        return results
 
     @staticmethod
     def _decode(data):
diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py
index a2b51be..34fbabd 100644
--- a/gallery_dl/extractor/hotleak.py
+++ b/gallery_dl/extractor/hotleak.py
@@ -23,6 +23,12 @@ class HotleakExtractor(Extractor):
 
     def items(self):
         for post in self.posts():
+            if self.type == "photo":
+                post["url"] = (
+                    post["url"]
+                    .replace("/storage/storage/", "/storage/")
+                    .replace("_thumb.", ".")
+                )
             post["_http_expected_status"] = (404,)
             yield Message.Directory, post
             yield Message.Url, post["url"], post
diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py
index 85446c0..345f51d 100644
--- a/gallery_dl/extractor/imagefap.py
+++ b/gallery_dl/extractor/imagefap.py
@@ -19,7 +19,7 @@ class ImagefapExtractor(Extractor):
     category = "imagefap"
     root = "https://www.imagefap.com"
     directory_fmt = ("{category}", "{gallery_id} {title}")
-    filename_fmt = "{category}_{gallery_id}_{filename}.{extension}"
+    filename_fmt = "{category}_{gallery_id}_{num:04}_{filename}.{extension}"
     archive_fmt = "{gallery_id}_{image_id}"
     request_interval = (2.0, 4.0)
diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py
index 2ae8cbe..f3098f1 100644
--- a/gallery_dl/extractor/inkbunny.py
+++ b/gallery_dl/extractor/inkbunny.py
@@ -246,14 +246,12 @@ class InkbunnyFollowingExtractor(InkbunnyExtractor):
         data = {"_extractor": InkbunnyUserExtractor}
 
         while True:
-            cnt = 0
             for user in text.extract_iter(
                     page, '<a class="widget_userNameSmall" href="', '"',
                     page.index('id="changethumboriginal_form"')):
-                cnt += 1
                 yield Message.Queue, self.root + user, data
 
-            if cnt < 20:
+            if "<a title='next page' " not in page:
                 return
             params["page"] += 1
             page = self.request(url, params=params).text
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index f7a5cc7..dbe2df3 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -596,6 +596,22 @@ class InstagramTagExtractor(InstagramExtractor):
         return self.api.tags_media(self.item)
 
 
+class InstagramInfoExtractor(InstagramExtractor):
+    """Extractor for an Instagram user's profile data"""
+    subcategory = "info"
+    pattern = USER_PATTERN + r"/info"
+    example = "https://www.instagram.com/USER/info/"
+
+    def items(self):
+        screen_name = self.item
+        if screen_name.startswith("id:"):
+            user = self.api.user_by_id(screen_name[3:])
+        else:
+            user = self.api.user_by_name(screen_name)
+
+        return iter(((Message.Directory, user),))
+
+
 class InstagramAvatarExtractor(InstagramExtractor):
     """Extractor for an Instagram user's avatar"""
     subcategory = "avatar"
@@ -975,9 +991,9 @@ class InstagramGraphqlAPI():
                 if not info["has_next_page"]:
                     return extr._update_cursor(None)
                 elif not data["edges"]:
-                    s = "" if self.item.endswith("s") else "s"
+                    s = "" if self.extractor.item.endswith("s") else "s"
                     raise exception.StopExtraction(
-                        "%s'%s posts are private", self.item, s)
+                        "%s'%s posts are private", self.extractor.item, s)
 
                 variables["after"] = extr._update_cursor(info["end_cursor"])
diff --git a/gallery_dl/extractor/koharu.py b/gallery_dl/extractor/koharu.py
new file mode 100644
index 0000000..979b1a2
--- /dev/null
+++ b/gallery_dl/extractor/koharu.py
@@ -0,0 +1,221 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://koharu.to/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text, exception
+from ..cache import cache
+
+BASE_PATTERN = r"(?i)(?:https?://)?(?:koharu|anchira)\.to"
+
+
+class KoharuExtractor(Extractor):
+    """Base class for koharu extractors"""
+    category = "koharu"
+    root = "https://koharu.to"
+    root_api = "https://api.koharu.to"
+    request_interval = (0.5, 1.5)
+
+    def _init(self):
+        self.headers = {
+            "Accept" : "*/*",
+            "Referer": self.root + "/",
+            "Origin" : self.root,
+        }
+
+    def _pagination(self, endpoint, params):
+        url_api = self.root_api + endpoint
+
+        while True:
+            data = self.request(
+                url_api, params=params, headers=self.headers).json()
+
+            try:
+                entries = data["entries"]
+            except KeyError:
+                return
+
+            for entry in entries:
+                url = "{}/g/{}/{}".format(
+                    self.root, entry["id"], entry["public_key"])
+                entry["_extractor"] = KoharuGalleryExtractor
+                yield Message.Queue, url, entry
+
+            try:
+                if data["limit"] * data["page"] >= data["total"]:
+                    return
+            except Exception:
+                pass
+            params["page"] += 1
+
+
+class KoharuGalleryExtractor(KoharuExtractor, GalleryExtractor):
+    """Extractor for koharu galleries"""
+    filename_fmt = "{num:>03}.{extension}"
+    directory_fmt = ("{category}", "{id} {title}")
+    archive_fmt = "{id}_{num}"
+    request_interval = 0.0
+    pattern = BASE_PATTERN + r"/(?:g|reader)/(\d+)/(\w+)"
+    example = "https://koharu.to/g/12345/67890abcde/"
+
+    TAG_TYPES = {
+        0 : "general",
+        1 : "artist",
+        2 : "circle",
+        3 : "parody",
+        4 : "magazine",
+        5 : "character",
+        6 : "",
+        7 : "uploader",
+        8 : "male",
+        9 : "female",
+        10: "mixed",
+        11: "language",
+        12: "other",
+    }
+
+    def __init__(self, match):
+        GalleryExtractor.__init__(self, match)
+        self.gallery_url = None
+
+    def _init(self):
+        self.headers = {
+            "Accept" : "*/*",
+            "Referer": self.root + "/",
+            "Origin" : self.root,
+        }
+
+        self.fmt = self.config("format")
+        self.cbz = self.config("cbz", True)
+
+        if self.cbz:
+            self.filename_fmt = "{id} {title}.{extension}"
+            self.directory_fmt = ("{category}",)
"{}/books/detail/{}/{}".format( + self.root_api, self.groups[0], self.groups[1]) + self.data = data = self.request(url, headers=self.headers).json() + + tags = [] + for tag in data["tags"]: + name = tag["name"] + namespace = tag.get("namespace", 0) + tags.append(self.TAG_TYPES[namespace] + ":" + name) + data["tags"] = tags + data["date"] = text.parse_timestamp(data["created_at"] // 1000) + + try: + if self.cbz: + data["count"] = len(data["thumbnails"]["entries"]) + del data["thumbnails"] + del data["rels"] + except Exception: + pass + + return data + + def images(self, _): + data = self.data + fmt = self._select_format(data["data"]) + + url = "{}/books/data/{}/{}/{}/{}".format( + self.root_api, + data["id"], data["public_key"], + fmt["id"], fmt["public_key"], + ) + params = { + "v": data["updated_at"], + "w": fmt["w"], + } + + if self.cbz: + params["action"] = "dl" + base = self.request( + url, method="POST", params=params, headers=self.headers, + ).json()["base"] + url = "{}?v={}&w={}".format(base, data["updated_at"], fmt["w"]) + info = text.nameext_from_url(base) + if not info["extension"]: + info["extension"] = "cbz" + return ((url, info),) + + data = self.request(url, params=params, headers=self.headers).json() + base = data["base"] + + results = [] + for entry in data["entries"]: + dimensions = entry["dimensions"] + info = { + "w": dimensions[0], + "h": dimensions[1], + "_http_headers": self.headers, + } + results.append((base + entry["path"], info)) + return results + + def _select_format(self, formats): + if not self.fmt or self.fmt == "original": + fmtid = "0" + else: + fmtid = str(self.fmt) + + try: + fmt = formats[fmtid] + except KeyError: + raise exception.NotFoundError("format") + + fmt["w"] = fmtid + return fmt + + +class KoharuSearchExtractor(KoharuExtractor): + """Extractor for koharu search results""" + subcategory = "search" + pattern = BASE_PATTERN + r"/\?([^#]*)" + example = "https://koharu.to/?s=QUERY" + + def items(self): + params = text.parse_query(self.groups[0]) + params["page"] = text.parse_int(params.get("page"), 1) + return self._pagination("/books", params) + + +class KoharuFavoriteExtractor(KoharuExtractor): + """Extractor for koharu favorites""" + subcategory = "favorite" + pattern = BASE_PATTERN + r"/favorites(?:\?([^#]*))?" 
+ example = "https://koharu.to/favorites" + + def items(self): + self.login() + + params = text.parse_query(self.groups[0]) + params["page"] = text.parse_int(params.get("page"), 1) + return self._pagination("/favorites", params) + + def login(self): + username, password = self._get_auth_info() + if username: + self.headers["Authorization"] = \ + "Bearer " + self._login_impl(username, password) + return + + raise exception.AuthenticationError("Username and password required") + + @cache(maxage=86400, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + + url = "https://auth.koharu.to/login" + data = {"uname": username, "passwd": password} + response = self.request( + url, method="POST", headers=self.headers, data=data) + + return response.json()["session"] diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 60cca22..b01c591 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -120,7 +120,8 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): ] else: pos = page.find('id="view-center"') + 1 - return (text.extr(page, 'itemprop="image" src="', '"', pos),) + # do NOT use text.extr() here, as it doesn't support a pos argument + return (text.extract(page, 'itemprop="image" src="', '"', pos)[0],) @staticmethod def _extract_user_name(page): diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py index b21e1eb..2330b08 100644 --- a/gallery_dl/extractor/paheal.py +++ b/gallery_dl/extractor/paheal.py @@ -77,6 +77,7 @@ class PahealTagExtractor(PahealExtractor): pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net" r"/post/list/([^/?#]+)") example = "https://rule34.paheal.net/post/list/TAG/1" + page_start = 1 per_page = 70 def __init__(self, match): @@ -87,11 +88,16 @@ class PahealTagExtractor(PahealExtractor): if self.config("metadata"): self._extract_data = self._extract_data_ex + def skip(self, num): + pages = num // self.per_page + self.page_start += pages + return pages * self.per_page + def get_metadata(self): return {"search_tags": self.tags} def get_posts(self): - pnum = 1 + pnum = self.page_start while True: url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum) page = self.request(url).text diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py index 115de9a..271fa50 100644 --- a/gallery_dl/extractor/readcomiconline.py +++ b/gallery_dl/extractor/readcomiconline.py @@ -78,12 +78,16 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor): } def images(self, page): - return [ - (beau(url), None) - for url in text.extract_iter( - page, "lstImages.push('", "'", - ) - ] + results = [] + + for block in page.split(" pth = '")[1:]: + pth = text.extr(block, "", "'") + for needle, repl in re.findall( + r"pth = pth\.replace\(/([^/]+)/g, [\"']([^\"']*)", block): + pth = pth.replace(needle, repl) + results.append((beau(pth), None)) + + return results class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor): @@ -116,9 +120,9 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor): def beau(url): - """https://readcomiconline.li/Scripts/rguard.min.js""" - url = url.replace("_x236", "d") - url = url.replace("_x945", "g") + """https://readcomiconline.li/Scripts/rguard.min.js?v=1.5.1""" + url = url.replace("pw_.g28x", "b") + url = url.replace("d2pr.x_27", "h") if url.startswith("https"): return url @@ -126,8 +130,8 @@ def beau(url): url, sep, rest = 
url.partition("?") containsS0 = "=s0" in url url = url[:-3 if containsS0 else -6] - url = url[4:22] + url[25:] - url = url[0:-6] + url[-2:] + url = url[15:33] + url[50:] + url = url[0:-11] + url[-2:] url = binascii.a2b_base64(url).decode() url = url[0:13] + url[17:] url = url[0:-2] + ("=s0" if containsS0 else "=s1600") diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py index 327bcd1..506f6ac 100644 --- a/gallery_dl/extractor/redgifs.py +++ b/gallery_dl/extractor/redgifs.py @@ -190,7 +190,7 @@ class RedgifsImageExtractor(RedgifsExtractor): r"(?:\w+\.)?redgifs\.com/(?:watch|ifr)|" r"(?:\w+\.)?gfycat\.com(?:/gifs/detail|/\w+)?|" r"(?:www\.)?gifdeliverynetwork\.com|" - r"i\.redgifs\.com/i)/([A-Za-z]+)") + r"i\.redgifs\.com/i)/([A-Za-z0-9]+)") example = "https://redgifs.com/watch/ID" def gifs(self): diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index caf3e16..ad3efa7 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -16,7 +16,7 @@ import collections import re BASE_PATTERN = r"(?:https?://)?" \ - r"(?:(?:chan|beta|black|white)\.sankakucomplex\.com|sankaku\.app)" \ + r"(?:(?:chan|www|beta|black|white)\.sankakucomplex\.com|sankaku\.app)" \ r"(?:/[a-z]{2})?" @@ -45,6 +45,9 @@ class SankakuExtractor(BooruExtractor): def skip(self, num): return 0 + def _init(self): + self.api = SankakuAPI(self) + def _file_url(self, post): url = post["file_url"] if not url: @@ -81,6 +84,15 @@ class SankakuExtractor(BooruExtractor): post["tags_" + key] = value post["tag_string_" + key] = " ".join(value) + def _notes(self, post, page): + if post.get("has_notes"): + post["notes"] = self.api.notes(post["id"]) + for note in post["notes"]: + note["created_at"] = note["created_at"]["s"] + note["updated_at"] = note["updated_at"]["s"] + else: + post["notes"] = () + class SankakuTagExtractor(SankakuExtractor): """Extractor for images from sankaku.app by search-tags""" @@ -109,7 +121,7 @@ class SankakuTagExtractor(SankakuExtractor): def posts(self): params = {"tags": self.tags} - return SankakuAPI(self).posts_keyset(params) + return self.api.posts_keyset(params) class SankakuPoolExtractor(SankakuExtractor): @@ -125,7 +137,7 @@ class SankakuPoolExtractor(SankakuExtractor): self.pool_id = match.group(1) def metadata(self): - pool = SankakuAPI(self).pools(self.pool_id) + pool = self.api.pools(self.pool_id) pool["tags"] = [tag["name"] for tag in pool["tags"]] pool["artist_tags"] = [tag["name"] for tag in pool["artist_tags"]] @@ -151,7 +163,7 @@ class SankakuPostExtractor(SankakuExtractor): self.post_id = match.group(1) def posts(self): - return SankakuAPI(self).posts(self.post_id) + return self.api.posts(self.post_id) class SankakuBooksExtractor(SankakuExtractor): @@ -167,7 +179,7 @@ class SankakuBooksExtractor(SankakuExtractor): def items(self): params = {"tags": self.tags, "pool_type": "0"} - for pool in SankakuAPI(self).pools_keyset(params): + for pool in self.api.pools_keyset(params): pool["_extractor"] = SankakuPoolExtractor url = "https://sankaku.app/books/{}".format(pool["id"]) yield Message.Queue, url, pool @@ -192,6 +204,10 @@ class SankakuAPI(): if not self.username: self.authenticate = util.noop + def notes(self, post_id): + params = {"lang": "en"} + return self._call("/posts/{}/notes".format(post_id), params) + def pools(self, pool_id): params = {"lang": "en"} return self._call("/pools/" + pool_id, params) diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py index e1d4153..50c21e3 
diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py
index 0abb3ab..7c760ac 100644
--- a/gallery_dl/extractor/subscribestar.py
+++ b/gallery_dl/extractor/subscribestar.py
@@ -127,6 +127,8 @@ class SubscribestarExtractor(Extractor):
         }
 
     def _parse_datetime(self, dt):
+        if dt.startswith("Updated on "):
+            dt = dt[11:]
         date = text.parse_datetime(dt, "%b %d, %Y %I:%M %p")
         if date is dt:
             date = text.parse_datetime(dt, "%B %d, %Y %I:%M %p")
diff --git a/gallery_dl/extractor/toyhouse.py b/gallery_dl/extractor/toyhouse.py
index 78ff265..64fa951 100644
--- a/gallery_dl/extractor/toyhouse.py
+++ b/gallery_dl/extractor/toyhouse.py
@@ -77,23 +77,27 @@ class ToyhouseExtractor(Extractor):
                 cnt += 1
                 yield self._parse_post(post)
 
-            if cnt == 0 and params["page"] == 1:
-                token, pos = text.extract(
-                    page, '<input name="_token" type="hidden" value="', '"')
-                if not token:
-                    return
-                data = {
-                    "_token": token,
-                    "user"  : text.extract(page, 'value="', '"', pos)[0],
-                }
-                self.request(self.root + "/~account/warnings/accept",
-                             method="POST", data=data, allow_redirects=False)
-                continue
+            if not cnt and params["page"] == 1:
+                if self._accept_content_warning(page):
+                    continue
+                return
 
             if cnt < 18:
                 return
             params["page"] += 1
 
+    def _accept_content_warning(self, page):
+        pos = page.find(' name="_token"') + 1
+        token, pos = text.extract(page, ' value="', '"', pos)
+        user , pos = text.extract(page, ' value="', '"', pos)
+        if not token or not user:
+            return False
+
+        data = {"_token": token, "user": user}
+        self.request(self.root + "/~account/warnings/accept",
+                     method="POST", data=data, allow_redirects=False)
+        return True
+
 
 class ToyhouseArtExtractor(ToyhouseExtractor):
     """Extractor for artworks of a toyhouse user"""
method="POST", data=data, allow_redirects=False) + return True + class ToyhouseArtExtractor(ToyhouseExtractor): """Extractor for artworks of a toyhouse user""" diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index c34910f..ff29c04 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -386,7 +386,7 @@ class TumblrAPI(oauth.OAuth1API): def posts(self, blog, params): """Retrieve published posts""" params["offset"] = self.extractor.config("offset") - params["limit"] = "50" + params["limit"] = 50 params["reblog_info"] = "true" params["type"] = self.posts_type params["before"] = self.before @@ -398,8 +398,14 @@ class TumblrAPI(oauth.OAuth1API): def likes(self, blog): """Retrieve liked posts""" + endpoint = "/v2/blog/{}/likes".format(blog) params = {"limit": "50", "before": self.before} - return self._pagination(blog, "/likes", params, key="liked_posts") + while True: + posts = self._call(endpoint, params)["liked_posts"] + if not posts: + return + yield from posts + params["before"] = posts[-1]["liked_timestamp"] def _call(self, endpoint, params, **kwargs): url = self.ROOT + endpoint @@ -474,6 +480,7 @@ class TumblrAPI(oauth.OAuth1API): if self.api_key: params["api_key"] = self.api_key + strategy = self.extractor.config("pagination") while True: data = self._call(endpoint, params) @@ -481,13 +488,31 @@ class TumblrAPI(oauth.OAuth1API): self.BLOG_CACHE[blog] = data["blog"] cache = False - yield from data[key] - - try: - endpoint = data["_links"]["next"]["href"] - except KeyError: - return + posts = data[key] + yield from posts - params = None - if self.api_key: - endpoint += "&api_key=" + self.api_key + if strategy == "api": + try: + endpoint = data["_links"]["next"]["href"] + except KeyError: + return + + params = None + if self.api_key: + endpoint += "&api_key=" + self.api_key + + elif strategy == "before": + if not posts: + return + timestamp = posts[-1]["timestamp"] + 1 + if params["before"] and timestamp >= params["before"]: + return + params["before"] = timestamp + params["offset"] = None + + else: # offset + params["offset"] = \ + text.parse_int(params["offset"]) + params["limit"] + params["before"] = None + if params["offset"] >= data["total_posts"]: + return diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index ec098aa..9fa5b3f 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -51,6 +51,8 @@ class TwitterExtractor(Extractor): if not self.config("transform", True): self._transform_user = util.identity self._transform_tweet = util.identity + + self._cursor = None self._user = None self._user_obj = None self._user_cache = {} @@ -321,8 +323,17 @@ class TwitterExtractor(Extractor): "quote_count" : tget("quote_count"), "reply_count" : tget("reply_count"), "retweet_count" : tget("retweet_count"), + "bookmark_count": tget("bookmark_count"), } + if "views" in tweet: + try: + tdata["view_count"] = int(tweet["views"]["count"]) + except Exception: + tdata["view_count"] = 0 + else: + tdata["view_count"] = 0 + if "note_tweet" in tweet: note = tweet["note_tweet"]["note_tweet_results"]["result"] content = note["text"] @@ -492,6 +503,14 @@ class TwitterExtractor(Extractor): }, } + def _init_cursor(self): + return self.config("cursor") or None + + def _update_cursor(self, cursor): + self.log.debug("Cursor: %s", cursor) + self._cursor = cursor + return cursor + def metadata(self): """Return general metadata""" return {} @@ -499,6 +518,11 @@ class TwitterExtractor(Extractor): def 
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index ec098aa..9fa5b3f 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -51,6 +51,8 @@ class TwitterExtractor(Extractor):
         if not self.config("transform", True):
             self._transform_user = util.identity
             self._transform_tweet = util.identity
+
+        self._cursor = None
         self._user = None
         self._user_obj = None
         self._user_cache = {}
@@ -321,8 +323,17 @@ class TwitterExtractor(Extractor):
             "quote_count"   : tget("quote_count"),
             "reply_count"   : tget("reply_count"),
             "retweet_count" : tget("retweet_count"),
+            "bookmark_count": tget("bookmark_count"),
         }
 
+        if "views" in tweet:
+            try:
+                tdata["view_count"] = int(tweet["views"]["count"])
+            except Exception:
+                tdata["view_count"] = 0
+        else:
+            tdata["view_count"] = 0
+
         if "note_tweet" in tweet:
             note = tweet["note_tweet"]["note_tweet_results"]["result"]
             content = note["text"]
@@ -492,6 +503,14 @@ class TwitterExtractor(Extractor):
             },
         }
 
+    def _init_cursor(self):
+        return self.config("cursor") or None
+
+    def _update_cursor(self, cursor):
+        self.log.debug("Cursor: %s", cursor)
+        self._cursor = cursor
+        return cursor
+
     def metadata(self):
         """Return general metadata"""
         return {}
@@ -499,6 +518,11 @@ class TwitterExtractor(Extractor):
     def tweets(self):
         """Yield all relevant tweet objects"""
 
+    def finalize(self):
+        if self._cursor:
+            self.log.info("Use '-o cursor=%s' to continue downloading "
+                          "from the current position", self._cursor)
+
     def login(self):
         if self.cookies_check(self.cookies_names):
             return
@@ -530,6 +554,9 @@ class TwitterUserExtractor(TwitterExtractor):
     def initialize(self):
         pass
 
+    def finalize(self):
+        pass
+
     def items(self):
         base = "{}/{}/".format(self.root, self.user)
         return self._dispatch_extractors((
@@ -549,30 +576,73 @@ class TwitterTimelineExtractor(TwitterExtractor):
     pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/timeline(?!\w)"
     example = "https://x.com/USER/timeline"
 
+    def _init_cursor(self):
+        if self._cursor:
+            return self._cursor.partition("/")[2] or None
+        return None
+
+    def _update_cursor(self, cursor):
+        if cursor:
+            self._cursor = self._cursor_prefix + cursor
+            self.log.debug("Cursor: %s", self._cursor)
+        else:
+            self._cursor = None
+        return cursor
+
     def tweets(self):
-        # yield initial batch of (media) tweets
-        tweet = None
-        for tweet in self._select_tweet_source()(self.user):
-            yield tweet
-        if tweet is None:
-            return
+        self._cursor = cursor = self.config("cursor") or None
+        reset = False
 
-        # build search query
-        query = "from:{} max_id:{}".format(
-            self._user["name"], tweet["rest_id"])
-        if self.retweets:
-            query += " include:retweets include:nativeretweets"
+        if cursor:
+            state = cursor.partition("/")[0]
+            state, _, tweet_id = state.partition("_")
+            state = text.parse_int(state, 1)
+        else:
+            state = 1
+
+        if state <= 1:
+            self._cursor_prefix = "1/"
 
-        if not self.textonly:
-            # try to search for media-only tweets
+            # yield initial batch of (media) tweets
             tweet = None
-            for tweet in self.api.search_timeline(query + " filter:links"):
+            for tweet in self._select_tweet_source()(self.user):
                 yield tweet
-            if tweet is not None:
+            if tweet is None and not cursor:
                 return
+            tweet_id = tweet["rest_id"]
+
+            state = reset = 2
+        else:
+            self.api._user_id_by_screen_name(self.user)
+
+        # build search query
+        query = "from:{} max_id:{}".format(self._user["name"], tweet_id)
+        if self.retweets:
+            query += " include:retweets include:nativeretweets"
 
-        # yield unfiltered search results
-        yield from self.api.search_timeline(query)
+        if state <= 2:
+            self._cursor_prefix = "2_{}/".format(tweet_id)
+            if reset:
+                self._cursor = self._cursor_prefix
+
+            if not self.textonly:
+                # try to search for media-only tweets
+                tweet = None
+                for tweet in self.api.search_timeline(query + " filter:links"):
+                    yield tweet
+                if tweet is not None:
+                    return self._update_cursor(None)
+
+            state = reset = 3
+
+        if state <= 3:
+            # yield unfiltered search results
+            self._cursor_prefix = "3_{}/".format(tweet_id)
+            if reset:
+                self._cursor = self._cursor_prefix
+
+            yield from self.api.search_timeline(query)
+            return self._update_cursor(None)
 
     def _select_tweet_source(self):
         strategy = self.config("strategy")
@@ -854,6 +924,24 @@ class TwitterQuotesExtractor(TwitterExtractor):
         yield Message.Queue, url, data
 
 
+class TwitterInfoExtractor(TwitterExtractor):
+    """Extractor for a user's profile data"""
+    subcategory = "info"
+    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/info"
+    example = "https://x.com/USER/info"
+
+    def items(self):
+        api = TwitterAPI(self)
+
+        screen_name = self.user
+        if screen_name.startswith("id:"):
+            user = api.user_by_rest_id(screen_name[3:])
+        else:
+            user = api.user_by_screen_name(screen_name)
+
+        return iter(((Message.Directory, self._transform_user(user)),))
+
+
 class TwitterAvatarExtractor(TwitterExtractor):
     subcategory = "avatar"
     filename_fmt = "avatar {date}.{extension}"
@@ -1388,7 +1476,11 @@ class TwitterAPI():
                 "%s %s (%s)", response.status_code, response.reason, errors)
 
     def _pagination_legacy(self, endpoint, params):
-        original_retweets = (self.extractor.retweets == "original")
+        extr = self.extractor
+        cursor = extr._init_cursor()
+        if cursor:
+            params["cursor"] = cursor
+        original_retweets = (extr.retweets == "original")
         bottom = ("cursor-bottom-", "sq-cursor-bottom")
 
         while True:
@@ -1396,7 +1488,7 @@ class TwitterAPI():
 
             instructions = data["timeline"]["instructions"]
             if not instructions:
-                return
+                return extr._update_cursor(None)
 
             tweets = data["globalObjects"]["tweets"]
             users = data["globalObjects"]["users"]
@@ -1477,8 +1569,8 @@ class TwitterAPI():
 
             # stop on empty response
             if not cursor or (not tweets and not tweet_id):
-                return
-            params["cursor"] = cursor
+                return extr._update_cursor(None)
+            params["cursor"] = extr._update_cursor(cursor)
 
     def _pagination_tweets(self, endpoint, variables,
                            path=None, stop_tweets=True, features=None):
@@ -1487,6 +1579,9 @@ class TwitterAPI():
         pinned_tweet = extr.pinned
 
         params = {"variables": None}
+        cursor = extr._init_cursor()
+        if cursor:
+            variables["cursor"] = cursor
         if features is None:
             features = self.features_pagination
         if features:
@@ -1523,7 +1618,7 @@ class TwitterAPI():
                             cursor = entry["content"]["value"]
                 if entries is None:
                     if not cursor:
-                        return
+                        return extr._update_cursor(None)
                     entries = ()
 
             except LookupError:
@@ -1672,12 +1767,16 @@ class TwitterAPI():
                     continue
 
             if stop_tweets and not tweet:
-                return
+                return extr._update_cursor(None)
             if not cursor or cursor == variables.get("cursor"):
-                return
-            variables["cursor"] = cursor
+                return extr._update_cursor(None)
+            variables["cursor"] = extr._update_cursor(cursor)
 
     def _pagination_users(self, endpoint, variables, path=None):
+        extr = self.extractor
+        cursor = extr._init_cursor()
+        if cursor:
+            variables["cursor"] = cursor
         params = {
             "variables": None,
             "features" : self._json_dumps(self.features_pagination),
@@ -1697,7 +1796,7 @@ class TwitterAPI():
                     data = data[key]
                 instructions = data["instructions"]
             except KeyError:
-                return
+                return extr._update_cursor(None)
 
             for instr in instructions:
                 if instr["type"] == "TimelineAddEntries":
@@ -1715,8 +1814,8 @@ class TwitterAPI():
                     cursor = entry["content"]["value"]
 
             if not cursor or cursor.startswith(("-1|", "0|")) or not entry:
-                return
-            variables["cursor"] = cursor
+                return extr._update_cursor(None)
+            variables["cursor"] = extr._update_cursor(cursor)
 
     def _handle_ratelimit(self, response):
         rl = self.extractor.config("ratelimit")
@@ -1864,7 +1963,7 @@ def _login_impl(extr, username, password):
                 },
             }
         elif subtask == "LoginEnterAlternateIdentifierSubtask":
-            alt = extr.config("username_alt") or extr.input(
+            alt = extr.config("username-alt") or extr.input(
                 "Alternate Identifier (username, email, phone number): ")
             data = {
                 "enter_text": {
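Twitter extractors now remember their last pagination cursor; an interrupted run logs it via `finalize()`, and passing it back with `-o cursor=…` resumes from that position. The timeline extractor additionally prefixes the cursor with a phase marker (`1/`, `2_<tweet-id>/`, `3_<tweet-id>/`) so resumption re-enters the right stage of its three-step strategy. A minimal standalone sketch of the save/restore round trip, outside any real extractor (the cursor string is made up):

```python
class CursorState:
    def __init__(self, cursor=None):
        self._cursor = cursor            # value given via '-o cursor=...'

    def _init_cursor(self):
        return self._cursor or None      # resume point, if any

    def _update_cursor(self, cursor):
        self._cursor = cursor            # None once pagination finished
        return cursor

    def finalize(self):
        if self._cursor:
            print("Use '-o cursor=%s' to continue" % self._cursor)

state = CursorState()
state._update_cursor("2_1234567890/DAABCgABXYZ")  # made-up cursor value
state.finalize()
```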
subcategory = "avatar" filename_fmt = "avatar {date}.{extension}" @@ -1388,7 +1476,11 @@ class TwitterAPI(): "%s %s (%s)", response.status_code, response.reason, errors) def _pagination_legacy(self, endpoint, params): - original_retweets = (self.extractor.retweets == "original") + extr = self.extractor + cursor = extr._init_cursor() + if cursor: + params["cursor"] = cursor + original_retweets = (extr.retweets == "original") bottom = ("cursor-bottom-", "sq-cursor-bottom") while True: @@ -1396,7 +1488,7 @@ class TwitterAPI(): instructions = data["timeline"]["instructions"] if not instructions: - return + return extr._update_cursor(None) tweets = data["globalObjects"]["tweets"] users = data["globalObjects"]["users"] @@ -1477,8 +1569,8 @@ class TwitterAPI(): # stop on empty response if not cursor or (not tweets and not tweet_id): - return - params["cursor"] = cursor + return extr._update_cursor(None) + params["cursor"] = extr._update_cursor(cursor) def _pagination_tweets(self, endpoint, variables, path=None, stop_tweets=True, features=None): @@ -1487,6 +1579,9 @@ class TwitterAPI(): pinned_tweet = extr.pinned params = {"variables": None} + cursor = extr._init_cursor() + if cursor: + variables["cursor"] = cursor if features is None: features = self.features_pagination if features: @@ -1523,7 +1618,7 @@ class TwitterAPI(): cursor = entry["content"]["value"] if entries is None: if not cursor: - return + return extr._update_cursor(None) entries = () except LookupError: @@ -1672,12 +1767,16 @@ class TwitterAPI(): continue if stop_tweets and not tweet: - return + return extr._update_cursor(None) if not cursor or cursor == variables.get("cursor"): - return - variables["cursor"] = cursor + return extr._update_cursor(None) + variables["cursor"] = extr._update_cursor(cursor) def _pagination_users(self, endpoint, variables, path=None): + extr = self.extractor + cursor = extr._init_cursor() + if cursor: + variables["cursor"] = cursor params = { "variables": None, "features" : self._json_dumps(self.features_pagination), @@ -1697,7 +1796,7 @@ class TwitterAPI(): data = data[key] instructions = data["instructions"] except KeyError: - return + return extr._update_cursor(None) for instr in instructions: if instr["type"] == "TimelineAddEntries": @@ -1715,8 +1814,8 @@ class TwitterAPI(): cursor = entry["content"]["value"] if not cursor or cursor.startswith(("-1|", "0|")) or not entry: - return - variables["cursor"] = cursor + return extr._update_cursor(None) + variables["cursor"] = extr._update_cursor(cursor) def _handle_ratelimit(self, response): rl = self.extractor.config("ratelimit") @@ -1864,7 +1963,7 @@ def _login_impl(extr, username, password): }, } elif subtask == "LoginEnterAlternateIdentifierSubtask": - alt = extr.config("username_alt") or extr.input( + alt = extr.config("username-alt") or extr.input( "Alternate Identifier (username, email, phone number): ") data = { "enter_text": { diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py index 6dfb23c..5cde0d6 100644 --- a/gallery_dl/extractor/vipergirls.py +++ b/gallery_dl/extractor/vipergirls.py @@ -101,7 +101,8 @@ class VipergirlsExtractor(Extractor): class VipergirlsThreadExtractor(VipergirlsExtractor): """Extractor for vipergirls threads""" subcategory = "thread" - pattern = BASE_PATTERN + r"/threads/(\d+)(?:-[^/?#]+)?(/page\d+)?$" + pattern = (BASE_PATTERN + + r"/threads/(\d+)(?:-[^/?#]+)?(/page\d+)?(?:$|#|\?(?!p=))") example = "https://vipergirls.to/threads/12345-TITLE" def __init__(self, match): diff --git 
a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py index c112f4a..922a591 100644 --- a/gallery_dl/extractor/vsco.py +++ b/gallery_dl/extractor/vsco.py @@ -115,9 +115,28 @@ class VscoExtractor(Extractor): class VscoUserExtractor(VscoExtractor): - """Extractor for images from a user on vsco.co""" + """Extractor for a vsco user profile""" subcategory = "user" - pattern = USER_PATTERN + r"(?:/gallery|/images(?:/\d+)?)?/?(?:$|[?#])" + pattern = USER_PATTERN + r"/?$" + example = "https://vsco.co/USER" + + def initialize(self): + pass + + def items(self): + base = "{}/{}/".format(self.root, self.user) + return self._dispatch_extractors(( + (VscoAvatarExtractor , base + "avatar"), + (VscoGalleryExtractor , base + "gallery"), + (VscoSpacesExtractor , base + "spaces"), + (VscoCollectionExtractor, base + "collection"), + ), ("gallery",)) + + +class VscoGalleryExtractor(VscoExtractor): + """Extractor for a vsco user's gallery""" + subcategory = "gallery" + pattern = USER_PATTERN + r"/(?:gallery|images)" example = "https://vsco.co/USER/gallery" def images(self): diff --git a/gallery_dl/extractor/wallpapercave.py b/gallery_dl/extractor/wallpapercave.py index faf3b0d..796f3f8 100644 --- a/gallery_dl/extractor/wallpapercave.py +++ b/gallery_dl/extractor/wallpapercave.py @@ -18,7 +18,7 @@ class WallpapercaveImageExtractor(Extractor): category = "wallpapercave" subcategory = "image" root = "https://wallpapercave.com" - pattern = r"(?:https?://)?(?:www\.)?wallpapercave\.com" + pattern = r"(?:https?://)?(?:www\.)?wallpapercave\.com/" example = "https://wallpapercave.com/w/wp12345" def items(self): @@ -40,3 +40,12 @@ class WallpapercaveImageExtractor(Extractor): image = text.nameext_from_url(path) yield Message.Directory, image yield Message.Url, self.root + path, image + + if path is None: + for wp in text.extract_iter( + page, 'class="wallpaper" id="wp', '</picture>'): + path = text.rextract(wp, ' src="', '"')[0] + if path: + image = text.nameext_from_url(path) + yield Message.Directory, image + yield Message.Url, self.root + path, image diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py index e91f45f..61a36d5 100644 --- a/gallery_dl/extractor/warosu.py +++ b/gallery_dl/extractor/warosu.py @@ -64,7 +64,7 @@ class WarosuThreadExtractor(Extractor): def parse(self, post): """Build post object by extracting data from an HTML post""" data = self._extract_post(post) - if "<span> File:" in post and self._extract_image(post, data): + if "<span class=fileinfo>" in post and self._extract_image(post, data): part = data["image"].rpartition("/")[2] data["tim"], _, data["extension"] = part.partition(".") data["ext"] = "." + data["extension"] @@ -83,7 +83,7 @@ class WarosuThreadExtractor(Extractor): def _extract_image(self, post, data): extr = text.extract_from(post) - data["fsize"] = extr("<span> File: ", ", ") + data["fsize"] = extr("<span class=fileinfo> File: ", ", ") data["w"] = extr("", "x") data["h"] = extr("", ", ") data["filename"] = text.unquote(extr( diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index fc61dff..126ef49 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -11,6 +11,8 @@ from .booru import BooruExtractor from ..cache import cache from .. 
import text, util, exception
+import collections
+import re
 
 BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net"
 
@@ -76,22 +78,29 @@ class ZerochanExtractor(BooruExtractor):
                 'class="breadcrumbs', '</nav>'))[2:],
             "uploader": extr('href="/user/', '"'),
             "tags"    : extr('<ul id="tags"', '</ul>'),
-            "source"  : extr('<h2>Source</h2>', '</p><h2>').rpartition(
-                ">")[2] or None,
+            "source"  : text.unescape(text.extr(
+                extr('id="source-url"', '</a>'), 'href="', '"')),
         }
 
         html = data["tags"]
         tags = data["tags"] = []
         for tag in html.split("<li class=")[1:]:
-            category = text.extr(tag, 'data-type="', '"')
+            category = text.extr(tag, '"', '"')
             name = text.extr(tag, 'data-tag="', '"')
-            tags.append(category.capitalize() + ":" + name)
+            tags.append(category.partition(" ")[0].capitalize() + ":" + name)
 
         return data
 
     def _parse_entry_api(self, entry_id):
         url = "{}/{}?json".format(self.root, entry_id)
-        item = self.request(url).json()
+        text = self.request(url).text
+        try:
+            item = util.json_loads(text)
+        except ValueError as exc:
+            if " control character " not in str(exc):
+                raise
+            text = re.sub(r"[\x00-\x1f\x7f]", "", text)
+            item = util.json_loads(text)
 
         data = {
             "id"      : item["id"],
@@ -109,6 +118,14 @@ class ZerochanExtractor(BooruExtractor):
 
         return data
 
+    def _tags(self, post, page):
+        tags = collections.defaultdict(list)
+        for tag in post["tags"]:
+            category, _, name = tag.partition(":")
+            tags[category].append(name)
+        for key, value in tags.items():
+            post["tags_" + key.lower()] = value
+
 
 class ZerochanTagExtractor(ZerochanExtractor):
     subcategory = "tag"
@@ -180,10 +197,16 @@ class ZerochanTagExtractor(ZerochanExtractor):
         static = "https://static.zerochan.net/.full."
 
         while True:
-            data = self.request(url, params=params).json()
+            response = self.request(url, params=params, allow_redirects=False)
+            if response.status_code >= 300:
+                url = text.urljoin(self.root, response.headers["location"])
+                response = self.request(url, params=params)
+            data = response.json()
+
             try:
                 posts = data["items"]
-            except ValueError:
+            except Exception:
+                self.log.debug("Server response: %s", data)
                 return
 
             if metadata:
@@ -191,13 +214,13 @@ class ZerochanTagExtractor(ZerochanExtractor):
                     post_id = post["id"]
                     post.update(self._parse_entry_html(post_id))
                     post.update(self._parse_entry_api(post_id))
+                    yield post
             else:
                 for post in posts:
                     base = static + str(post["id"])
                     post["file_url"] = base + ".jpg"
                     post["_fallback"] = (base + ".png",)
-
-            yield from posts
+                    yield post
 
             if not data.get("next"):
                 return

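The sketches below restate some of the less obvious control flow from the changes above in isolation; all helper names and fetch callables are illustrative and not part of gallery-dl.

tumblr.py: posts pagination now honors a "pagination" option with three strategies. "api" follows the _links.next href returned by the API; the other two rebuild the request parameters by timestamp ("before") or by count ("offset"). A minimal sketch of the latter two, with a hypothetical fetch(params) standing in for the authenticated API call:

    def paginate(fetch, strategy="offset", limit=50):
        # 'fetch' is a hypothetical callable returning the API response
        # dict with "posts" and "total_posts" keys
        params = {"offset": 0, "before": None, "limit": limit}
        while True:
            data = fetch(params)
            posts = data["posts"]
            yield from posts

            if strategy == "before":
                if not posts:
                    return
                # advance the window past the oldest timestamp seen,
                # as in the patch above
                timestamp = posts[-1]["timestamp"] + 1
                if params["before"] and timestamp >= params["before"]:
                    return  # no progress was made; stop
                params["before"] = timestamp
                params["offset"] = None
            else:  # "offset"
                params["offset"] += limit
                params["before"] = None
                if params["offset"] >= data["total_posts"]:
                    return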
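twitter.py: every pagination loop is now resumable. _init_cursor() seeds the first request from the "cursor" option, _update_cursor() records each value as it changes, and finalize() prints the last cursor so an interrupted run can be continued with '-o cursor=...'. The pattern in isolation (class and fetch callable are illustrative):

    class ResumablePagination:
        def __init__(self, options):
            self.options = options    # e.g. {"cursor": "..."} from -o cursor=...
            self._cursor = None

        def _init_cursor(self):
            return self.options.get("cursor") or None

        def _update_cursor(self, cursor):
            self._cursor = cursor     # remembered for finalize()
            return cursor

        def pages(self, fetch):
            # fetch(cursor) -> (items, next_cursor); stands in for an API call
            cursor = self._init_cursor()
            while True:
                items, next_cursor = fetch(cursor)
                yield from items
                if not next_cursor or next_cursor == cursor:
                    return self._update_cursor(None)
                cursor = self._update_cursor(next_cursor)

        def finalize(self):
            if self._cursor:
                print("Use '-o cursor=%s' to continue" % self._cursor)

Note that the loops above clear the cursor on normal completion (_update_cursor(None)), so the hint is only printed when a run actually stopped early.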
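twitter.py: TwitterTimelineExtractor runs up to three phases (timeline source, media-only search, unfiltered search), so its cursor embeds the phase number and the max_id tweet in front of the underlying API cursor, separated by a slash, e.g. "2_1234567890/DAABCg...". Decoding follows the same partition logic as the extractor (hypothetical helper):

    def split_timeline_cursor(cursor):
        # "2_1234567890/DAABCg..." -> (2, "1234567890", "DAABCg...")
        state, _, api_cursor = cursor.partition("/")
        state, _, tweet_id = state.partition("_")
        try:
            phase = int(state)
        except ValueError:
            phase = 1  # same fallback as text.parse_int(state, 1)
        return phase, tweet_id, api_cursor or None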
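vipergirls.py: the thread pattern previously required the URL to end right after the thread id or page; it now also accepts a fragment or any query string except "?p=", which addresses a single post and should be left to the post extractor. A quick check of the new pattern (assuming BASE_PATTERN expands to the usual host regex):

    import re

    PATTERN = re.compile(
        r"(?:https?://)?(?:www\.)?vipergirls\.to"
        r"/threads/(\d+)(?:-[^/?#]+)?(/page\d+)?(?:$|#|\?(?!p=))")

    assert PATTERN.match("https://vipergirls.to/threads/12345-TITLE")
    assert PATTERN.match("https://vipergirls.to/threads/12345?s=abcdef")
    assert PATTERN.match("https://vipergirls.to/threads/12345#post42")
    assert not PATTERN.match("https://vipergirls.to/threads/12345?p=999")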
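vsco.py: VscoUserExtractor no longer scrapes anything itself; like its Twitter counterpart it hands the profile off to per-section extractors via _dispatch_extractors, with "gallery" as the default include. Roughly, the helper selects which section URLs to queue (a simplified sketch, not the actual implementation):

    def dispatch(config_include, extractor_data, default=("gallery",)):
        # select which per-section extractors to queue, based on the
        # user's "include" option; 'extractor_data' pairs classes with URLs
        include = config_include or default
        if isinstance(include, str):
            include = include.replace(" ", "").split(",")
        return [(url, cls) for cls, url in extractor_data
                if cls.subcategory in include]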
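zerochan.py: the "?json" endpoint can return raw ASCII control characters inside strings, which strict JSON parsers reject; the patch strips them and parses again. The same recovery with only the standard library:

    import json
    import re

    _CONTROL = re.compile(r"[\x00-\x1f\x7f]")

    def json_loads_lenient(text):
        try:
            return json.loads(text)
        except ValueError as exc:
            # json reports "Invalid control character at: ..." for raw \n, \t, ...
            if "control character" not in str(exc):
                raise
            return json.loads(_CONTROL.sub("", text))

    # embedded real newline -> {'source': 'line oneline two'}
    json_loads_lenient('{"source": "line one\nline two"}')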
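zerochan.py: tag pages can redirect (for example when a tag was renamed), and following the redirect automatically would lose the query parameters, so the tag extractor now re-issues the request against the redirect target itself. The same idea in isolation, sketched with requests:

    import requests

    def get_with_params(url, params):
        response = requests.get(url, params=params, allow_redirects=False)
        if response.status_code >= 300:
            # re-apply the query parameters to the redirect target
            url = requests.compat.urljoin(url, response.headers["location"])
            response = requests.get(url, params=params)
        return response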