Diffstat (limited to 'gallery_dl/extractor')
23 files changed, 363 insertions, 362 deletions
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 591e6a8..6aff1f3 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -62,6 +62,7 @@ modules = [
     "hentaifox",
     "hentaihand",
     "hentaihere",
+    "hentainexus",
     "hiperdex",
     "hitomi",
     "hotleak",
@@ -113,7 +114,6 @@ modules = [
     "paheal",
     "patreon",
     "philomena",
-    "photobucket",
     "photovogue",
     "picarto",
     "piczel",
diff --git a/gallery_dl/extractor/cien.py b/gallery_dl/extractor/cien.py
deleted file mode 100644
index a9ccab5..0000000
--- a/gallery_dl/extractor/cien.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2024 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://ci-en.net/"""
-
-from .common import Extractor, Message
-from .. import text, util
-
-BASE_PATTERN = r"(?:https?://)?ci-en\.(?:net|dlsite\.com)"
-
-
-class CienExtractor(Extractor):
-    category = "cien"
-    root = "https://ci-en.net"
-
-    def __init__(self, match):
-        self.root = text.root_from_url(match.group(0))
-        Extractor.__init__(self, match)
-
-    def _pagination_articles(self, url, params):
-        data = {"extractor": CienArticleExtractor}
-        params["page"] = text.parse_int(params.get("page"), 1)
-
-        while True:
-            page = self.request(url, params=params).text
-
-            for card in text.extract_iter(
-                    page, ' class="c-cardCase-item', '</div>'):
-                article_url = text.extr(card, ' href="', '"')
-                yield Message.Queue, article_url, data
-
-            if ' rel="next"' not in page:
-                return
-            params["page"] += 1
-
-
-class CienArticleExtractor(CienExtractor):
-    subcategory = "article"
-    pattern = BASE_PATTERN + r"/creator/(\d+)/article/(\d+)"
-    example = "https://ci-en.net/creator/123/article/12345"
-
-    def items(self):
-        url = "{}/creator/{}/article/{}".format(
-            self.root, self.groups[0], self.groups[1])
-        page = self.request(url, notfound="article").text
-        return
-        yield 1
-
-
-class CienCreatorExtractor(CienExtractor):
-    subcategory = "creator"
-    pattern = BASE_PATTERN + r"/creator/(\d+)(?:/article(?:\?([^#]+))?)?/?$"
-    example = "https://ci-en.net/creator/123"
-
-    def items(self):
-        url = "{}/creator/{}/article".format(self.root, self.groups[0])
-        params = text.parse_query(self.groups[1])
-        params["mode"] = "list"
-        return self._pagination_articles(url, params)
-
-
-class CienRecentExtractor(CienExtractor):
-    subcategory = "recent"
-    pattern = BASE_PATTERN + r"/mypage/recent(?:\?([^#]+))?"
-    example = "https://ci-en.net/mypage/recent"
-
-    def items(self):
-        url = self.root + "/mypage/recent"
-        params = text.parse_query(self.groups[0])
-        return self._pagination_articles(url, params)
-
-
-class CienFollowingExtractor(CienExtractor):
-    subcategory = "following"
-    pattern = BASE_PATTERN + r"/mypage/subscription(/following)?"
-    example = "https://ci-en.net/mypage/subscription"
-
-    def items(self):
-        url = self.root + "/mypage/recent"
-        params = text.parse_query(self.groups[0])
-        return self._pagination_articles(url, params)
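Each entry in the 'modules' list above is just a module name; gallery-dl imports the matching file under gallery_dl/extractor/ on demand and registers every extractor class it finds, which is why adding "hentainexus" and dropping "photobucket" is all the registry bookkeeping this commit needs. A minimal sketch of that lookup, using a hypothetical importlib-based helper rather than the project's actual loader:

    import importlib

    def iter_extractor_classes(modules, package="gallery_dl.extractor"):
        # lazily import each listed submodule and yield everything that
        # looks like an extractor class (defines a non-empty 'pattern')
        for name in modules:
            module = importlib.import_module("." + name, package)
            for obj in vars(module).values():
                if isinstance(obj, type) and getattr(obj, "pattern", None):
                    yield obj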
- example = "https://ci-en.net/mypage/subscription" - - def items(self): - url = self.root + "/mypage/recent" - params = text.parse_query(self.groups[0]) - return self._pagination_articles(url, params) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 8771261..d7a41bc 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -11,6 +11,7 @@ import os import re import ssl +import sys import time import netrc import queue @@ -42,6 +43,7 @@ class Extractor(): browser = None request_interval = 0.0 request_interval_min = 0.0 + request_interval_429 = 60.0 request_timestamp = 0.0 def __init__(self, match): @@ -202,7 +204,9 @@ class Extractor(): self.log.warning("Cloudflare CAPTCHA") break - if code == 429 and self._interval_429: + if code == 429 and self._handle_429(response): + continue + elif code == 429 and self._interval_429: pass elif code not in retry_codes and code < 500: break @@ -230,6 +234,8 @@ class Extractor(): raise exception.HttpError(msg, response) + _handle_429 = util.false + def wait(self, seconds=None, until=None, adjust=1.0, reason="rate limit"): now = time.time() @@ -263,6 +269,8 @@ class Extractor(): time.sleep(seconds) def input(self, prompt, echo=True): + self._check_input_allowed(prompt) + if echo: try: return input(prompt) @@ -271,13 +279,30 @@ class Extractor(): else: return getpass.getpass(prompt) + def _check_input_allowed(self, prompt=""): + input = self.config("input") + + if input is None: + try: + input = sys.stdin.isatty() + except Exception: + input = False + + if not input: + raise exception.StopExtraction( + "User input required (%s)", prompt.strip(" :")) + def _get_auth_info(self): """Return authentication information as (username, password) tuple""" username = self.config("username") password = None if username: - password = self.config("password") or util.LazyPrompt() + password = self.config("password") + if not password: + self._check_input_allowed("password") + password = util.LazyPrompt() + elif self.config("netrc", False): try: info = netrc.netrc().authenticators(self.category) @@ -304,7 +329,7 @@ class Extractor(): self.request_interval_min, ) self._interval_429 = util.build_duration_func( - self.config("sleep-429", 60), + self.config("sleep-429", self.request_interval_429), ) if self._retries < 0: @@ -837,7 +862,7 @@ def _build_requests_adapter(ssl_options, ssl_ciphers, source_address): if ssl_options or ssl_ciphers: ssl_context = urllib3.connection.create_urllib3_context( options=ssl_options or None, ciphers=ssl_ciphers) - if requests.__version__ > "2.31": + if not requests.__version__ < "2.32": # https://github.com/psf/requests/pull/6731 ssl_context.load_default_certs() ssl_context.check_hostname = False diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 993885a..2199cc8 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -1730,15 +1730,16 @@ class DeviantartEclipseAPI(): url = "{}/{}/about".format(self.extractor.root, user) page = self.request(url).text - gruserid, pos = text.extract(page, ' data-userid="', '"') + gruser_id = text.extr(page, ' data-userid="', '"') - pos = page.find('\\"type\\":\\"watching\\"', pos) + pos = page.find('\\"name\\":\\"watching\\"') if pos < 0: - raise exception.NotFoundError("module") - moduleid = text.rextract(page, '\\"id\\":', ',', pos)[0].strip('" ') + raise exception.NotFoundError("'watching' module ID") + module_id = text.rextract( + page, '\\"id\\":', ',', pos)[0].strip('" ') 
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 993885a..2199cc8 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -1730,15 +1730,16 @@ class DeviantartEclipseAPI():
         url = "{}/{}/about".format(self.extractor.root, user)
         page = self.request(url).text
 
-        gruserid, pos = text.extract(page, ' data-userid="', '"')
+        gruser_id = text.extr(page, ' data-userid="', '"')
 
-        pos = page.find('\\"type\\":\\"watching\\"', pos)
+        pos = page.find('\\"name\\":\\"watching\\"')
         if pos < 0:
-            raise exception.NotFoundError("module")
-        moduleid = text.rextract(page, '\\"id\\":', ',', pos)[0].strip('" ')
+            raise exception.NotFoundError("'watching' module ID")
+        module_id = text.rextract(
+            page, '\\"id\\":', ',', pos)[0].strip('" ')
 
         self._fetch_csrf_token(page)
-        return gruserid, moduleid
+        return gruser_id, module_id
 
     def _fetch_csrf_token(self, page=None):
         if page is None:
diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py
index 2223403..d81fd0b 100644
--- a/gallery_dl/extractor/fanbox.py
+++ b/gallery_dl/extractor/fanbox.py
@@ -113,7 +113,17 @@ class FanboxExtractor(Extractor):
             post["user"] = self._get_user_data(post["creatorId"])
         if self._meta_plan:
             plans = self._get_plan_data(post["creatorId"])
-            post["plan"] = plans[post["feeRequired"]]
+            fee = post["feeRequired"]
+            try:
+                post["plan"] = plans[fee]
+            except KeyError:
+                fees = [f for f in plans if f >= fee]
+                if fees:
+                    plan = plans[min(fees)]
+                else:
+                    plan = plans[0].copy()
+                    plan["fee"] = fee
+                post["plan"] = plans[fee] = plan
 
         return content_body, post
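The fanbox change above resolves a post's feeRequired against the creator's plans: an exact match wins, otherwise the cheapest plan whose fee still covers the post is used, and if no plan is expensive enough, a copy of the free plan (key 0, per the code) is stamped with the actual fee; either way the result is cached back into the plans dict. A worked example with hypothetical plan data:

    plans = {0: {"fee": 0}, 500: {"fee": 500}, 1000: {"fee": 1000}}

    fee = 800                            # no plan with this exact fee
    fees = [f for f in plans if f >= fee]
    plan = plans[min(fees)]              # -> the 1000-yen plan (cheapest >= 800)

    fee = 3000                           # more than any existing plan
    fees = [f for f in plans if f >= fee]
    if not fees:
        plan = plans[0].copy()           # copy the free plan ...
        plan["fee"] = fee                # ... but record the real fee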
diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py
new file mode 100644
index 0000000..97b7844
--- /dev/null
+++ b/gallery_dl/extractor/hentainexus.py
@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019-2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hentainexus.com/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text, util
+import binascii
+
+
+class HentainexusGalleryExtractor(GalleryExtractor):
+    """Extractor for hentainexus galleries"""
+    category = "hentainexus"
+    root = "https://hentainexus.com"
+    pattern = (r"(?i)(?:https?://)?(?:www\.)?hentainexus\.com"
+               r"/(?:view|read)/(\d+)")
+    example = "https://hentainexus.com/view/12345"
+
+    def __init__(self, match):
+        self.gallery_id = match.group(1)
+        url = "{}/view/{}".format(self.root, self.gallery_id)
+        GalleryExtractor.__init__(self, match, url)
+
+    def metadata(self, page):
+        rmve = text.remove_html
+        extr = text.extract_from(page)
+        data = {
+            "gallery_id": text.parse_int(self.gallery_id),
+            "cover"     : extr('"og:image" content="', '"'),
+            "title"     : extr('<h1 class="title">', '</h1>'),
+        }
+
+        for key in ("Artist", "Book", "Circle", "Event", "Language",
+                    "Magazine", "Parody", "Publisher", "Description"):
+            value = rmve(extr('viewcolumn">' + key + '</td>', '</td>'))
+            value, sep, rest = value.rpartition(" (")
+            data[key.lower()] = value if sep else rest
+
+        data["tags"] = tags = []
+        for k in text.extract_iter(page, '<a href="/?q=tag:', '"'):
+            tags.append(text.unquote(k).strip('"').replace("+", " "))
+
+        if not data["language"]:
+            data["language"] = "English"
+        data["lang"] = util.language_to_code(data["language"])
+
+        if "doujin" in data["tags"]:
+            data["type"] = "Doujinshi"
+        elif "illustration" in data["tags"]:
+            data["type"] = "Illustration"
+        else:
+            data["type"] = "Manga"
+        data["title_conventional"] = self._join_title(data)
+        return data
+
+    def images(self, _):
+        url = "{}/read/{}".format(self.root, self.gallery_id)
+        page = self.request(url).text
+        imgs = util.json_loads(self._decode(text.extr(
+            page, 'initReader("', '"')))
+
+        headers = None
+        if not self.config("original", True):
+            headers = {"Accept": "image/webp,*/*"}
+        for img in imgs:
+            img["_http_headers"] = headers
+
+        return [
+            (img["image"], img)
+            for img in imgs
+        ]
+
+    @staticmethod
+    def _decode(data):
+        # https://hentainexus.com/static/js/reader.min.js?r=22
+        hostname = "hentainexus.com"
+        primes = (2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53)
+        blob = list(binascii.a2b_base64(data))
+        for i in range(0, len(hostname)):
+            blob[i] = blob[i] ^ ord(hostname[i])
+
+        key = blob[0:64]
+
+        C = 0
+        for k in key:
+            C = C ^ k
+            for _ in range(8):
+                if C & 1:
+                    C = C >> 1 ^ 0xc
+                else:
+                    C = C >> 1
+        k = primes[C & 0x7]
+
+        x = 0
+        S = list(range(256))
+        for i in range(256):
+            x = (x + S[i] + key[i % len(key)]) % 256
+            S[i], S[x] = S[x], S[i]
+
+        result = ""
+        a = c = m = x = 0
+        for n in range(64, len(blob)):
+            a = (a + k) % 256
+            x = (c + S[(x + S[a]) % 256]) % 256
+            c = (c + a + S[a]) % 256
+
+            S[a], S[x] = S[x], S[a]
+            m = S[(x + S[(a + S[(m + c) % 256]) % 256]) % 256]
+            result += chr(blob[n] ^ m)
+
+        return result
+
+    @staticmethod
+    def _join_title(data):
+        event    = data['event']
+        artist   = data['artist']
+        circle   = data['circle']
+        title    = data['title']
+        parody   = data['parody']
+        book     = data['book']
+        magazine = data['magazine']
+
+        # a few galleries have a large number of artists or parodies,
+        # which get replaced with "Various" in the title string
+        if artist.count(',') >= 3:
+            artist = 'Various'
+        if parody.count(',') >= 3:
+            parody = 'Various'
+
+        jt = ''
+        if event:
+            jt += '({}) '.format(event)
+        if circle:
+            jt += '[{} ({})] '.format(circle, artist)
+        else:
+            jt += '[{}] '.format(artist)
+        jt += title
+        if parody.lower() != 'original work':
+            jt += ' ({})'.format(parody)
+        if book:
+            jt += ' ({})'.format(book)
+        if magazine:
+            jt += ' ({})'.format(magazine)
+        return jt
+
+
+class HentainexusSearchExtractor(Extractor):
+    """Extractor for hentainexus search results"""
+    category = "hentainexus"
+    subcategory = "search"
+    root = "https://hentainexus.com"
+    pattern = (r"(?i)(?:https?://)?(?:www\.)?hentainexus\.com"
+               r"(?:/page/\d+)?/?(?:\?(q=[^/?#]+))?$")
+    example = "https://hentainexus.com/?q=QUERY"
+
+    def items(self):
+        params = text.parse_query(self.groups[0])
+        data = {"_extractor": HentainexusGalleryExtractor}
+        path = "/"
+
+        while path:
+            page = self.request(self.root + path, params=params).text
+            extr = text.extract_from(page)
+
+            while True:
+                gallery_id = extr('<a href="/view/', '"')
+                if not gallery_id:
+                    break
+                yield Message.Queue, self.root + "/view/" + gallery_id, data
+
+            path = extr('class="pagination-next" href="', '"')
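_decode above undoes the reader's obfuscation in three stages: XOR the base64-decoded blob against the hostname, treat the first 64 bytes as an RC4-style key (also deriving the CRC-like step constant k from it), then run the modified keystream over the rest to recover the JSON image list. A standalone sketch of just the first stage, assuming 'data' is the string captured from initReader("..."):

    import binascii

    def unmask(data, hostname="hentainexus.com"):
        blob = list(binascii.a2b_base64(data))
        for i, ch in enumerate(hostname):
            blob[i] ^= ord(ch)          # undo the hostname XOR mask
        return blob[:64], blob[64:]     # (key bytes, masked JSON payload)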
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index 88f5708..9b74700 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -58,6 +58,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
         return {
             "gallery_id": text.parse_int(info["id"]),
             "title"     : info["title"],
+            "title_jpn" : info.get("japanese_title") or "",
             "type"      : info["type"].capitalize(),
             "language"  : language,
             "lang"      : util.language_to_code(language),
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 9c2b1de..f7a5cc7 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -168,6 +168,7 @@ class InstagramExtractor(Extractor):
             "likes": post.get("like_count", 0),
             "pinned": post.get("timeline_pinned_user_ids", ()),
             "date": text.parse_timestamp(post.get("taken_at")),
+            "liked": post.get("has_liked", False),
         }
 
         caption = post["caption"]
@@ -270,6 +271,7 @@ class InstagramExtractor(Extractor):
             "typename" : typename,
             "date"     : text.parse_timestamp(post["taken_at_timestamp"]),
             "likes"    : post["edge_media_preview_like"]["count"],
+            "liked"    : post.get("viewer_has_liked", False),
             "pinned"   : pinned,
             "owner_id" : owner["id"],
             "username" : owner.get("username"),
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index b0c24de..6f2d5f3 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -518,7 +518,8 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor):
             if not sort:
                 sort = "updated"
 
-            users.sort(key=lambda x: x[sort], reverse=(order == "desc"))
+            users.sort(key=lambda x: x[sort] or util.NONE,
+                       reverse=(order == "desc"))
 
             for user in users:
                 user["_extractor"] = KemonopartyUserExtractor
@@ -532,7 +533,8 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor):
             if not sort:
                 sort = "faved_seq"
 
-            posts.sort(key=lambda x: x[sort], reverse=(order == "desc"))
+            posts.sort(key=lambda x: x[sort] or util.NONE,
+                       reverse=(order == "desc"))
 
             for post in posts:
                 post["_extractor"] = KemonopartyPostExtractor
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 7ac3a3a..ecd6619 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -12,6 +12,7 @@ from .common import Extractor, Message
 from .. import text, util, exception
 from ..cache import cache
 import itertools
+import re
 
 
 class NewgroundsExtractor(Extractor):
@@ -33,10 +34,16 @@ class NewgroundsExtractor(Extractor):
     def _init(self):
         self.flash = self.config("flash", True)
 
-        fmt = self.config("format", "original")
-        self.format = (True if not fmt or fmt == "original" else
-                       fmt if isinstance(fmt, int) else
-                       text.parse_int(fmt.rstrip("p")))
+        fmt = self.config("format")
+        if not fmt or fmt == "original":
+            self.format = ("mp4", "webm", "m4v", "mov", "mkv",
+                           1080, 720, 360)
+        elif isinstance(fmt, (list, tuple)):
+            self.format = fmt
+        else:
+            self._video_formats = self._video_formats_limit
+            self.format = (fmt if isinstance(fmt, int) else
+                           text.parse_int(fmt.rstrip("p")))
 
     def items(self):
         self.login()
@@ -266,7 +273,7 @@ class NewgroundsExtractor(Extractor):
         if src:
             src = src.replace("\\/", "/")
-            fallback = ()
+            formats = ()
             date = text.parse_datetime(extr(
                 'itemprop="datePublished" content="', '"'))
         else:
@@ -276,23 +283,8 @@ class NewgroundsExtractor(Extractor):
                 "X-Requested-With": "XMLHttpRequest",
             }
             sources = self.request(url, headers=headers).json()["sources"]
-
-            if self.format is True:
-                src = sources["360p"][0]["src"].replace(".360p.", ".")
-                formats = sources
-            else:
-                formats = []
-                for fmt, src in sources.items():
-                    width = text.parse_int(fmt.rstrip("p"))
-                    if width <= self.format:
-                        formats.append((width, src))
-                if formats:
-                    formats.sort(reverse=True)
-                    src, formats = formats[0][1][0]["src"], formats[1:]
-                else:
-                    src = ""
-
-            fallback = self._video_fallback(formats)
+            formats = self._video_formats(sources)
+            src = next(formats, "")
             date = text.parse_timestamp(src.rpartition("?")[2])
 
         return {
@@ -306,15 +298,33 @@ class NewgroundsExtractor(Extractor):
             "rating"    : extr('class="rated-', '"'),
             "index"     : text.parse_int(index),
             "_index"    : index,
-            "_fallback" : fallback,
+            "_fallback" : formats,
         }
 
-    @staticmethod
-    def _video_fallback(formats):
-        if isinstance(formats, dict):
-            formats = list(formats.items())
-        formats.sort(key=lambda fmt: text.parse_int(fmt[0].rstrip("p")),
-                     reverse=True)
+    def _video_formats(self, sources):
+        src = sources["360p"][0]["src"]
+        sub = re.compile(r"\.360p\.\w+").sub
+
+        for fmt in self.format:
+            try:
+                if isinstance(fmt, int):
+                    yield sources[str(fmt) + "p"][0]["src"]
+                elif fmt in sources:
+                    yield sources[fmt][0]["src"]
+                else:
+                    yield sub("." + fmt, src, 1)
+            except Exception as exc:
+                self.log.debug("Video format '%s' not available (%s: %s)",
+                               fmt, exc.__class__.__name__, exc)
+
+    def _video_formats_limit(self, sources):
+        formats = []
+        for fmt, src in sources.items():
+            width = text.parse_int(fmt.rstrip("p"))
+            if width <= self.format:
+                formats.append((width, src))
+
+        formats.sort(reverse=True)
         for fmt in formats:
             yield fmt[1][0]["src"]
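After the newgrounds rewrite above, 'format' is an ordered preference list rather than a single cap: integers are looked up as height keys ("720p"), known string keys are used directly, and any other string is substituted into the 360p URL to guess at that container's original-quality variant; failures are logged and the next candidate is tried, while a scalar config value is rerouted to _video_formats_limit, which keeps the old best-height-below-N behavior. Roughly, with hypothetical source data:

    sources = {
        "360p": [{"src": "https://example.org/video.360p.mp4?1700000000"}],
        "720p": [{"src": "https://example.org/video.720p.mp4?1700000000"}],
    }
    # with self.format = ("mp4", 1080, 720, 360):
    #   "mp4" -> no "mp4" key, so ".360p.mp4" is rewritten to ".mp4",
    #            yielding https://example.org/video.mp4?1700000000
    #   1080  -> sources["1080p"] raises KeyError: logged, skipped
    #   720   -> sources["720p"][0]["src"]
    # the first URL yielded becomes 'src'; the rest remain as '_fallback'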
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index c50c013..60cca22 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -56,7 +56,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
             data["user_id"] = data["artist_id"]
             data["user_name"] = data["artist_name"]
 
-            urls = list(self._extract_images(image_id, page))
+            urls = self._extract_images(image_id, page)
             data["count"] = len(urls)
 
             yield Message.Directory, data
@@ -113,11 +113,14 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
             # multiple images
             url = "{}/view_popup.php?id={}".format(self.root, image_id)
             page = self.request(url).text
-            yield from text.extract_iter(
-                page, 'href="javascript:void(0);"><img src="', '"')
+            return [
+                text.extr(media, ' src="', '"')
+                for media in text.extract_iter(
+                    page, 'href="javascript:void(0);"><', '>')
+            ]
         else:
             pos = page.find('id="view-center"') + 1
-            yield text.extract(page, 'itemprop="image" src="', '"', pos)[0]
+            return (text.extr(page, 'itemprop="image" src="', '"', pos),)
 
     @staticmethod
     def _extract_user_name(page):
diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py
index 2bce597..cfc8861 100644
--- a/gallery_dl/extractor/nitter.py
+++ b/gallery_dl/extractor/nitter.py
@@ -234,26 +234,6 @@ class NitterExtractor(BaseExtractor):
 
 
 BASE_PATTERN = NitterExtractor.update({
-    "nitter.net": {
-        "root": "https://nitter.net",
-        "pattern": r"nitter\.net",
-    },
-    "nitter.1d4.us": {
-        "root": "https://nitter.1d4.us",
-        "pattern": r"nitter\.1d4\.us",
-    },
-    "nitter.kavin.rocks": {
-        "root": "https://nitter.kavin.rocks",
-        "pattern": r"nitter\.kavin\.rocks",
-    },
-    "nitter.unixfox.eu": {
-        "root": "https://nitter.unixfox.eu",
-        "pattern": r"nitter\.unixfox\.eu",
-    },
-    "nitter.it": {
-        "root": "https://nitter.it",
-        "pattern": r"nitter\.it",
-    },
 })
 
 USER_PATTERN = BASE_PATTERN + r"/(i(?:/user/|d:)(\d+)|[^/?#]+)"
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index 5571575..9d025d5 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -424,7 +424,7 @@ class OAuthPixiv(OAuthBase):
             "code_challenge_method": "S256",
             "client": "pixiv-android",
         }
-        code = self.open(url, params, self._input)
+        code = self.open(url, params, self._input_code)
 
         url = "https://oauth.secure.pixiv.net/auth/token"
         headers = {
@@ -459,7 +459,7 @@ class OAuthPixiv(OAuthBase):
 
         stdout_write(self._generate_message(("refresh-token",), (token,)))
 
-    def _input(self):
+    def _input_code(self):
         stdout_write("""\
 1) Open your browser's Developer Tools (F12) and switch to the Network tab
 2) Login
@@ -471,5 +471,5 @@ class OAuthPixiv(OAuthBase):
    like the entire URL or several query parameters.
 """)
-        code = input("code: ")
+        code = self.input("code: ")
         return code.rpartition("=")[2].strip()
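The common.py and oauth.py changes in this commit funnel every prompt through Extractor.input(), which now refuses to block on stdin when interaction is impossible. The 'input' option is effectively tri-state: unset means autodetect via isatty(), and an explicit true/false overrides that. A sketch of the decision logic:

    import sys

    def input_allowed(configured):
        if configured is None:        # option unset: autodetect
            try:
                return sys.stdin.isatty()
            except Exception:
                return False
        return bool(configured)       # explicit setting wins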
""") - code = input("code: ") + code = self.input("code: ") return code.rpartition("=")[2].strip() diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py index 339646f..150efed 100644 --- a/gallery_dl/extractor/philomena.py +++ b/gallery_dl/extractor/philomena.py @@ -24,8 +24,13 @@ class PhilomenaExtractor(BooruExtractor): def _init(self): self.api = PhilomenaAPI(self) + if not self.config("svg", True): + self._file_url = operator.itemgetter("view_url") - _file_url = operator.itemgetter("view_url") + def _file_url(self, post): + if post["format"] == "svg": + return post["view_url"].rpartition(".")[0] + ".svg" + return post["view_url"] @staticmethod def _prepare(post): diff --git a/gallery_dl/extractor/photobucket.py b/gallery_dl/extractor/photobucket.py deleted file mode 100644 index a01c9fe..0000000 --- a/gallery_dl/extractor/photobucket.py +++ /dev/null @@ -1,145 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2019-2023 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://photobucket.com/""" - -from .common import Extractor, Message -from .. import text, exception -import binascii -import json - - -class PhotobucketAlbumExtractor(Extractor): - """Extractor for albums on photobucket.com""" - category = "photobucket" - subcategory = "album" - directory_fmt = ("{category}", "{username}", "{location}") - filename_fmt = "{offset:>03}{pictureId:?_//}_{titleOrFilename}.{extension}" - archive_fmt = "{id}" - pattern = (r"(?:https?://)?((?:[\w-]+\.)?photobucket\.com)" - r"/user/[^/?&#]+/library(?:/[^?&#]*)?") - example = "https://s123.photobucket.com/user/USER/library" - - def __init__(self, match): - self.root = "https://" + match.group(1) - Extractor.__init__(self, match) - - def _init(self): - self.session.headers["Referer"] = self.url - - def items(self): - for image in self.images(): - image["titleOrFilename"] = text.unescape(image["titleOrFilename"]) - image["title"] = text.unescape(image["title"]) - image["extension"] = image["ext"] - yield Message.Directory, image - yield Message.Url, image["fullsizeUrl"], image - - if self.config("subalbums", True): - for album in self.subalbums(): - album["_extractor"] = PhotobucketAlbumExtractor - yield Message.Queue, album["url"], album - - def images(self): - """Yield all images of the current album""" - url = self.url - params = {"sort": "3", "page": 1} - - while True: - page = self.request(url, params=params).text - json_data = text.extract(page, "collectionData:", ",\n")[0] - if not json_data: - msg = text.extr(page, 'libraryPrivacyBlock">', "</div>") - msg = ' ("{}")'.format(text.remove_html(msg)) if msg else "" - self.log.error("Unable to get JSON data%s", msg) - return - data = json.loads(json_data) - - yield from data["items"]["objects"] - - if data["total"] <= data["offset"] + data["pageSize"]: - self.album_path = data["currentAlbumPath"] - return - params["page"] += 1 - - def subalbums(self): - """Return all subalbum objects""" - url = self.root + "/component/Albums-SubalbumList" - params = { - "albumPath": self.album_path, - "fetchSubAlbumsOnly": "true", - "deferCollapsed": "true", - "json": "1", - } - - data = self.request(url, params=params).json() - return data["body"].get("subAlbums", ()) - - -class PhotobucketImageExtractor(Extractor): - """Extractor for individual images from photobucket.com""" - category = "photobucket" - 
subcategory = "image" - directory_fmt = ("{category}", "{username}") - filename_fmt = "{pictureId:?/_/}{titleOrFilename}.{extension}" - archive_fmt = "{username}_{id}" - pattern = (r"(?:https?://)?(?:[\w-]+\.)?photobucket\.com" - r"(?:/gallery/user/([^/?&#]+)/media/([^/?&#]+)" - r"|/user/([^/?&#]+)/media/[^?&#]+\.html)") - example = "https://s123.photobucket.com/user/USER/media/NAME.EXT.html" - - def __init__(self, match): - Extractor.__init__(self, match) - self.user = match.group(1) or match.group(3) - self.media_id = match.group(2) - - def _init(self): - self.session.headers["Referer"] = self.url - - def items(self): - url = "https://photobucket.com/galleryd/search.php" - params = {"userName": self.user, "searchTerm": "", "ref": ""} - - if self.media_id: - params["mediaId"] = self.media_id - else: - params["url"] = self.url - - # retry API call up to 5 times, since it can randomly fail - tries = 0 - while tries < 5: - data = self.request(url, method="POST", params=params).json() - image = data["mediaDocuments"] - if "message" not in image: - break # success - tries += 1 - self.log.debug(image["message"]) - else: - raise exception.StopExtraction(image["message"]) - - # adjust metadata entries to be at least somewhat similar - # to what the 'album' extractor provides - if "media" in image: - image = image["media"][image["mediaIndex"]] - image["albumView"] = data["mediaDocuments"]["albumView"] - image["username"] = image["ownerId"] - else: - image["fileUrl"] = image.pop("imageUrl") - - image.setdefault("title", "") - image.setdefault("description", "") - name, _, ext = image["fileUrl"].rpartition("/")[2].rpartition(".") - image["ext"] = image["extension"] = ext - image["titleOrFilename"] = image["title"] or name - image["tags"] = image.pop("clarifaiTagList", []) - - mtype, _, mid = binascii.a2b_base64(image["id"]).partition(b":") - image["pictureId"] = mid.decode() if mtype == b"mediaId" else "" - - yield Message.Directory, image - yield Message.Url, image["fileUrl"], image diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py index 67f38c4..a68f0db 100644 --- a/gallery_dl/extractor/shimmie2.py +++ b/gallery_dl/extractor/shimmie2.py @@ -92,6 +92,10 @@ BASE_PATTERN = Shimmie2Extractor.update({ "root": "https://rule34hentai.net", "pattern": r"rule34hentai\.net", }, + "vidyapics": { + "root": "https://vidya.pics", + "pattern": r"vidya\.pics", + }, }) + r"/(?:index\.php\?q=/?)?" diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py index 38a2d16..6ec44ba 100644 --- a/gallery_dl/extractor/skeb.py +++ b/gallery_dl/extractor/skeb.py @@ -7,7 +7,7 @@ """Extractors for https://skeb.jp/""" from .common import Extractor, Message -from .. import text, exception +from .. 
diff --git a/gallery_dl/extractor/speakerdeck.py b/gallery_dl/extractor/speakerdeck.py
index e44fdae..3210fd8 100644
--- a/gallery_dl/extractor/speakerdeck.py
+++ b/gallery_dl/extractor/speakerdeck.py
@@ -8,45 +8,35 @@
 
 """Extractors for https://speakerdeck.com/"""
 
-from .common import Extractor, Message
+from .common import GalleryExtractor
 from .. import text
+import re
 
 
-class SpeakerdeckPresentationExtractor(Extractor):
+class SpeakerdeckPresentationExtractor(GalleryExtractor):
     """Extractor for images from a presentation on speakerdeck.com"""
     category = "speakerdeck"
     subcategory = "presentation"
     directory_fmt = ("{category}", "{user}")
     filename_fmt = "{presentation}-{num:>02}.{extension}"
     archive_fmt = "{presentation}_{num}"
+    root = "https://speakerdeck.com"
     pattern = r"(?:https?://)?(?:www\.)?speakerdeck\.com/([^/?#]+)/([^/?#]+)"
     example = "https://speakerdeck.com/USER/PRESENTATION"
 
     def __init__(self, match):
-        Extractor.__init__(self, match)
+        GalleryExtractor.__init__(self, match, "")
         self.user, self.presentation = match.groups()
-        self.presentation_id = None
-
-    def items(self):
-        data = self.get_job_metadata()
-        imgs = self.get_image_urls()
-        data["count"] = len(imgs)
-        yield Message.Directory, data
-        for data["num"], url in enumerate(imgs, 1):
-            yield Message.Url, url, text.nameext_from_url(url, data)
-
-    def get_job_metadata(self):
-        """Collect metadata for extractor-job"""
-        url = "https://speakerdeck.com/oembed.json"
+
+    def metadata(self, _):
+        url = self.root + "/oembed.json"
         params = {
-            "url": "https://speakerdeck.com/" + self.user +
-                   "/" + self.presentation,
+            "url": "{}/{}/{}".format(self.root, self.user, self.presentation),
         }
-
         data = self.request(url, params=params).json()
-        self.presentation_id, pos = \
-            text.extract(data["html"], 'src="//speakerdeck.com/player/', '"')
+        self.presentation_id = text.extr(
+            data["html"], 'src="//speakerdeck.com/player/', '"')
 
         return {
             "user": self.user,
@@ -56,8 +46,10 @@ class SpeakerdeckPresentationExtractor(GalleryExtractor):
             "author": data["author_name"],
         }
 
-    def get_image_urls(self):
-        """Extract and return a list of all image-urls"""
-        page = self.request("https://speakerdeck.com/player/" +
-                            self.presentation_id).text
-        return list(text.extract_iter(page, 'js-sd-slide" data-url="', '"'))
+    def images(self, _):
+        url = "{}/player/{}".format(self.root, self.presentation_id)
+        page = re.sub(r"\s+", " ", self.request(url).text)
+        return [
+            (url, None)
+            for url in text.extract_iter(page, 'js-sd-slide" data-url="', '"')
+        ]
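The speakerdeck refactor above keeps the same two-step flow, now split across GalleryExtractor's metadata()/images() hooks: an oembed lookup yields the player ID, and the player page lists the slide images. A standalone sketch of the first request outside gallery-dl:

    import requests

    params = {"url": "https://speakerdeck.com/USER/PRESENTATION"}
    data = requests.get(
        "https://speakerdeck.com/oembed.json", params=params).json()
    # data["html"] embeds an iframe like
    #   <iframe src="//speakerdeck.com/player/<id>" ...></iframe>
    # whose ID leads to the page carrying the 'js-sd-slide' image URLs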
archive_fmt = "t_{search_tags}_{id}_{version}" - pattern = BASE_PATTERN + r"/posts/query=([^/?#]+)" + pattern = BASE_PATTERN + r"/posts(?:/query=([^/?#]*))?" example = "https://booru.foalcon.com/posts/query=TAG" def __init__(self, match): SzurubooruExtractor.__init__(self, match) - query = match.group(match.lastindex) - self.query = text.unquote(query.replace("+", " ")) + query = self.groups[-1] + self.query = text.unquote(query.replace("+", " ")) if query else "" def metadata(self): return {"search_tags": self.query} @@ -119,9 +119,5 @@ class SzurubooruPostExtractor(SzurubooruExtractor): pattern = BASE_PATTERN + r"/post/(\d+)" example = "https://booru.foalcon.com/post/12345" - def __init__(self, match): - SzurubooruExtractor.__init__(self, match) - self.post_id = match.group(match.lastindex) - def posts(self): - return (self._api_request("/post/" + self.post_id),) + return (self._api_request("/post/" + self.groups[-1]),) diff --git a/gallery_dl/extractor/tcbscans.py b/gallery_dl/extractor/tcbscans.py index de6f3ee..71431ad 100644 --- a/gallery_dl/extractor/tcbscans.py +++ b/gallery_dl/extractor/tcbscans.py @@ -4,19 +4,23 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://tcbscans.com/""" +"""Extractors for https://tcbscans.me/""" from .common import ChapterExtractor, MangaExtractor from .. import text -BASE_PATTERN = r"(?:https?://)?(?:tcbscans|onepiecechapters)\.com" +BASE_PATTERN = (r"(?:https?://)?(?:tcb(?:-backup\.bihar-mirchi|scans)" + r"|onepiecechapters)\.(?:com|me)") class TcbscansChapterExtractor(ChapterExtractor): category = "tcbscans" - root = "https://tcbscans.com" pattern = BASE_PATTERN + r"(/chapters/\d+/[^/?#]+)" - example = "https://tcbscans.com/chapters/12345/MANGA-chapter-123" + example = "https://tcbscans.me/chapters/12345/MANGA-chapter-123" + + def __init__(self, match): + self.root = text.root_from_url(match.group(0)) + ChapterExtractor.__init__(self, match) def images(self, page): return [ @@ -39,10 +43,13 @@ class TcbscansChapterExtractor(ChapterExtractor): class TcbscansMangaExtractor(MangaExtractor): category = "tcbscans" - root = "https://tcbscans.com" chapterclass = TcbscansChapterExtractor pattern = BASE_PATTERN + r"(/mangas/\d+/[^/?#]+)" - example = "https://tcbscans.com/mangas/123/MANGA" + example = "https://tcbscans.me/mangas/123/MANGA" + + def __init__(self, match): + self.root = text.root_from_url(match.group(0)) + MangaExtractor.__init__(self, match) def chapters(self, page): data = { diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py index f57f479..a725a2c 100644 --- a/gallery_dl/extractor/twibooru.py +++ b/gallery_dl/extractor/twibooru.py @@ -28,8 +28,13 @@ class TwibooruExtractor(BooruExtractor): def _init(self): self.api = TwibooruAPI(self) + if not self.config("svg", True): + self._file_url = operator.itemgetter("view_url") - _file_url = operator.itemgetter("view_url") + def _file_url(self, post): + if post["format"] == "svg": + return post["view_url"].rpartition(".")[0] + ".svg" + return post["view_url"] @staticmethod def _prepare(post): diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index ff77828..ec098aa 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -36,6 +36,7 @@ class TwitterExtractor(Extractor): self.user = match.group(1) def _init(self): + self.unavailable = self.config("unavailable", False) self.textonly = self.config("text-tweets", False) 
diff --git a/gallery_dl/extractor/vichan.py b/gallery_dl/extractor/vichan.py
index 79d7916..654c451 100644
--- a/gallery_dl/extractor/vichan.py
+++ b/gallery_dl/extractor/vichan.py
@@ -22,10 +22,6 @@ BASE_PATTERN = VichanExtractor.update({
         "root": "https://8kun.top",
         "pattern": r"8kun\.top",
     },
-    "wikieat": {
-        "root": "https://wikieat.club",
-        "pattern": r"wikieat\.club",
-    },
     "smugloli": {
         "root": None,
         "pattern": r"smuglo(?:\.li|li\.net)",
