| author | 2023-04-30 16:45:21 -0400 |
|---|---|
| committer | 2023-04-30 16:45:21 -0400 |
| commit | 33d4eae5a6df8aaf6757f52ae25f514ff1211c62 (patch) |
| tree | 7ad425b022dcc1daea1c84c720a266f0134db705 /gallery_dl |
| parent | f98ab7aaca3c4acbd5a793267791749740330e9c (diff) |
New upstream version 1.25.3 (tag: upstream/1.25.3)
Diffstat (limited to 'gallery_dl')
25 files changed, 931 insertions, 140 deletions
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index 88e86e9..4ec0398 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -44,6 +44,12 @@ class HttpDownloader(DownloaderBase):
         self.mtime = self.config("mtime", True)
         self.rate = self.config("rate")

+        if not self.config("consume-content", False):
+            # this resets the underlying TCP connection, and therefore
+            # if the program makes another request to the same domain,
+            # a new connection (either TLS or plain TCP) must be made
+            self.release_conn = lambda resp: resp.close()
+
         if self.retries < 0:
             self.retries = float("inf")
         if self.minsize:
@@ -106,7 +112,7 @@ class HttpDownloader(DownloaderBase):
         while True:
             if tries:
                 if response:
-                    response.close()
+                    self.release_conn(response)
                     response = None
                 self.log.warning("%s (%s/%s)", msg, tries, self.retries+1)
                 if tries > self.retries:
@@ -165,18 +171,24 @@ class HttpDownloader(DownloaderBase):
                 retry = kwdict.get("_http_retry")
                 if retry and retry(response):
                     continue
+                self.release_conn(response)
                 self.log.warning(msg)
                 return False

             # check for invalid responses
             validate = kwdict.get("_http_validate")
             if validate and self.validate:
-                result = validate(response)
+                try:
+                    result = validate(response)
+                except Exception:
+                    self.release_conn(response)
+                    raise
                 if isinstance(result, str):
                     url = result
                     tries -= 1
                     continue
                 if not result:
+                    self.release_conn(response)
                     self.log.warning("Invalid response")
                     return False

@@ -184,11 +196,13 @@ class HttpDownloader(DownloaderBase):
                 size = text.parse_int(size, None)
                 if size is not None:
                     if self.minsize and size < self.minsize:
+                        self.release_conn(response)
                         self.log.warning(
                             "File size smaller than allowed minimum (%s < %s)",
                             size, self.minsize)
                         return False
                     if self.maxsize and size > self.maxsize:
+                        self.release_conn(response)
                         self.log.warning(
                             "File size larger than allowed maximum (%s > %s)",
                             size, self.maxsize)
@@ -280,6 +294,18 @@ class HttpDownloader(DownloaderBase):

         return True

+    def release_conn(self, response):
+        """Release connection back to pool by consuming response body"""
+        try:
+            for _ in response.iter_content(self.chunk_size):
+                pass
+        except (RequestException, SSLError, OpenSSLError) as exc:
+            print()
+            self.log.debug(
+                "Unable to consume response body (%s: %s); "
+                "closing the connection anyway", exc.__class__.__name__, exc)
+            response.close()
+
     @staticmethod
     def receive(fp, content, bytes_total, bytes_start):
         write = fp.write
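Note: the `consume-content` option above trades bandwidth for connection reuse. `requests`/urllib3 can only return a keep-alive connection to the pool once the response body has been read, while `close()` discards the socket. A minimal sketch of the difference outside of gallery-dl (URL and chunk size are illustrative):

```python
import requests

session = requests.Session()

# close() frees the socket immediately but tears down the connection;
# the next request to the same host pays for a new TCP/TLS handshake
resp = session.get("https://example.org/big-file", stream=True)
resp.close()

# draining the body instead lets urllib3 put the connection back into
# the pool, so a follow-up request to the same host can reuse it
resp = session.get("https://example.org/big-file", stream=True)
for _ in resp.iter_content(64 * 1024):
    pass
resp = session.get("https://example.org/other-file")  # reused socket
```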
diff --git a/gallery_dl/extractor/2chen.py b/gallery_dl/extractor/2chen.py
index d9674d8..f142690 100644
--- a/gallery_dl/extractor/2chen.py
+++ b/gallery_dl/extractor/2chen.py
@@ -4,35 +4,46 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-"""Extractors for https://2chen.moe/"""
+"""Extractors for https://sturdychan.help/"""

 from .common import Extractor, Message
 from .. import text

+BASE_PATTERN = r"(?:https?://)?(?:sturdychan.help|2chen\.(?:moe|club))"
+

 class _2chenThreadExtractor(Extractor):
     """Extractor for 2chen threads"""
     category = "2chen"
     subcategory = "thread"
+    root = "https://sturdychan.help"
     directory_fmt = ("{category}", "{board}", "{thread} {title}")
     filename_fmt = "{time} {filename}.{extension}"
     archive_fmt = "{board}_{thread}_{hash}_{time}"
-    pattern = r"(?:https?://)?2chen\.(?:moe|club)/([^/?#]+)/(\d+)"
+    pattern = BASE_PATTERN + r"/([^/?#]+)/(\d+)"
     test = (
-        ("https://2chen.moe/tv/496715", {
-            "pattern": r"https://2chen\.su/assets/images/src/\w{40}\.\w+$",
+        ("https://sturdychan.help/tv/268929", {
+            "pattern": r"https://sturdychan\.help/assets/images"
+                       r"/src/\w{40}\.\w+$",
             "count": ">= 179",
+            "keyword": {
+                "board": "tv",
+                "date": "type:datetime",
+                "hash": r"re:[0-9a-f]{40}",
+                "name": "Anonymous",
+                "no": r"re:\d+",
+                "thread": "268929",
+                "time": int,
+                "title": "「/ttg/ #118: 🇧🇷 edition」",
+                "url": str,
+            },
         }),
-        ("https://2chen.club/tv/1", {
-            "count": 5,
-        }),
-        # 404
+        ("https://2chen.club/tv/1"),
         ("https://2chen.moe/jp/303786"),
     )

     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.root = text.root_from_url(match.group(0))
         self.board, self.thread = match.groups()

     def items(self):
@@ -88,9 +99,10 @@ class _2chenBoardExtractor(Extractor):
     """Extractor for 2chen boards"""
     category = "2chen"
     subcategory = "board"
-    pattern = r"(?:https?://)?2chen\.(?:moe|club)/([^/?#]+)(?:/catalog|/?$)"
+    root = "https://sturdychan.help"
+    pattern = BASE_PATTERN + r"/([^/?#]+)(?:/catalog|/?$)"
     test = (
-        ("https://2chen.moe/co/", {
+        ("https://sturdychan.help/co/", {
             "pattern": _2chenThreadExtractor.pattern
         }),
         ("https://2chen.moe/co"),
@@ -100,7 +112,6 @@ class _2chenBoardExtractor(Extractor):

     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.root = text.root_from_url(match.group(0))
         self.board = match.group(1)

     def items(self):
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 553a110..9841ca7 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -74,6 +74,7 @@ modules = [
     "instagram",
     "issuu",
     "itaku",
+    "itchio",
     "kabeuchi",
     "keenspot",
     "kemonoparty",
@@ -93,7 +94,6 @@ modules = [
     "mangapark",
     "mangasee",
     "mangoxo",
-    "mememuseum",
     "misskey",
     "myhentaigallery",
     "myportfolio",
@@ -133,6 +133,7 @@ modules = [
     "seiga",
     "senmanga",
     "sexcom",
+    "shimmie2",
     "simplyhentai",
     "skeb",
     "slickpic",
@@ -156,6 +157,7 @@ modules = [
     "urlshortener",
     "vanillarock",
     "vichan",
+    "vipergirls",
     "vk",
     "vsco",
     "wallhaven",
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
index 1469aad..d8cc51d 100644
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@@ -81,10 +81,13 @@ class BehanceGalleryExtractor(BehanceExtractor):
         ("https://www.behance.net/gallery/88276087/Audi-R8-RWD", {
             "count": 20,
             "url": "6bebff0d37f85349f9ad28bd8b76fd66627c1e2f",
+            "pattern": r"https://mir-s3-cdn-cf\.behance\.net/project_modules"
+                       r"/source/[0-9a-f]+.[0-9a-f]+\.jpg"
         }),
         # 'video' modules (#1282)
         ("https://www.behance.net/gallery/101185577/COLCCI", {
-            "pattern": r"ytdl:https://cdn-prod-ccv\.adobe\.com/",
+            "pattern": r"https://cdn-prod-ccv\.adobe\.com/\w+"
+                       r"/rend/\w+_720\.mp4\?",
             "count": 3,
         }),
     )
@@ -129,26 +132,35 @@ class BehanceGalleryExtractor(BehanceExtractor):
         append = result.append

         for module in data["modules"]:
-            mtype = module["type"]
+            mtype = module["__typename"]

-            if mtype == "image":
-                url = module["sizes"]["original"]
+            if mtype == "ImageModule":
+                url = module["imageSizes"]["size_original"]["url"]
                 append((url, module))

-            elif mtype == "video":
-                page = self.request(module["src"]).text
-                url = text.extr(page, '<source src="', '"')
-                if text.ext_from_url(url) == "m3u8":
-                    url = "ytdl:" + url
+            elif mtype == "VideoModule":
+                renditions = module["videoData"]["renditions"]
+                try:
+                    url = [
+                        r["url"] for r in renditions
+                        if text.ext_from_url(r["url"]) != "m3u8"
+                    ][-1]
+                except Exception as exc:
+                    self.log.debug("%s: %s", exc.__class__.__name__, exc)
+                    url = "ytdl:" + renditions[-1]["url"]
                 append((url, module))

-            elif mtype == "media_collection":
+            elif mtype == "MediaCollectionModule":
                 for component in module["components"]:
-                    url = component["sizes"]["source"]
-                    append((url, module))
-
-            elif mtype == "embed":
-                embed = module.get("original_embed") or module.get("embed")
+                    for size in component["imageSizes"].values():
+                        if size:
+                            parts = size["url"].split("/")
+                            parts[4] = "source"
+                            append(("/".join(parts), module))
+                            break
+
+            elif mtype == "EmbedModule":
+                embed = module.get("originalEmbed") or module.get("fluidEmbed")
                 if embed:
                     append(("ytdl:" + text.extr(embed, 'src="', '"'), module))
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index f532a97..18d9867 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -1417,7 +1417,14 @@ class DeviantartOAuthAPI():
         """Get the original file download (if allowed)"""
         endpoint = "/deviation/download/" + deviation_id
         params = {"mature_content": self.mature}
-        return self._call(endpoint, params=params, public=public)
+
+        try:
+            return self._call(
+                endpoint, params=params, public=public, log=False)
+        except Exception:
+            if not self.refresh_token_key:
+                raise
+            return self._call(endpoint, params=params, public=False)

     def deviation_metadata(self, deviations):
         """ Fetch deviation metadata for a set of deviations"""
@@ -1518,7 +1525,7 @@ class DeviantartOAuthAPI():
                 refresh_token_key, data["refresh_token"])
         return "Bearer " + data["access_token"]

-    def _call(self, endpoint, fatal=True, public=None, **kwargs):
+    def _call(self, endpoint, fatal=True, log=True, public=None, **kwargs):
         """Call an API endpoint"""
         url = "https://www.deviantart.com/api/v1/oauth2" + endpoint
         kwargs["fatal"] = None
@@ -1563,7 +1570,8 @@ class DeviantartOAuthAPI():
                     "cs/configuration.rst#extractordeviantartclient-id"
                     "--client-secret")
             else:
-                self.log.error(msg)
+                if log:
+                    self.log.error(msg)
             return data

     def _pagination(self, endpoint, params,
@@ -1571,15 +1579,14 @@ class DeviantartOAuthAPI():
         warn = True
         if public is None:
             public = self.public
-        elif not public:
-            self.public = False

         while True:
             data = self._call(endpoint, params=params, public=public)
-            if key not in data:
+            try:
+                results = data[key]
+            except KeyError:
                 self.log.error("Unexpected API response: %s", data)
                 return
-            results = data[key]

             if unpack:
                 results = [item["journal"] for item in results
@@ -1588,7 +1595,7 @@ class DeviantartOAuthAPI():
             if public and len(results) < params["limit"]:
                 if self.refresh_token_key:
                     self.log.debug("Switching to private access token")
-                    self.public = public = False
+                    public = False
                     continue
                 elif data["has_more"] and warn:
                     warn = False
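Note: the new `deviation_download` body is a quiet-first fallback — try with the current (possibly public) token and logging suppressed, and repeat with the private token only when credentials exist. The shape of that pattern in isolation (names are placeholders, not gallery-dl's API):

```python
def download_with_fallback(call, endpoint, params, refresh_token=None):
    """Hypothetical stand-in for DeviantartOAuthAPI.deviation_download()."""
    try:
        # the first attempt may legitimately fail; keep it out of the logs
        return call(endpoint, params=params, public=True, log=False)
    except Exception:
        if not refresh_token:
            raise  # nothing to fall back to -> surface the error
        return call(endpoint, params=params, public=False)
```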
("https://www.imagefap.com/gallery/7102714", { "pattern": r"https://cdnh?\.imagefap\.com" r"/images/full/\d+/\d+/\d+\.jpg", - "keyword": "2ba96e84c2952c4750e9fa94a3f2b1f965cec2f3", + "keyword": "bdcb75b1e4b9dddc718f3d66e1a58afa9d81a38b", "content": "694a0a57385980a6f90fbc296cadcd6c11ba2dab", }), ("https://www.imagefap.com/gallery/7876223", { "pattern": r"https://cdnh?\.imagefap\.com" r"/images/full/\d+/\d+/\d+\.jpg", "keyword": { + "categories": ["Asses", "Softcore", "Pornstars"], "count": 44, + "description": "", "gallery_id": 7876223, "image_id": int, "num": int, @@ -67,6 +69,21 @@ class ImagefapGalleryExtractor(ImagefapExtractor): }, "count": 44, }), + # description (#3905) + ("https://www.imagefap.com/gallery/6180555", { + "range": "1", + "keyword": { + "categories": ["Amateur", "Softcore", "Homemade"], + "count": 36, + "description": "Nude and dressed sluts showing off the goods", + "gallery_id": 6180555, + "image_id": int, + "num": int, + "tags": [] , + "title": "Dressed or Undressed MG*", + "uploader": "splitopen", + }, + }), ("https://www.imagefap.com/pictures/7102714"), ("https://www.imagefap.com/gallery.php?gid=7102714"), ("https://beta.imagefap.com/gallery.php?gid=7102714"), @@ -92,9 +109,14 @@ class ImagefapGalleryExtractor(ImagefapExtractor): data = { "gallery_id": text.parse_int(self.gid), - "tags": extr('name="keywords" content="', '"').split(", "), "uploader": extr("porn picture gallery by ", " to see hottest"), "title": text.unescape(extr("<title>", "<")), + "description": text.unescape(extr( + 'id="gdesc_text"', '<').partition(">")[2]), + "categories": text.split_html(extr( + 'id="cnt_cats"', '</div>'))[1::2], + "tags": text.split_html(extr( + 'id="cnt_tags"', '</div>'))[1::2], "count": text.parse_int(extr(' 1 of ', ' pics"')), } diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index d57ec89..df4ff26 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -135,6 +135,29 @@ class ImxtoImageExtractor(ImagehostImageExtractor): } +class ImxtoGalleryExtractor(ImagehostImageExtractor): + """Extractor for image galleries from imx.to""" + category = "imxto" + subcategory = "gallery" + pattern = r"(?:https?://)?(?:www\.)?(imx\.to/g/([^/?#]+))" + test = ("https://imx.to/g/ozdy", { + "pattern": ImxtoImageExtractor.pattern, + "keyword": {"title": "untitled gallery"}, + "count": 40, + }) + + def items(self): + page = self.request(self.page_url).text + title, pos = text.extract(page, '<div class="title', '<') + data = { + "_extractor": ImxtoImageExtractor, + "title": text.unescape(title.partition(">")[2]).strip(), + } + + for url in text.extract_iter(page, "<a href=", " ", pos): + yield Message.Queue, url.strip("\"'"), data + + class AcidimgImageExtractor(ImagehostImageExtractor): """Extractor for single images from acidimg.cc""" category = "acidimg" diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 42d0a7b..f8f1600 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2022 Mike Fährmann +# Copyright 2015-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -347,8 +347,8 @@ class ImgurAPI(): def __init__(self, extractor): self.extractor = extractor self.headers = { - "Authorization": "Client-ID " + extractor.config( - "client-id", "546c25a59c58ad7"), + "Authorization": "Client-ID " + ( + 
extractor.config("client-id") or "546c25a59c58ad7"), } def account_favorites(self, account): diff --git a/gallery_dl/extractor/itchio.py b/gallery_dl/extractor/itchio.py new file mode 100644 index 0000000..6034d12 --- /dev/null +++ b/gallery_dl/extractor/itchio.py @@ -0,0 +1,82 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://itch.io/""" + +from .common import Extractor, Message +from .. import text + + +class ItchioGameExtractor(Extractor): + """Extractor for itch.io games""" + category = "itchio" + subcategory = "game" + root = "https://itch.io" + directory_fmt = ("{category}", "{user[name]}") + filename_fmt = "{game[title]} ({id}).{extension}" + archive_fmt = "{id}" + pattern = r"(?:https?://)?(\w+).itch\.io/([\w-]+)" + test = ( + ("https://sirtartarus.itch.io/a-craft-of-mine", { + "pattern": r"https://\w+\.ssl\.hwcdn\.net/upload2" + r"/game/1983311/7723751\?", + "count": 1, + "keyword": { + "extension": "", + "filename": "7723751", + "game": { + "id": 1983311, + "noun": "game", + "title": "A Craft Of Mine", + "url": "https://sirtartarus.itch.io/a-craft-of-mine", + }, + "user": { + "id": 4060052, + "name": "SirTartarus", + "url": "https://sirtartarus.itch.io", + }, + }, + }), + ) + + def __init__(self, match): + self.user, self.slug = match.groups() + Extractor.__init__(self, match) + + def items(self): + game_url = "https://{}.itch.io/{}".format(self.user, self.slug) + page = self.request(game_url).text + + params = { + "source": "view_game", + "as_props": "1", + "after_download_lightbox": "true", + } + headers = { + "Referer": game_url, + "X-Requested-With": "XMLHttpRequest", + "Origin": "https://{}.itch.io".format(self.user), + } + data = { + "csrf_token": text.unquote(self.session.cookies["itchio_token"]), + } + + for upload_id in text.extract_iter(page, 'data-upload_id="', '"'): + file_url = "{}/file/{}".format(game_url, upload_id) + info = self.request(file_url, method="POST", params=params, + headers=headers, data=data).json() + + game = info["lightbox"]["game"] + user = info["lightbox"]["user"] + game["url"] = game_url + user.pop("follow_button", None) + game = {"game": game, "user": user, "id": upload_id} + + url = info["url"] + yield Message.Directory, game + yield Message.Url, url, text.nameext_from_url(url, game) diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py index 5ba18a3..6fd9f49 100644 --- a/gallery_dl/extractor/manganelo.py +++ b/gallery_dl/extractor/manganelo.py @@ -16,21 +16,26 @@ BASE_PATTERN = r"(?:https?://)?((?:chap|read|www\.|m\.)?mangan(?:at|el)o\.com)" class ManganeloBase(): category = "manganelo" root = "https://chapmanganato.com" + _match_chapter = None def __init__(self, match): domain, path = match.groups() super().__init__(match, "https://" + domain + path) self.session.headers['Referer'] = self.root - self._match_chapter = re.compile( - r"(?:[Vv]ol\.?\s*(\d+)\s?)?" - r"[Cc]hapter\s*([^:]+)" - r"(?::\s*(.+))?").match + if self._match_chapter is None: + ManganeloBase._match_chapter = re.compile( + r"(?:[Vv]ol\.?\s*(\d+)\s?)?" 
+ r"[Cc]hapter\s*(\d+)([^:]*)" + r"(?::\s*(.+))?").match def _parse_chapter(self, info, manga, author, date=None): match = self._match_chapter(info) - volume, chapter, title = match.groups() if match else ("", "", info) - chapter, sep, minor = chapter.partition(".") + if match: + volume, chapter, minor, title = match.groups() + else: + volume = chapter = minor = "" + title = info return { "manga" : manga, @@ -39,7 +44,7 @@ class ManganeloBase(): "title" : text.unescape(title) if title else "", "volume" : text.parse_int(volume), "chapter" : text.parse_int(chapter), - "chapter_minor": sep + minor, + "chapter_minor": minor, "lang" : "en", "language" : "English", } @@ -61,6 +66,10 @@ class ManganeloChapterExtractor(ManganeloBase, ChapterExtractor): "keyword": "06e01fa9b3fc9b5b954c0d4a98f0153b40922ded", "count": 45, }), + ("https://chapmanganato.com/manga-no991297/chapter-8", { + "keyword": {"chapter": 8, "chapter_minor": "-1"}, + "count": 20, + }), ("https://readmanganato.com/manga-gn983696/chapter-23"), ("https://manganelo.com/chapter/gamers/chapter_15"), ("https://manganelo.com/chapter/gq921227/chapter_23"), diff --git a/gallery_dl/extractor/nana.py b/gallery_dl/extractor/nana.py index 0f79d7f..24e676f 100644 --- a/gallery_dl/extractor/nana.py +++ b/gallery_dl/extractor/nana.py @@ -20,19 +20,23 @@ class NanaGalleryExtractor(GalleryExtractor): "059f7de55a4297413bfbd432ce7d6e724dd42bae"), { "pattern": r"https://nana\.my\.id/reader/" r"\w+/image/page\?path=.*\.\w+", - "title" : "Everybody Loves Shion", - "artist" : "fuzui", - "tags" : list, - "count" : 29, + "keyword": { + "title" : "Everybody Loves Shion", + "artist": "fuzui", + "tags" : list, + "count" : 29, + }, }), (("https://nana.my.id/reader/" "77c8712b67013e427923573379f5bafcc0c72e46"), { "pattern": r"https://nana\.my\.id/reader/" r"\w+/image/page\?path=.*\.\w+", - "title" : "Lovey-Dovey With an Otaku-Friendly Gyaru", - "artist" : "Sueyuu", - "tags" : ["Sueyuu"], - "count" : 58, + "keyword": { + "title" : "Lovey-Dovey With an Otaku-Friendly Gyaru", + "artist": "Sueyuu", + "tags" : ["Sueyuu"], + "count" : 58, + }, }), ) diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py index 5f4ceea..beb3da2 100644 --- a/gallery_dl/extractor/nitter.py +++ b/gallery_dl/extractor/nitter.py @@ -162,7 +162,11 @@ class NitterExtractor(BaseExtractor): banner = extr('class="profile-banner"><a href="', '"') try: - uid = banner.split("%2F")[4] + if "/enc/" in banner: + uid = binascii.a2b_base64(banner.rpartition( + "/")[2]).decode().split("/")[4] + else: + uid = banner.split("%2F")[4] except Exception: uid = 0 @@ -302,7 +306,10 @@ class NitterTweetsExtractor(NitterExtractor): r"/media%2FCGMNYZvW0AIVoom\.jpg", "range": "1", }), - ("https://nitter.1d4.us/supernaturepics"), + ("https://nitter.1d4.us/supernaturepics", { + "range": "1", + "keyword": {"user": {"id": "2976459548"}}, + }), ("https://nitter.kavin.rocks/id:2976459548"), ("https://nitter.unixfox.eu/supernaturepics"), ) diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py index f381f12..af2a367 100644 --- a/gallery_dl/extractor/nozomi.py +++ b/gallery_dl/extractor/nozomi.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2022 Mike Fährmann +# Copyright 2019-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -63,10 +63,20 @@ class NozomiExtractor(Extractor): yield Message.Directory, post for post["num"], image in enumerate(images, 1): 
post["filename"] = post["dataid"] = did = image["dataid"] - post["extension"] = ext = image["type"] post["is_video"] = video = bool(image.get("is_video")) + + ext = image["type"] + if video: + subdomain = "v" + elif ext == "gif": + subdomain = "g" + else: + subdomain = "w" + ext = "webp" + + post["extension"] = ext post["url"] = url = "https://{}.nozomi.la/{}/{}/{}.{}".format( - "v" if video else "i", did[-1], did[-3:-1], did, ext) + subdomain, did[-1], did[-3:-1], did, ext) yield Message.Url, url, post def posts(self): @@ -97,15 +107,17 @@ class NozomiPostExtractor(NozomiExtractor): pattern = r"(?:https?://)?nozomi\.la/post/(\d+)" test = ( ("https://nozomi.la/post/3649262.html", { - "url": "f4522adfc8159355fd0476de28761b5be0f02068", - "content": "cd20d2c5149871a0b80a1b0ce356526278964999", + "url": "e5525e717aec712843be8b88592d6406ae9e60ba", + "pattern": r"https://w\.nozomi\.la/2/15/aaa9f7c632cde1e1a5baaff3fb" + r"6a6d857ec73df7fdc5cf5a358caf604bf73152\.webp", + "content": "6d62c4a7fea50c0a89d499603c4e7a2b4b9bffa8", "keyword": { "artist" : ["hammer (sunset beach)"], "character": ["patchouli knowledge"], "copyright": ["touhou"], "dataid" : "re:aaa9f7c632cde1e1a5baaff3fb6a6d857ec73df7fdc5", "date" : "dt:2016-07-26 02:32:03", - "extension": "jpg", + "extension": "webp", "filename" : str, "height" : 768, "is_video" : False, @@ -118,14 +130,26 @@ class NozomiPostExtractor(NozomiExtractor): }), # multiple images per post ("https://nozomi.la/post/25588032.html", { - "url": "6aa3b7db385abcc9d374bdffd19187bccbf8f228", - "keyword": "2a2998af93c6438863c4077bd386b613b8bc2957", + "url": "fb956ccedcf2cf509739d26e2609e910244aa56c", + "keyword": "516ca5cbd0d2a46a8ce26679d6e08de5ac42184b", "count": 7, }), # empty 'date' (#1163) ("https://nozomi.la/post/130309.html", { "keyword": {"date": None}, - }) + }), + # gif + ("https://nozomi.la/post/1647.html", { + "pattern": r"https://g\.nozomi\.la/a/f0/d1b06469e00d72e4f6346209c1" + r"49db459d76b58a074416c260ed93cc31fa9f0a\.gif", + "content": "952efb78252bbc9fb56df2e8fafb68d5e6364181", + }), + # video + ("https://nozomi.la/post/2269847.html", { + "pattern": r"https://v\.nozomi\.la/d/0e/ff88398862669783691b31519f" + r"2bea3a35c24b6e62e3ba2d89b4409e41c660ed\.webm", + "content": "57065e6c16da7b1c7098a63b36fb0c6c6f1b9bca", + }), ) def __init__(self, match): @@ -160,7 +184,7 @@ class NozomiTagExtractor(NozomiExtractor): archive_fmt = "t_{search_tags}_{dataid}" pattern = r"(?:https?://)?nozomi\.la/tag/([^/?#]+)-(\d+)\." 
test = ("https://nozomi.la/tag/3:1_aspect_ratio-1.html", { - "pattern": r"^https://[iv]\.nozomi\.la/\w/\w\w/\w+\.\w+$", + "pattern": r"^https://[wgv]\.nozomi\.la/\w/\w\w/\w+\.\w+$", "count": ">= 25", "range": "1-25", }) diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index ec46ca3..404f296 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -71,8 +71,11 @@ class OAuthBase(Extractor): browser = self.config("browser", True) if browser: - import webbrowser - browser = webbrowser.get() + try: + import webbrowser + browser = webbrowser.get() + except Exception: + browser = None if browser and browser.open(url): name = getattr(browser, "name", "Browser") diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py index 56e3b39..f0a50c8 100644 --- a/gallery_dl/extractor/paheal.py +++ b/gallery_dl/extractor/paheal.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2022 Mike Fährmann +# Copyright 2018-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -14,7 +14,7 @@ from .. import text class PahealExtractor(Extractor): """Base class for paheal extractors""" - basecategory = "booru" + basecategory = "shimmie2" category = "paheal" filename_fmt = "{category}_{id}_{md5}.{extension}" archive_fmt = "{id}" diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index a17518f..b704031 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -596,6 +596,9 @@ class PixivSearchExtractor(PixivExtractor): sort_map = { "date": "date_asc", "date_d": "date_desc", + "popular_d": "popular_desc", + "popular_male_d": "popular_male_desc", + "popular_female_d": "popular_female_desc", } try: self.sort = sort = sort_map[sort] @@ -670,7 +673,7 @@ class PixivPixivisionExtractor(PixivExtractor): def works(self): return ( - self.api.illust_detail(illust_id) + self.api.illust_detail(illust_id.partition("?")[0]) for illust_id in util.unique_sequence(text.extract_iter( self.page, '<a href="https://www.pixiv.net/en/artworks/', '"')) ) diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 305de2a..cefe8d3 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -303,8 +303,8 @@ class RedditImageExtractor(Extractor): category = "reddit" subcategory = "image" archive_fmt = "{filename}" - pattern = (r"(?:https?://)?i\.redd(?:\.it|ituploads\.com)" - r"/[^/?#]+(?:\?[^#]*)?") + pattern = (r"(?:https?://)?((?:i|preview)\.redd\.it|i\.reddituploads\.com)" + r"/([^/?#]+)(\?[^#]*)?") test = ( ("https://i.redd.it/upjtjcx2npzz.jpg", { "url": "0de614900feef103e580b632190458c0b62b641a", @@ -315,12 +315,29 @@ class RedditImageExtractor(Extractor): "url": "f24f25efcedaddeec802e46c60d77ef975dc52a5", "content": "541dbcc3ad77aa01ee21ca49843c5e382371fae7", }), + # preview.redd.it -> i.redd.it + (("https://preview.redd.it/00af44lpn0u51.jpg?width=960&crop=smart" + "&auto=webp&v=enabled&s=dbca8ab84033f4a433772d9c15dbe0429c74e8ac"), { + "pattern": r"^https://i\.redd\.it/00af44lpn0u51\.jpg$" + }), ) + def __init__(self, match): + Extractor.__init__(self, match) + domain = match.group(1) + self.path = match.group(2) + if domain == "preview.redd.it": + self.domain = "i.redd.it" + self.query = "" + else: + self.domain = domain + self.query = match.group(3) or "" + def items(self): - data = text.nameext_from_url(self.url) + url = "https://{}/{}{}".format(self.domain, 
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index ea4cf43..f36051b 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -13,6 +13,7 @@ from .common import Message
 from .. import text, util, exception
 from ..cache import cache
 import collections
+import re

 BASE_PATTERN = r"(?:https?://)?" \
     r"(?:(?:chan|beta|black|white)\.sankakucomplex\.com|sankaku\.app)" \
@@ -101,6 +102,11 @@ class SankakuTagExtractor(SankakuExtractor):
         # match arbitrary query parameters
         ("https://chan.sankakucomplex.com"
          "/?tags=marie_rose&page=98&next=3874906&commit=Search"),
+        # 'date:' tags (#1790)
+        ("https://chan.sankakucomplex.com/?tags=date:2023-03-20", {
+            "range": "1",
+            "count": 1,
+        }),
     )

     def __init__(self, match):
@@ -108,6 +114,15 @@ class SankakuTagExtractor(SankakuExtractor):
         query = text.parse_query(match.group(1))
         self.tags = text.unquote(query.get("tags", "").replace("+", " "))

+        if "date:" in self.tags:
+            # rewrite 'date:' tags (#1790)
+            self.tags = re.sub(
+                r"date:(\d\d)[.-](\d\d)[.-](\d\d\d\d)",
+                r"date:\3.\2.\1", self.tags)
+            self.tags = re.sub(
+                r"date:(\d\d\d\d)[.-](\d\d)[.-](\d\d)",
+                r"date:\1.\2.\3", self.tags)
+
     def metadata(self):
         return {"search_tags": self.tags}

@@ -153,7 +168,7 @@ class SankakuPostExtractor(SankakuExtractor):
     """Extractor for single posts from sankaku.app"""
     subcategory = "post"
     archive_fmt = "{id}"
-    pattern = BASE_PATTERN + r"/post/show/(\d+)"
+    pattern = BASE_PATTERN + r"/post/show/([0-9a-f]+)"
     test = (
         ("https://sankaku.app/post/show/360451", {
             "content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
@@ -181,6 +196,17 @@ class SankakuPostExtractor(SankakuExtractor):
                 "tags_general": ["key(mangaka)", "key(mangaka)"],
             },
         }),
+        # md5 hexdigest instead of ID (#3952)
+        (("https://chan.sankakucomplex.com/post/show"
+          "/f8ba89043078f0e4be2d9c46550b840a"), {
+            "pattern": r"https://s\.sankakucomplex\.com"
+                       r"/data/f8/ba/f8ba89043078f0e4be2d9c46550b840a\.jpg",
+            "count": 1,
+            "keyword": {
+                "id": 33195194,
+                "md5": "f8ba89043078f0e4be2d9c46550b840a",
+            },
+        }),
         ("https://chan.sankakucomplex.com/post/show/360451"),
         ("https://chan.sankakucomplex.com/ja/post/show/360451"),
         ("https://beta.sankakucomplex.com/post/show/360451"),
@@ -248,7 +274,7 @@ class SankakuAPI():
             "lang" : "en",
             "page" : "1",
             "limit": "1",
-            "tags" : "id_range:" + post_id,
+            "tags" : ("md5:" if len(post_id) == 32 else "id_range:") + post_id,
         }
         return self._call("/posts", params)
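Note: Sankaku only understands `date:` tags in `YYYY.MM.DD` order; the two substitutions above normalize both `DD.MM.YYYY` and ISO-style `YYYY-MM-DD` input. Standalone, with the same regexes:

```python
import re

def normalize(tags):
    tags = re.sub(r"date:(\d\d)[.-](\d\d)[.-](\d\d\d\d)",
                  r"date:\3.\2.\1", tags)        # DD.MM.YYYY -> YYYY.MM.DD
    return re.sub(r"date:(\d\d\d\d)[.-](\d\d)[.-](\d\d)",
                  r"date:\1.\2.\3", tags)        # YYYY-MM-DD -> YYYY.MM.DD

assert normalize("date:2023-03-20") == "date:2023.03.20"
assert normalize("date:20.03.2023") == "date:2023.03.20"
```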
diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py
new file mode 100644
index 0000000..285cd8f
--- /dev/null
+++ b/gallery_dl/extractor/shimmie2.py
@@ -0,0 +1,326 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2023 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for Shimmie2 instances"""
+
+from .common import BaseExtractor, Message
+from .. import text
+
+
+class Shimmie2Extractor(BaseExtractor):
+    """Base class for shimmie2 extractors"""
+    basecategory = "shimmie2"
+    filename_fmt = "{category}_{id}{md5:?_//}.{extension}"
+    archive_fmt = "{id}"
+
+    def __init__(self, match):
+        BaseExtractor.__init__(self, match)
+
+        try:
+            instance = INSTANCES[self.category]
+        except KeyError:
+            pass
+        else:
+            cookies = instance.get("cookies")
+            if cookies:
+                domain = self.root.rpartition("/")[2]
+                self._update_cookies_dict(cookies, domain=domain)
+            file_url = instance.get("file_url")
+            if file_url:
+                self.file_url_fmt = file_url
+
+    def items(self):
+        data = self.metadata()
+
+        for post in self.posts():
+
+            for key in ("id", "width", "height"):
+                post[key] = text.parse_int(post[key])
+            post["tags"] = text.unquote(post["tags"])
+            post.update(data)
+
+            url = post["file_url"]
+            if "/index.php?" in url:
+                post["filename"], _, post["extension"] = \
+                    url.rpartition("/")[2].rpartition(".")
+            else:
+                text.nameext_from_url(url, post)
+
+            yield Message.Directory, post
+            yield Message.Url, url, post
+
+    def metadata(self):
+        """Return general metadata"""
+        return ()
+
+    def posts(self):
+        """Return an iterable containing data of all relevant posts"""
+        return ()
+
+
+INSTANCES = {
+    "mememuseum": {
+        "root": "https://meme.museum",
+        "pattern": r"meme\.museum",
+    },
+    "loudbooru": {
+        "root": "https://loudbooru.com",
+        "pattern": r"loudbooru\.com",
+        "cookies": {"ui-tnc-agreed": "true"},
+    },
+    "giantessbooru": {
+        "root": "https://giantessbooru.com",
+        "pattern": r"giantessbooru\.com",
+        "cookies": {"agreed": "true"},
+    },
+    "tentaclerape": {
+        "root": "https://tentaclerape.net",
+        "pattern": r"tentaclerape\.net",
+    },
+    "cavemanon": {
+        "root": "https://booru.cavemanon.xyz",
+        "pattern": r"booru\.cavemanon\.xyz",
+        "file_url": "{0}/index.php?q=image/{2}.{4}"
+    },
+}
+
+BASE_PATTERN = Shimmie2Extractor.update(INSTANCES) + r"/(?:index\.php\?q=)?"
+
+
+class Shimmie2TagExtractor(Shimmie2Extractor):
+    """Extractor for shimmie2 posts by tag search"""
+    subcategory = "tag"
+    directory_fmt = ("{category}", "{search_tags}")
+    file_url_fmt = "{}/_images/{}/{}%20-%20{}.{}"
+    pattern = BASE_PATTERN + r"post/list/([^/?#]+)(?:/(\d+))?()"
+    test = (
+        ("https://meme.museum/post/list/animated/1", {
+            "pattern": r"https://meme\.museum/_images/\w+/\d+%20-%20",
+            "count": ">= 30"
+        }),
+        ("https://loudbooru.com/post/list/original_character/1", {
+            "pattern": r"https://loudbooru\.com/_images/[0-9a-f]{32}/\d+",
+            "range": "1-100",
+            "count": 100,
+        }),
+        ("https://giantessbooru.com/post/list/smiling/1", {
+            "pattern": r"https://giantessbooru\.com/_images/[0-9a-f]{32}/\d+",
+            "range": "1-100",
+            "count": 100,
+        }),
+        ("https://tentaclerape.net/post/list/comic/1", {
+            "pattern": r"https://tentaclerape\.net/_images/[0-9a-f]{32}/\d+",
+            "range": "1-100",
+            "count": 100,
+        }),
+        ("https://booru.cavemanon.xyz/index.php?q=post/list/Amber/1", {
+            "pattern": r"https://booru\.cavemanon\.xyz"
+                       r"/index\.php\?q=image/\d+\.\w+",
+            "range": "1-100",
+            "count": 100,
+        }),
+    )
+
+    def __init__(self, match):
+        Shimmie2Extractor.__init__(self, match)
+        lastindex = match.lastindex
+        self.tags = text.unquote(match.group(lastindex-2))
+        self.page = match.group(lastindex-1)
+
+    def metadata(self):
+        return {"search_tags": self.tags}
+
+    def posts(self):
+        pnum = text.parse_int(self.page, 1)
+        file_url_fmt = self.file_url_fmt.format
+
+        init = True
+        mime = ""
+
+        while True:
+            url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum)
+            page = self.request(url).text
+            extr = text.extract_from(page)
+
+            if init:
+                init = False
+                has_mime = ("data-mime='" in page)
+                has_pid = ("data-post-id='" in page)
+
+            while True:
+                if has_mime:
+                    mime = extr("data-mime='", "'")
+                if has_pid:
+                    pid = extr("data-post-id='", "'")
+                else:
+                    pid = extr("href='/post/view/", "?")
+
+                if not pid:
+                    break
+
+                tags, dimensions, size = extr("title='", "'").split(" // ")
+                width, _, height = dimensions.partition("x")
+                md5 = extr("/_thumbs/", "/")
+
+                yield {
+                    "file_url": file_url_fmt(
+                        self.root, md5, pid, text.quote(tags),
+                        mime.rpartition("/")[2] if mime else "jpg"),
+                    "id": pid,
+                    "md5": md5,
+                    "tags": tags,
+                    "width": width,
+                    "height": height,
+                    "size": text.parse_bytes(size[:-1]),
+                }
+
+            pnum += 1
+            if not extr(">Next<", ">"):
+                if not extr("/{}'>{}<".format(pnum, pnum), ">"):
+                    return
+
+
+class Shimmie2PostExtractor(Shimmie2Extractor):
+    """Extractor for single shimmie2 posts"""
+    subcategory = "post"
+    pattern = BASE_PATTERN + r"post/view/(\d+)"
+    test = (
+        ("https://meme.museum/post/view/10243", {
+            "pattern": r"https://meme\.museum/_images/105febebcd5ca791ee332adc"
+                       r"49971f78/10243%20-%20g%20beard%20open_source%20richar"
+                       r"d_stallman%20stallman%20tagme%20text\.jpg",
+            "content": "45565f3f141fc960a8ae1168b80e718a494c52d2",
+            "keyword": {
+                "extension": "jpg",
+                "file_url": "https://meme.museum/_images/105febebcd5ca791ee332"
+                            "adc49971f78/10243%20-%20g%20beard%20open_source%2"
+                            "0richard_stallman%20stallman%20tagme%20text.jpg",
+                "filename": "10243 - g beard open_source richard_stallman "
+                            "stallman tagme text",
+                "height": 451,
+                "id": 10243,
+                "md5": "105febebcd5ca791ee332adc49971f78",
+                "size": 0,
+                "subcategory": "post",
+                "tags": "/g/ beard open_source "
+                        "richard_stallman stallman tagme text",
+                "width": 480,
+            },
+        }),
+        ("https://loudbooru.com/post/view/33828", {
+            "pattern": r"https://loudbooru\.com/_images/.+\.png",
+            "content": "a4755f787ba23ae2aa297a46810f802ca9032739",
+            "keyword": {
+                "extension": "png",
+                "file_url": "https://loudbooru.com/_images/ca2638d903c86e8337f"
+                            "e9aeb4974be88/33828%20-%202020%20artist%3Astikyfi"
+                            "nkaz%20character%3Alisa_loud%20cover%20fanfiction"
+                            "%3Aplatz_eins%20frowning%20half-closed_eyes%20sol"
+                            "o%20text%20title_card.png",
+                "filename": "33828 - 2020 artist:stikyfinkaz character:lisa_"
+                            "loud cover fanfiction:platz_eins frowning "
+                            "half-closed_eyes solo text title_card",
+                "height": 1920,
+                "id": 33828,
+                "md5": "ca2638d903c86e8337fe9aeb4974be88",
+                "tags": "2020 artist:stikyfinkaz character:lisa_loud cover "
+                        "fanfiction:platz_eins frowning half-closed_eyes "
+                        "solo text title_card",
+                "width": 1078,
+            },
+        }),
+        ("https://giantessbooru.com/post/view/41", {
+            "pattern": r"https://giantessbooru\.com/_images"
+                       r"/3f67e1986496806b7b14ff3e82ac5af4/41\.jpg",
+            "content": "79115ed309d1f4e82e7bead6948760e889139c91",
+            "keyword": {
+                "extension": "jpg",
+                "file_url": "https://giantessbooru.com/_images"
+                            "/3f67e1986496806b7b14ff3e82ac5af4/41.jpg",
+                "filename": "41",
+                "height": 0,
+                "id": 41,
+                "md5": "3f67e1986496806b7b14ff3e82ac5af4",
+                "size": 0,
+                "tags": "anime bare_midriff color drawing gentle giantess "
+                        "karbo looking_at_tinies negeyari outdoors smiling "
+                        "snake_girl white_hair",
+                "width": 0
+
+
+            },
+        }),
+        ("https://tentaclerape.net/post/view/10", {
+            "pattern": r"https://tentaclerape\.net/\./index\.php"
+                       r"\?q=/image/10\.jpg",
+            "content": "d0fd8f0f6517a76cb5e23ba09f3844950bf2c516",
+            "keyword": {
+                "extension": "jpg",
+                "file_url": "https://tentaclerape.net/./index.php"
+                            "?q=/image/10.jpg",
+                "filename": "10",
+                "height": 427,
+                "id": 10,
+                "md5": "945db71eeccaef82ce44b77564260c0b",
+                "size": 0,
+                "subcategory": "post",
+                "tags": "Deviant_Art Pet Tentacle artist_sche blonde_hair "
+                        "blouse boots green_eyes highheels leash miniskirt "
+                        "octopus schoolgirl white_skin willing",
+                "width": 300,
+            },
+        }),
+        # video
+        ("https://tentaclerape.net/post/view/91267", {
+            "pattern": r"https://tentaclerape\.net/\./index\.php"
+                       r"\?q=/image/91267\.mp4",
+        }),
+        ("https://booru.cavemanon.xyz/index.php?q=post/view/8335", {
+            "pattern": r"https://booru\.cavemanon\.xyz"
+                       r"/index\.php\?q=image/8335\.png",
+            "content": "7158f7e4abbbf143bad5835eb93dbe4d68c1d4ab",
+            "keyword": {
+                "extension": "png",
+                "file_url": "https://booru.cavemanon.xyz"
+                            "/index.php?q=image/8335.png",
+                "filename": "8335",
+                "height": 460,
+                "id": 8335,
+                "md5": "",
+                "size": 0,
+                "tags": "Color Fang",
+                "width": 459,
+            },
+        }),
+    )
+
+    def __init__(self, match):
+        Shimmie2Extractor.__init__(self, match)
+        self.post_id = match.group(match.lastindex)
+
+    def posts(self):
+        url = "{}/post/view/{}".format(self.root, self.post_id)
+        extr = text.extract_from(self.request(url).text)
+
+        post = {
+            "id"      : self.post_id,
+            "tags"    : extr(": ", "<").partition(" - ")[0].rstrip(")"),
+            "md5"     : extr("/_thumbs/", "/"),
+            "file_url": self.root + (
+                extr("id='main_image' src='", "'") or
+                extr("<source src='", "'")),
+            "width"   : extr("data-width=", " ").strip("\"'"),
+            "height"  : extr("data-height=", ">").partition(
+                " ")[0].strip("\"'"),
+            "size"    : 0,
+        }
+
+        if not post["md5"]:
+            post["md5"] = text.extr(post["file_url"], "/_images/", "/")
+
+        return (post,)
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 155db1e..b45609d 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2016-2022 Mike Fährmann
+# Copyright 2016-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -10,7 +10,7 @@

 from .common import Extractor, Message
 from .. import text, oauth, exception
-from datetime import datetime, timedelta
+from datetime import datetime, date, timedelta
 import re

@@ -269,7 +269,7 @@ class TumblrExtractor(Extractor):

 class TumblrUserExtractor(TumblrExtractor):
-    """Extractor for all images from a tumblr-user"""
+    """Extractor for a Tumblr user's posts"""
     subcategory = "user"
     pattern = BASE_PATTERN + r"(?:/page/\d+|/archive)?/?$"
     test = (
@@ -307,6 +307,16 @@ class TumblrUserExtractor(TumblrExtractor):
             "options": (("date-min", "201804"), ("date-max", "201805"),
                         ("date-format", "%Y%m"))
         }),
+        # pagination with 'date-max' (#2191) and 'api-key'
+        ("https://donttrustthetits.tumblr.com/", {
+            "options": (
+                ("access-token", None),
+                ("original", False),
+                ("date-max", "2015-04-25T00:00:00"),
+                ("date-min", "2015-04-01T00:00:00"),
+            ),
+            "count": 316,
+        }),
         ("https://demo.tumblr.com/page/2"),
         ("https://demo.tumblr.com/archive"),
         ("tumblr:http://www.b-authentique.com/"),
@@ -321,7 +331,7 @@ class TumblrUserExtractor(TumblrExtractor):

 class TumblrPostExtractor(TumblrExtractor):
-    """Extractor for images from a single post on tumblr"""
+    """Extractor for a single Tumblr post"""
     subcategory = "post"
     pattern = BASE_PATTERN + r"/(?:post/|image/)?(\d+)"
     test = (
@@ -389,7 +399,7 @@ class TumblrPostExtractor(TumblrExtractor):

 class TumblrTagExtractor(TumblrExtractor):
-    """Extractor for images from a tumblr-user by tag"""
+    """Extractor for Tumblr user's posts by tag"""
     subcategory = "tag"
     pattern = BASE_PATTERN + r"/tagged/([^/?#]+)"
     test = (
@@ -411,8 +421,37 @@ class TumblrTagExtractor(TumblrExtractor):
         return self.api.posts(self.blog, {"tag": self.tag})


+class TumblrDayExtractor(TumblrExtractor):
+    """Extractor for Tumblr user's posts by day"""
+    subcategory = "day"
+    pattern = BASE_PATTERN + r"/day/(\d\d\d\d/\d\d/\d\d)"
+    test = (
+        ("https://mikf123.tumblr.com/day/2018/01/05", {
+            "pattern": r"https://64\.media\.tumblr\.com"
+                       r"/1a2be8c63f1df58abd2622861696c72a"
+                       r"/tumblr_ozm9nqst9t1wgha4yo1_1280\.jpg",
+            "keyword": {"id": 169341068404},
+            "count": 1,
+        }),
+        ("https://www.tumblr.com/blog/view/mikf123/day/2018/01/05"),
+        ("https://www.tumblr.com/blog/mikf123/day/2018/01/05"),
+        ("https://www.tumblr.com/mikf123/day/2018/01/05"),
+    )
+
+    def __init__(self, match):
+        TumblrExtractor.__init__(self, match)
+        year, month, day = match.group(4).split("/")
+        self.date_min = ts = (
+            # 719163 == date(1970, 1, 1).toordinal()
+            date(int(year), int(month), int(day)).toordinal() - 719163) * 86400
+        self.api.before = ts + 86400
+
+    def posts(self):
+        return self.api.posts(self.blog, {})
+
+
 class TumblrLikesExtractor(TumblrExtractor):
-    """Extractor for images from a tumblr-user's liked posts"""
+    """Extractor for a Tumblr user's liked posts"""
     subcategory = "likes"
     directory_fmt = ("{category}", "{blog_name}", "likes")
     archive_fmt = "f_{blog[name]}_{id}_{num}"
@@ -431,7 +470,11 @@ class TumblrLikesExtractor(TumblrExtractor):

 class TumblrAPI(oauth.OAuth1API):
-    """Minimal interface for the Tumblr API v2"""
+    """Interface for the Tumblr API v2
+
+    https://github.com/tumblr/docs/blob/master/api.md
+    """
+
+    ROOT = "https://api.tumblr.com"
     API_KEY = "O3hU2tMi5e4Qs5t3vezEi6L0qRORJ5y9oUpSGsrWu8iA3UCc3B"
     API_SECRET = "sFdsK3PDdP2QpYMRAoq0oDnw0sFS24XigXmdfnaeNZpJpqAn03"
     BLOG_CACHE = {}
@@ -442,55 +485,46 @@ class TumblrAPI(oauth.OAuth1API):

     def info(self, blog):
         """Return general information about a blog"""
-        if blog not in self.BLOG_CACHE:
-            self.BLOG_CACHE[blog] = self._call(blog, "info", {})["blog"]
-        return self.BLOG_CACHE[blog]
+        try:
+            return self.BLOG_CACHE[blog]
+        except KeyError:
+            endpoint = "/v2/blog/{}/info".format(blog)
+            params = {"api_key": self.api_key} if self.api_key else None
+            self.BLOG_CACHE[blog] = blog = self._call(endpoint, params)["blog"]
+            return blog

     def avatar(self, blog, size="512"):
         """Retrieve a blog avatar"""
         if self.api_key:
-            url_fmt = "https://api.tumblr.com/v2/blog/{}/avatar/{}?api_key={}"
-            return url_fmt.format(blog, size, self.api_key)
+            return "{}/v2/blog/{}/avatar/{}?api_key={}".format(
+                self.ROOT, blog, size, self.api_key)
+        endpoint = "/v2/blog/{}/avatar".format(blog)
         params = {"size": size}
-        data = self._call(blog, "avatar", params, allow_redirects=False)
-        return data["avatar_url"]
+        return self._call(
+            endpoint, params, allow_redirects=False)["avatar_url"]

     def posts(self, blog, params):
         """Retrieve published posts"""
-        params["offset"] = self.extractor.config("offset") or 0
-        params["limit"] = 50
+        params["offset"] = self.extractor.config("offset")
+        params["limit"] = "50"
         params["reblog_info"] = "true"
+        params["type"] = self.posts_type
+        params["before"] = self.before

-        if self.posts_type:
-            params["type"] = self.posts_type
-        if self.before:
-            params["before"] = self.before
+        if self.before and params["offset"]:
+            self.log.warning("'offset' and 'date-max' cannot be used together")

-        while True:
-            data = self._call(blog, "posts", params)
-            self.BLOG_CACHE[blog] = data["blog"]
-            yield from data["posts"]
-            params["offset"] += params["limit"]
-            if params["offset"] >= data["total_posts"]:
-                return
+        return self._pagination(blog, "/posts", params, cache=True)

     def likes(self, blog):
         """Retrieve liked posts"""
         params = {"limit": "50", "before": self.before}
-        while True:
-            posts = self._call(blog, "likes", params)["liked_posts"]
-            if not posts:
-                return
-            yield from posts
-            params["before"] = posts[-1]["liked_timestamp"]
+        return self._pagination(blog, "/likes", params, key="liked_posts")

-    def _call(self, blog, endpoint, params, **kwargs):
-        if self.api_key:
-            params["api_key"] = self.api_key
-        url = "https://api.tumblr.com/v2/blog/{}/{}".format(
-            blog, endpoint)
-
-        response = self.request(url, params=params, **kwargs)
+    def _call(self, endpoint, params, **kwargs):
+        url = self.ROOT + endpoint
+        kwargs["params"] = params
+        response = self.request(url, **kwargs)

         try:
             data = response.json()
@@ -535,7 +569,7 @@ class TumblrAPI(oauth.OAuth1API):

                 if self.extractor.config("ratelimit") == "wait":
                     self.extractor.wait(seconds=reset)
-                    return self._call(blog, endpoint, params)
+                    return self._call(endpoint, params, **kwargs)

                 t = (datetime.now() + timedelta(seconds=float(reset))).time()
                 raise exception.StopExtraction(
@@ -547,6 +581,29 @@ class TumblrAPI(oauth.OAuth1API):
             if reset:
                 self.log.info("Hourly API rate limit exceeded")
                 self.extractor.wait(seconds=reset)
-                return self._call(blog, endpoint, params)
+                return self._call(endpoint, params, **kwargs)

         raise exception.StopExtraction(data)
+
+    def _pagination(self, blog, endpoint, params, key="posts", cache=False):
+        endpoint = "/v2/blog/{}{}".format(blog, endpoint)
+        if self.api_key:
+            params["api_key"] = self.api_key
+
+        while True:
+            data = self._call(endpoint, params)
+
+            if cache:
+                self.BLOG_CACHE[blog] = data["blog"]
+                cache = False
+
+            yield from data[key]
+
+            try:
+                endpoint = data["_links"]["next"]["href"]
+            except KeyError:
+                return
+
+            params = None
+            if self.api_key:
+                endpoint += "&api_key=" + self.api_key
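Note: the `719163` constant in `TumblrDayExtractor` converts a calendar day into Unix time. `date.toordinal()` counts days from 0001-01-01, and 1970-01-01 is day 719163, so subtracting it and multiplying by 86400 yields midnight UTC of that day. A quick sanity check:

```python
from datetime import date, datetime, timezone

assert date(1970, 1, 1).toordinal() == 719163

ts = (date(2018, 1, 5).toordinal() - 719163) * 86400
assert ts == datetime(2018, 1, 5, tzinfo=timezone.utc).timestamp()
# the extractor then requests posts 'before' ts + 86400 (end of that day)
```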
data["_links"]["next"]["href"] + except KeyError: + return + + params = None + if self.api_key: + endpoint += "&api_key=" + self.api_key diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 2ccc7e5..5e68f13 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -45,7 +45,8 @@ class TwitterExtractor(Extractor): if not self.config("transform", True): self._transform_user = util.identity self._transform_tweet = util.identity - self._user = self._user_obj = None + self._user = None + self._user_obj = None self._user_cache = {} self._init_sizes() @@ -769,6 +770,13 @@ class TwitterTweetExtractor(TwitterExtractor): "pattern": r"https://pbs\.twimg\.com/media/EaK.+=jpg", "count": 4, }), + # different 'user' and 'author' in quoted Tweet (#3922) + ("https://twitter.com/web/status/1644907989109751810", { + "keyword": { + "author": {"id": 321629993 , "name": "Cakes_Comics"}, + "user" : {"id": 718928225360080897, "name": "StobiesGalaxy"}, + }, + }), # TwitPic embeds (#579) ("https://twitter.com/i/web/status/112900228289540096", { "options": (("twitpic", True), ("cards", False)), @@ -897,7 +905,8 @@ Your reaction.""", for tweet in self.api.tweet_detail(tweet_id): if tweet["rest_id"] == tweet_id or \ tweet.get("_retweet_id_str") == tweet_id: - self._assign_user(tweet["core"]["user_results"]["result"]) + if self._user_obj is None: + self._assign_user(tweet["core"]["user_results"]["result"]) tweets.append(tweet) tweet_id = tweet["legacy"].get("quoted_status_id_str") @@ -1561,9 +1570,9 @@ class TwitterAPI(): if esw("tweet-"): tweets.append(entry) - elif esw("homeConversation-"): - tweets.extend(entry["content"]["items"]) - elif esw("conversationthread-"): + elif esw(("homeConversation-", + "profile-conversation-", + "conversationthread-")): tweets.extend(entry["content"]["items"]) elif esw("tombstone-"): item = entry["content"]["itemContent"] diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py new file mode 100644 index 0000000..1cebdf7 --- /dev/null +++ b/gallery_dl/extractor/vipergirls.py @@ -0,0 +1,108 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://vipergirls.to/""" + +from .common import Extractor, Message +from .. 
diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py
index 714f4fe..5004bed 100644
--- a/gallery_dl/postprocessor/metadata.py
+++ b/gallery_dl/postprocessor/metadata.py
@@ -124,10 +124,8 @@ class MetadataPP(PostProcessor):
         for key, func in self.fields.items():
             obj = kwdict
             try:
-                while "[" in key:
-                    name, _, key = key.partition("[")
-                    obj = obj[name]
-                    key = key.rstrip("]")
+                if "[" in key:
+                    obj, key = _traverse(obj, key)
                 obj[key] = func(kwdict)
             except Exception:
                 pass
@@ -137,10 +135,8 @@ class MetadataPP(PostProcessor):
         for key in self.fields:
             obj = kwdict
             try:
-                while "[" in key:
-                    name, _, key = key.partition("[")
-                    obj = obj[name]
-                    key = key.rstrip("]")
+                if "[" in key:
+                    obj, key = _traverse(obj, key)
                 del obj[key]
             except Exception:
                 pass
@@ -214,4 +210,15 @@ class MetadataPP(PostProcessor):
     )


+def _traverse(obj, key):
+    name, _, key = key.partition("[")
+    obj = obj[name]
+
+    while "[" in key:
+        name, _, key = key.partition("[")
+        obj = obj[name.strip("\"']")]
+
+    return obj, key.strip("\"']")
+
+
 __postprocessor__ = MetadataPP
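Note: `_traverse` factors the bracket-key walk out of the set and delete loops — it descends into the dict for every `[...]` segment, stripping optional quotes, and hands back the parent object plus the final key. Applied to sample data:

```python
kwdict = {"user": {"profile": {"name": "old"}}}

obj, key = _traverse(kwdict, "user[profile][name]")
assert key == "name" and obj is kwdict["user"]["profile"]

obj[key] = "new"   # what the "fields" mode then does
del obj[key]       # what the "delete" mode then does
# quoted segments work too: _traverse(kwdict, "user['profile']['name']")
```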
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index c40736a..d4ef532 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-__version__ = "1.25.2"
+__version__ = "1.25.3"
diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py
index b4638b7..eb09b9b 100644
--- a/gallery_dl/ytdl.py
+++ b/gallery_dl/ytdl.py
@@ -409,9 +409,12 @@ def parse_command_line(module, argv):
         "postprocessor_args": opts.postprocessor_args,
         "cn_verification_proxy": opts.cn_verification_proxy,
         "geo_verification_proxy": opts.geo_verification_proxy,
-        "geo_bypass": opts.geo_bypass,
-        "geo_bypass_country": opts.geo_bypass_country,
-        "geo_bypass_ip_block": opts.geo_bypass_ip_block,
+        "geo_bypass": getattr(
+            opts, "geo_bypass", "default"),
+        "geo_bypass_country": getattr(
+            opts, "geo_bypass_country", None),
+        "geo_bypass_ip_block": getattr(
+            opts, "geo_bypass_ip_block", None),
         "compat_opts": compat_opts,
     }
