From 33d4eae5a6df8aaf6757f52ae25f514ff1211c62 Mon Sep 17 00:00:00 2001
From: Unit 193
Date: Sun, 30 Apr 2023 16:45:21 -0400
Subject: New upstream version 1.25.3.

---
 gallery_dl/downloader/http.py        |  30 +++-
 gallery_dl/extractor/2chen.py        |  35 ++--
 gallery_dl/extractor/__init__.py     |   4 +-
 gallery_dl/extractor/behance.py      |  42 +++--
 gallery_dl/extractor/deviantart.py   |  23 ++-
 gallery_dl/extractor/imagefap.py     |  26 ++-
 gallery_dl/extractor/imagehosts.py   |  23 +++
 gallery_dl/extractor/imgur.py        |   6 +-
 gallery_dl/extractor/itchio.py       |  82 +++++++++
 gallery_dl/extractor/manganelo.py    |  23 ++-
 gallery_dl/extractor/nana.py         |  20 ++-
 gallery_dl/extractor/nitter.py       |  11 +-
 gallery_dl/extractor/nozomi.py       |  44 +++--
 gallery_dl/extractor/oauth.py        |   7 +-
 gallery_dl/extractor/paheal.py       |   4 +-
 gallery_dl/extractor/pixiv.py        |   5 +-
 gallery_dl/extractor/reddit.py       |  28 ++-
 gallery_dl/extractor/sankaku.py      |  30 +++-
 gallery_dl/extractor/shimmie2.py     | 326 +++++++++++++++++++++++++++++++++++
 gallery_dl/extractor/tumblr.py       | 141 ++++++++++-----
 gallery_dl/extractor/twitter.py      |  19 +-
 gallery_dl/extractor/vipergirls.py   | 108 ++++++++++++
 gallery_dl/postprocessor/metadata.py |  23 ++-
 gallery_dl/version.py                |   2 +-
 gallery_dl/ytdl.py                   |   9 +-
 25 files changed, 931 insertions(+), 140 deletions(-)
 create mode 100644 gallery_dl/extractor/itchio.py
 create mode 100644 gallery_dl/extractor/shimmie2.py
 create mode 100644 gallery_dl/extractor/vipergirls.py

diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index 88e86e9..4ec0398 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -44,6 +44,12 @@ class HttpDownloader(DownloaderBase):
         self.mtime = self.config("mtime", True)
         self.rate = self.config("rate")
 
+        if not self.config("consume-content", False):
+            # this resets the underlying TCP connection, and therefore
+            # if the program makes another request to the same domain,
+            # a new connection (either TLS or plain TCP) must be made
+            self.release_conn = lambda resp: resp.close()
+
         if self.retries < 0:
             self.retries = float("inf")
         if self.minsize:
@@ -106,7 +112,7 @@ class HttpDownloader(DownloaderBase):
         while True:
             if tries:
                 if response:
-                    response.close()
+                    self.release_conn(response)
                     response = None
                 self.log.warning("%s (%s/%s)", msg, tries, self.retries+1)
                 if tries > self.retries:
@@ -165,18 +171,24 @@ class HttpDownloader(DownloaderBase):
                 retry = kwdict.get("_http_retry")
                 if retry and retry(response):
                     continue
+                self.release_conn(response)
                 self.log.warning(msg)
                 return False
 
             # check for invalid responses
             validate = kwdict.get("_http_validate")
             if validate and self.validate:
-                result = validate(response)
+                try:
+                    result = validate(response)
+                except Exception:
+                    self.release_conn(response)
+                    raise
                 if isinstance(result, str):
                     url = result
                     tries -= 1
                     continue
                 if not result:
+                    self.release_conn(response)
                     self.log.warning("Invalid response")
                     return False
 
@@ -184,11 +196,13 @@ class HttpDownloader(DownloaderBase):
             size = text.parse_int(size, None)
             if size is not None:
                 if self.minsize and size < self.minsize:
+                    self.release_conn(response)
                     self.log.warning(
                         "File size smaller than allowed minimum (%s < %s)",
                         size, self.minsize)
                     return False
                 if self.maxsize and size > self.maxsize:
+                    self.release_conn(response)
                     self.log.warning(
                         "File size larger than allowed maximum (%s > %s)",
                         size, self.maxsize)
                     return False
@@ -280,6 +294,18 @@ class HttpDownloader(DownloaderBase):
 
             return True
 
+    def release_conn(self, response):
+        """Release connection back to pool by consuming response body"""
+        try:
+            for _ in response.iter_content(self.chunk_size):
+                pass
+        except (RequestException, SSLError, OpenSSLError) as exc:
+            print()
+            self.log.debug(
+                "Unable to consume response body (%s: %s); "
+                "closing the connection anyway", exc.__class__.__name__, exc)
+            response.close()
+
     @staticmethod
     def receive(fp, content, bytes_total, bytes_start):
         write = fp.write
diff --git a/gallery_dl/extractor/2chen.py b/gallery_dl/extractor/2chen.py
index d9674d8..f142690 100644
--- a/gallery_dl/extractor/2chen.py
+++ b/gallery_dl/extractor/2chen.py
@@ -4,35 +4,46 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extractors for https://2chen.moe/"""
+"""Extractors for https://sturdychan.help/"""
 
 from .common import Extractor, Message
 from .. import text
 
+BASE_PATTERN = r"(?:https?://)?(?:sturdychan.help|2chen\.(?:moe|club))"
+
 
 class _2chenThreadExtractor(Extractor):
     """Extractor for 2chen threads"""
     category = "2chen"
     subcategory = "thread"
+    root = "https://sturdychan.help"
     directory_fmt = ("{category}", "{board}", "{thread} {title}")
     filename_fmt = "{time} {filename}.{extension}"
     archive_fmt = "{board}_{thread}_{hash}_{time}"
-    pattern = r"(?:https?://)?2chen\.(?:moe|club)/([^/?#]+)/(\d+)"
+    pattern = BASE_PATTERN + r"/([^/?#]+)/(\d+)"
     test = (
-        ("https://2chen.moe/tv/496715", {
-            "pattern": r"https://2chen\.su/assets/images/src/\w{40}\.\w+$",
+        ("https://sturdychan.help/tv/268929", {
+            "pattern": r"https://sturdychan\.help/assets/images"
+                       r"/src/\w{40}\.\w+$",
             "count": ">= 179",
+            "keyword": {
+                "board": "tv",
+                "date": "type:datetime",
+                "hash": r"re:[0-9a-f]{40}",
+                "name": "Anonymous",
+                "no": r"re:\d+",
+                "thread": "268929",
+                "time": int,
+                "title": "「/ttg/ #118: 🇧🇷 edition」",
+                "url": str,
+            },
         }),
-        ("https://2chen.club/tv/1", {
-            "count": 5,
-        }),
-        # 404
+        ("https://2chen.club/tv/1"),
         ("https://2chen.moe/jp/303786"),
     )
 
     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.root = text.root_from_url(match.group(0))
         self.board, self.thread = match.groups()
 
     def items(self):
@@ -88,9 +99,10 @@ class _2chenBoardExtractor(Extractor):
     """Extractor for 2chen boards"""
     category = "2chen"
     subcategory = "board"
-    pattern = r"(?:https?://)?2chen\.(?:moe|club)/([^/?#]+)(?:/catalog|/?$)"
+    root = "https://sturdychan.help"
+    pattern = BASE_PATTERN + r"/([^/?#]+)(?:/catalog|/?$)"
     test = (
-        ("https://2chen.moe/co/", {
+        ("https://sturdychan.help/co/", {
             "pattern": _2chenThreadExtractor.pattern
         }),
         ("https://2chen.moe/co"),
@@ -100,7 +112,6 @@ class _2chenBoardExtractor(Extractor):
 
     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.root = text.root_from_url(match.group(0))
         self.board = match.group(1)
 
     def items(self):
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 553a110..9841ca7 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -74,6 +74,7 @@ modules = [
     "instagram",
     "issuu",
     "itaku",
+    "itchio",
     "kabeuchi",
     "keenspot",
     "kemonoparty",
@@ -93,7 +94,6 @@ modules = [
     "mangapark",
     "mangasee",
     "mangoxo",
-    "mememuseum",
     "misskey",
     "myhentaigallery",
     "myportfolio",
@@ -133,6 +133,7 @@ modules = [
     "seiga",
     "senmanga",
     "sexcom",
+    "shimmie2",
     "simplyhentai",
     "skeb",
     "slickpic",
@@ -156,6 +157,7 @@ modules = [
     "urlshortener",
     "vanillarock",
     "vichan",
+    "vipergirls",
     "vk",
     "vsco",
     "wallhaven",
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
index 1469aad..d8cc51d 100644
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@@ -81,10 +81,13 @@ class BehanceGalleryExtractor(BehanceExtractor):
         ("https://www.behance.net/gallery/88276087/Audi-R8-RWD", {
             "count": 20,
             "url": "6bebff0d37f85349f9ad28bd8b76fd66627c1e2f",
+            "pattern": r"https://mir-s3-cdn-cf\.behance\.net/project_modules"
+                       r"/source/[0-9a-f]+.[0-9a-f]+\.jpg"
         }),
         # 'video' modules (#1282)
         ("https://www.behance.net/gallery/101185577/COLCCI", {
-            "pattern": r"ytdl:https://cdn-prod-ccv\.adobe\.com/",
+            "pattern": r"https://cdn-prod-ccv\.adobe\.com/\w+"
+                       r"/rend/\w+_720\.mp4\?",
             "count": 3,
         }),
     )
@@ -129,26 +132,35 @@ class BehanceGalleryExtractor(BehanceExtractor):
         append = result.append
 
         for module in data["modules"]:
-            mtype = module["type"]
+            mtype = module["__typename"]
 
-            if mtype == "image":
-                url = module["sizes"]["original"]
+            if mtype == "ImageModule":
+                url = module["imageSizes"]["size_original"]["url"]
                 append((url, module))
 
-            elif mtype == "video":
-                page = self.request(module["src"]).text
-                url = text.extr(page, '", "<")),
+            "description": text.unescape(extr(
+                'id="gdesc_text"', '<').partition(">")[2]),
+            "categories": text.split_html(extr(
+                'id="cnt_cats"', '</ul>'))[1::2],
+            "tags": text.split_html(extr(
+                'id="cnt_tags"', '</ul>'))[1::2],
             "count": text.parse_int(extr(' 1 of ', ' pics"')),
         }
diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py
index d57ec89..df4ff26 100644
--- a/gallery_dl/extractor/imagehosts.py
+++ b/gallery_dl/extractor/imagehosts.py
@@ -135,6 +135,29 @@ class ImxtoImageExtractor(ImagehostImageExtractor):
     }
 
 
+class ImxtoGalleryExtractor(ImagehostImageExtractor):
+    """Extractor for image galleries from imx.to"""
+    category = "imxto"
+    subcategory = "gallery"
+    pattern = r"(?:https?://)?(?:www\.)?(imx\.to/g/([^/?#]+))"
+    test = ("https://imx.to/g/ozdy", {
+        "pattern": ImxtoImageExtractor.pattern,
+        "keyword": {"title": "untitled gallery"},
+        "count": 40,
+    })
+
+    def items(self):
+        page = self.request(self.page_url).text
+        title, pos = text.extract(page, '<div class="title', '<')
")[2]).strip(), + } + + for url in text.extract_iter(page, "= 25", "range": "1-25", }) diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index ec46ca3..404f296 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -71,8 +71,11 @@ class OAuthBase(Extractor): browser = self.config("browser", True) if browser: - import webbrowser - browser = webbrowser.get() + try: + import webbrowser + browser = webbrowser.get() + except Exception: + browser = None if browser and browser.open(url): name = getattr(browser, "name", "Browser") diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py index 56e3b39..f0a50c8 100644 --- a/gallery_dl/extractor/paheal.py +++ b/gallery_dl/extractor/paheal.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2022 Mike Fährmann +# Copyright 2018-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -14,7 +14,7 @@ from .. import text class PahealExtractor(Extractor): """Base class for paheal extractors""" - basecategory = "booru" + basecategory = "shimmie2" category = "paheal" filename_fmt = "{category}_{id}_{md5}.{extension}" archive_fmt = "{id}" diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index a17518f..b704031 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -596,6 +596,9 @@ class PixivSearchExtractor(PixivExtractor): sort_map = { "date": "date_asc", "date_d": "date_desc", + "popular_d": "popular_desc", + "popular_male_d": "popular_male_desc", + "popular_female_d": "popular_female_desc", } try: self.sort = sort = sort_map[sort] @@ -670,7 +673,7 @@ class PixivPixivisionExtractor(PixivExtractor): def works(self): return ( - self.api.illust_detail(illust_id) + self.api.illust_detail(illust_id.partition("?")[0]) for illust_id in util.unique_sequence(text.extract_iter( self.page, ' i.redd.it + (("https://preview.redd.it/00af44lpn0u51.jpg?width=960&crop=smart" + "&auto=webp&v=enabled&s=dbca8ab84033f4a433772d9c15dbe0429c74e8ac"), { + "pattern": r"^https://i\.redd\.it/00af44lpn0u51\.jpg$" + }), ) + def __init__(self, match): + Extractor.__init__(self, match) + domain = match.group(1) + self.path = match.group(2) + if domain == "preview.redd.it": + self.domain = "i.redd.it" + self.query = "" + else: + self.domain = domain + self.query = match.group(3) or "" + def items(self): - data = text.nameext_from_url(self.url) + url = "https://{}/{}{}".format(self.domain, self.path, self.query) + data = text.nameext_from_url(url) yield Message.Directory, data - yield Message.Url, self.url, data + yield Message.Url, url, data class RedditAPI(): @@ -459,6 +476,9 @@ class RedditAPI(): def _pagination(self, endpoint, params): id_min = self._parse_id("id-min", 0) id_max = self._parse_id("id-max", float("inf")) + if id_max == 2147483647: + self.log.debug("Ignoring 'id-max' setting \"zik0zj\"") + id_max = float("inf") date_min, date_max = self.extractor._get_date_min_max(0, 253402210800) while True: diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index ea4cf43..f36051b 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -13,6 +13,7 @@ from .common import Message from .. import text, util, exception from ..cache import cache import collections +import re BASE_PATTERN = r"(?:https?://)?" 
\ r"(?:(?:chan|beta|black|white)\.sankakucomplex\.com|sankaku\.app)" \ @@ -101,6 +102,11 @@ class SankakuTagExtractor(SankakuExtractor): # match arbitrary query parameters ("https://chan.sankakucomplex.com" "/?tags=marie_rose&page=98&next=3874906&commit=Search"), + # 'date:' tags (#1790) + ("https://chan.sankakucomplex.com/?tags=date:2023-03-20", { + "range": "1", + "count": 1, + }), ) def __init__(self, match): @@ -108,6 +114,15 @@ class SankakuTagExtractor(SankakuExtractor): query = text.parse_query(match.group(1)) self.tags = text.unquote(query.get("tags", "").replace("+", " ")) + if "date:" in self.tags: + # rewrite 'date:' tags (#1790) + self.tags = re.sub( + r"date:(\d\d)[.-](\d\d)[.-](\d\d\d\d)", + r"date:\3.\2.\1", self.tags) + self.tags = re.sub( + r"date:(\d\d\d\d)[.-](\d\d)[.-](\d\d)", + r"date:\1.\2.\3", self.tags) + def metadata(self): return {"search_tags": self.tags} @@ -153,7 +168,7 @@ class SankakuPostExtractor(SankakuExtractor): """Extractor for single posts from sankaku.app""" subcategory = "post" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/post/show/(\d+)" + pattern = BASE_PATTERN + r"/post/show/([0-9a-f]+)" test = ( ("https://sankaku.app/post/show/360451", { "content": "5e255713cbf0a8e0801dc423563c34d896bb9229", @@ -181,6 +196,17 @@ class SankakuPostExtractor(SankakuExtractor): "tags_general": ["key(mangaka)", "key(mangaka)"], }, }), + # md5 hexdigest instead of ID (#3952) + (("https://chan.sankakucomplex.com/post/show" + "/f8ba89043078f0e4be2d9c46550b840a"), { + "pattern": r"https://s\.sankakucomplex\.com" + r"/data/f8/ba/f8ba89043078f0e4be2d9c46550b840a\.jpg", + "count": 1, + "keyword": { + "id": 33195194, + "md5": "f8ba89043078f0e4be2d9c46550b840a", + }, + }), ("https://chan.sankakucomplex.com/post/show/360451"), ("https://chan.sankakucomplex.com/ja/post/show/360451"), ("https://beta.sankakucomplex.com/post/show/360451"), @@ -248,7 +274,7 @@ class SankakuAPI(): "lang" : "en", "page" : "1", "limit": "1", - "tags" : "id_range:" + post_id, + "tags" : ("md5:" if len(post_id) == 32 else "id_range:") + post_id, } return self._call("/posts", params) diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py new file mode 100644 index 0000000..285cd8f --- /dev/null +++ b/gallery_dl/extractor/shimmie2.py @@ -0,0 +1,326 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for Shimmie2 instances""" + +from .common import BaseExtractor, Message +from .. import text + + +class Shimmie2Extractor(BaseExtractor): + """Base class for shimmie2 extractors""" + basecategory = "shimmie2" + filename_fmt = "{category}_{id}{md5:?_//}.{extension}" + archive_fmt = "{id}" + + def __init__(self, match): + BaseExtractor.__init__(self, match) + + try: + instance = INSTANCES[self.category] + except KeyError: + pass + else: + cookies = instance.get("cookies") + if cookies: + domain = self.root.rpartition("/")[2] + self._update_cookies_dict(cookies, domain=domain) + file_url = instance.get("file_url") + if file_url: + self.file_url_fmt = file_url + + def items(self): + data = self.metadata() + + for post in self.posts(): + + for key in ("id", "width", "height"): + post[key] = text.parse_int(post[key]) + post["tags"] = text.unquote(post["tags"]) + post.update(data) + + url = post["file_url"] + if "/index.php?" 
+            if "/index.php?" in url:
+                post["filename"], _, post["extension"] = \
+                    url.rpartition("/")[2].rpartition(".")
+            else:
+                text.nameext_from_url(url, post)
+
+            yield Message.Directory, post
+            yield Message.Url, url, post
+
+    def metadata(self):
+        """Return general metadata"""
+        return ()
+
+    def posts(self):
+        """Return an iterable containing data of all relevant posts"""
+        return ()
+
+
+INSTANCES = {
+    "mememuseum": {
+        "root": "https://meme.museum",
+        "pattern": r"meme\.museum",
+    },
+    "loudbooru": {
+        "root": "https://loudbooru.com",
+        "pattern": r"loudbooru\.com",
+        "cookies": {"ui-tnc-agreed": "true"},
+    },
+    "giantessbooru": {
+        "root": "https://giantessbooru.com",
+        "pattern": r"giantessbooru\.com",
+        "cookies": {"agreed": "true"},
+    },
+    "tentaclerape": {
+        "root": "https://tentaclerape.net",
+        "pattern": r"tentaclerape\.net",
+    },
+    "cavemanon": {
+        "root": "https://booru.cavemanon.xyz",
+        "pattern": r"booru\.cavemanon\.xyz",
+        "file_url": "{0}/index.php?q=image/{2}.{4}"
+    },
+}
+
+BASE_PATTERN = Shimmie2Extractor.update(INSTANCES) + r"/(?:index\.php\?q=)?"
+
+
+class Shimmie2TagExtractor(Shimmie2Extractor):
+    """Extractor for shimmie2 posts by tag search"""
+    subcategory = "tag"
+    directory_fmt = ("{category}", "{search_tags}")
+    file_url_fmt = "{}/_images/{}/{}%20-%20{}.{}"
+    pattern = BASE_PATTERN + r"post/list/([^/?#]+)(?:/(\d+))?()"
+    test = (
+        ("https://meme.museum/post/list/animated/1", {
+            "pattern": r"https://meme\.museum/_images/\w+/\d+%20-%20",
+            "count": ">= 30"
+        }),
+        ("https://loudbooru.com/post/list/original_character/1", {
+            "pattern": r"https://loudbooru\.com/_images/[0-9a-f]{32}/\d+",
+            "range": "1-100",
+            "count": 100,
+        }),
+        ("https://giantessbooru.com/post/list/smiling/1", {
+            "pattern": r"https://giantessbooru\.com/_images/[0-9a-f]{32}/\d+",
+            "range": "1-100",
+            "count": 100,
+        }),
+        ("https://tentaclerape.net/post/list/comic/1", {
+            "pattern": r"https://tentaclerape\.net/_images/[0-9a-f]{32}/\d+",
+            "range": "1-100",
+            "count": 100,
+        }),
+        ("https://booru.cavemanon.xyz/index.php?q=post/list/Amber/1", {
+            "pattern": r"https://booru\.cavemanon\.xyz"
+                       r"/index\.php\?q=image/\d+\.\w+",
+            "range": "1-100",
+            "count": 100,
+        }),
+    )
+
+    def __init__(self, match):
+        Shimmie2Extractor.__init__(self, match)
+        lastindex = match.lastindex
+        self.tags = text.unquote(match.group(lastindex-2))
+        self.page = match.group(lastindex-1)
+
+    def metadata(self):
+        return {"search_tags": self.tags}
+
+    def posts(self):
+        pnum = text.parse_int(self.page, 1)
+        file_url_fmt = self.file_url_fmt.format
+
+        init = True
+        mime = ""
+
+        while True:
+            url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum)
+            page = self.request(url).text
+            extr = text.extract_from(page)
+
+            if init:
+                init = False
+                has_mime = ("data-mime='" in page)
+                has_pid = ("data-post-id='" in page)
+
+            while True:
+                if has_mime:
+                    mime = extr("data-mime='", "'")
+                if has_pid:
+                    pid = extr("data-post-id='", "'")
+                else:
+                    pid = extr("href='/post/view/", "?")
+
+                if not pid:
+                    break
+
+                tags, dimensions, size = extr("title='", "'").split(" // ")
+                width, _, height = dimensions.partition("x")
+                md5 = extr("/_thumbs/", "/")
+
+                yield {
+                    "file_url": file_url_fmt(
+                        self.root, md5, pid, text.quote(tags),
+                        mime.rpartition("/")[2] if mime else "jpg"),
+                    "id": pid,
+                    "md5": md5,
+                    "tags": tags,
+                    "width": width,
+                    "height": height,
+                    "size": text.parse_bytes(size[:-1]),
+                }
+
+            pnum += 1
+            if not extr(">Next<", ">"):
+                if not extr("/{}'>{}<".format(pnum, pnum), ">"):
+                    return
+
+
+class Shimmie2PostExtractor(Shimmie2Extractor):
+ """Extractor for single shimmie2 posts""" + subcategory = "post" + pattern = BASE_PATTERN + r"post/view/(\d+)" + test = ( + ("https://meme.museum/post/view/10243", { + "pattern": r"https://meme\.museum/_images/105febebcd5ca791ee332adc" + r"49971f78/10243%20-%20g%20beard%20open_source%20richar" + r"d_stallman%20stallman%20tagme%20text\.jpg", + "content": "45565f3f141fc960a8ae1168b80e718a494c52d2", + "keyword": { + "extension": "jpg", + "file_url": "https://meme.museum/_images/105febebcd5ca791ee332" + "adc49971f78/10243%20-%20g%20beard%20open_source%2" + "0richard_stallman%20stallman%20tagme%20text.jpg", + "filename": "10243 - g beard open_source richard_stallman " + "stallman tagme text", + "height": 451, + "id": 10243, + "md5": "105febebcd5ca791ee332adc49971f78", + "size": 0, + "subcategory": "post", + "tags": "/g/ beard open_source " + "richard_stallman stallman tagme text", + "width": 480, + }, + }), + ("https://loudbooru.com/post/view/33828", { + "pattern": r"https://loudbooru\.com/_images/.+\.png", + "content": "a4755f787ba23ae2aa297a46810f802ca9032739", + "keyword": { + "extension": "png", + "file_url": "https://loudbooru.com/_images/ca2638d903c86e8337f" + "e9aeb4974be88/33828%20-%202020%20artist%3Astikyfi" + "nkaz%20character%3Alisa_loud%20cover%20fanfiction" + "%3Aplatz_eins%20frowning%20half-closed_eyes%20sol" + "o%20text%20title_card.png", + "filename": "33828 - 2020 artist:stikyfinkaz character:lisa_" + "loud cover fanfiction:platz_eins frowning " + "half-closed_eyes solo text title_card", + "height": 1920, + "id": 33828, + "md5": "ca2638d903c86e8337fe9aeb4974be88", + "tags": "2020 artist:stikyfinkaz character:lisa_loud cover " + "fanfiction:platz_eins frowning half-closed_eyes " + "solo text title_card", + "width": 1078, + }, + }), + ("https://giantessbooru.com/post/view/41", { + "pattern": r"https://giantessbooru\.com/_images" + r"/3f67e1986496806b7b14ff3e82ac5af4/41\.jpg", + "content": "79115ed309d1f4e82e7bead6948760e889139c91", + "keyword": { + "extension": "jpg", + "file_url": "https://giantessbooru.com/_images" + "/3f67e1986496806b7b14ff3e82ac5af4/41.jpg", + "filename": "41", + "height": 0, + "id": 41, + "md5": "3f67e1986496806b7b14ff3e82ac5af4", + "size": 0, + "tags": "anime bare_midriff color drawing gentle giantess " + "karbo looking_at_tinies negeyari outdoors smiling " + "snake_girl white_hair", + "width": 0 + + + }, + }), + ("https://tentaclerape.net/post/view/10", { + "pattern": r"https://tentaclerape\.net/\./index\.php" + r"\?q=/image/10\.jpg", + "content": "d0fd8f0f6517a76cb5e23ba09f3844950bf2c516", + "keyword": { + "extension": "jpg", + "file_url": "https://tentaclerape.net/./index.php" + "?q=/image/10.jpg", + "filename": "10", + "height": 427, + "id": 10, + "md5": "945db71eeccaef82ce44b77564260c0b", + "size": 0, + "subcategory": "post", + "tags": "Deviant_Art Pet Tentacle artist_sche blonde_hair " + "blouse boots green_eyes highheels leash miniskirt " + "octopus schoolgirl white_skin willing", + "width": 300, + }, + }), + # video + ("https://tentaclerape.net/post/view/91267", { + "pattern": r"https://tentaclerape\.net/\./index\.php" + r"\?q=/image/91267\.mp4", + }), + ("https://booru.cavemanon.xyz/index.php?q=post/view/8335", { + "pattern": r"https://booru\.cavemanon\.xyz" + r"/index\.php\?q=image/8335\.png", + "content": "7158f7e4abbbf143bad5835eb93dbe4d68c1d4ab", + "keyword": { + "extension": "png", + "file_url": "https://booru.cavemanon.xyz" + "/index.php?q=image/8335.png", + "filename": "8335", + "height": 460, + "id": 8335, + "md5": "", + "size": 0, + 
"tags": "Color Fang", + "width": 459, + }, + }), + ) + + def __init__(self, match): + Shimmie2Extractor.__init__(self, match) + self.post_id = match.group(match.lastindex) + + def posts(self): + url = "{}/post/view/{}".format(self.root, self.post_id) + extr = text.extract_from(self.request(url).text) + + post = { + "id" : self.post_id, + "tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"), + "md5" : extr("/_thumbs/", "/"), + "file_url": self.root + ( + extr("id='main_image' src='", "'") or + extr("").partition( + " ")[0].strip("\"'"), + "size" : 0, + } + + if not post["md5"]: + post["md5"] = text.extr(post["file_url"], "/_images/", "/") + + return (post,) diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 155db1e..b45609d 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2022 Mike Fährmann +# Copyright 2016-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,7 +10,7 @@ from .common import Extractor, Message from .. import text, oauth, exception -from datetime import datetime, timedelta +from datetime import datetime, date, timedelta import re @@ -269,7 +269,7 @@ class TumblrExtractor(Extractor): class TumblrUserExtractor(TumblrExtractor): - """Extractor for all images from a tumblr-user""" + """Extractor for a Tumblr user's posts""" subcategory = "user" pattern = BASE_PATTERN + r"(?:/page/\d+|/archive)?/?$" test = ( @@ -307,6 +307,16 @@ class TumblrUserExtractor(TumblrExtractor): "options": (("date-min", "201804"), ("date-max", "201805"), ("date-format", "%Y%m")) }), + # pagination with 'date-max' (#2191) and 'api-key' + ("https://donttrustthetits.tumblr.com/", { + "options": ( + ("access-token", None), + ("original", False), + ("date-max", "2015-04-25T00:00:00"), + ("date-min", "2015-04-01T00:00:00"), + ), + "count": 316, + }), ("https://demo.tumblr.com/page/2"), ("https://demo.tumblr.com/archive"), ("tumblr:http://www.b-authentique.com/"), @@ -321,7 +331,7 @@ class TumblrUserExtractor(TumblrExtractor): class TumblrPostExtractor(TumblrExtractor): - """Extractor for images from a single post on tumblr""" + """Extractor for a single Tumblr post""" subcategory = "post" pattern = BASE_PATTERN + r"/(?:post/|image/)?(\d+)" test = ( @@ -389,7 +399,7 @@ class TumblrPostExtractor(TumblrExtractor): class TumblrTagExtractor(TumblrExtractor): - """Extractor for images from a tumblr-user by tag""" + """Extractor for Tumblr user's posts by tag""" subcategory = "tag" pattern = BASE_PATTERN + r"/tagged/([^/?#]+)" test = ( @@ -411,8 +421,37 @@ class TumblrTagExtractor(TumblrExtractor): return self.api.posts(self.blog, {"tag": self.tag}) +class TumblrDayExtractor(TumblrExtractor): + """Extractor for Tumblr user's posts by day""" + subcategory = "day" + pattern = BASE_PATTERN + r"/day/(\d\d\d\d/\d\d/\d\d)" + test = ( + ("https://mikf123.tumblr.com/day/2018/01/05", { + "pattern": r"https://64\.media\.tumblr\.com" + r"/1a2be8c63f1df58abd2622861696c72a" + r"/tumblr_ozm9nqst9t1wgha4yo1_1280\.jpg", + "keyword": {"id": 169341068404}, + "count": 1, + }), + ("https://www.tumblr.com/blog/view/mikf123/day/2018/01/05"), + ("https://www.tumblr.com/blog/mikf123/day/2018/01/05"), + ("https://www.tumblr.com/mikf123/day/2018/01/05"), + ) + + def __init__(self, match): + TumblrExtractor.__init__(self, match) + year, month, day = match.group(4).split("/") + self.date_min = ts = ( + # 
+            # 719163 == date(1970, 1, 1).toordinal()
+            date(int(year), int(month), int(day)).toordinal() - 719163) * 86400
+        self.api.before = ts + 86400
+
+    def posts(self):
+        return self.api.posts(self.blog, {})
+
+
 class TumblrLikesExtractor(TumblrExtractor):
-    """Extractor for images from a tumblr-user's liked posts"""
+    """Extractor for a Tumblr user's liked posts"""
     subcategory = "likes"
     directory_fmt = ("{category}", "{blog_name}", "likes")
     archive_fmt = "f_{blog[name]}_{id}_{num}"
@@ -431,7 +470,11 @@ class TumblrLikesExtractor(TumblrExtractor):
 
 
 class TumblrAPI(oauth.OAuth1API):
-    """Minimal interface for the Tumblr API v2"""
+    """Interface for the Tumblr API v2
+
+    https://github.com/tumblr/docs/blob/master/api.md
+    """
+
+    ROOT = "https://api.tumblr.com"
     API_KEY = "O3hU2tMi5e4Qs5t3vezEi6L0qRORJ5y9oUpSGsrWu8iA3UCc3B"
     API_SECRET = "sFdsK3PDdP2QpYMRAoq0oDnw0sFS24XigXmdfnaeNZpJpqAn03"
     BLOG_CACHE = {}
@@ -442,55 +485,46 @@ class TumblrAPI(oauth.OAuth1API):
 
     def info(self, blog):
         """Return general information about a blog"""
-        if blog not in self.BLOG_CACHE:
-            self.BLOG_CACHE[blog] = self._call(blog, "info", {})["blog"]
-        return self.BLOG_CACHE[blog]
+        try:
+            return self.BLOG_CACHE[blog]
+        except KeyError:
+            endpoint = "/v2/blog/{}/info".format(blog)
+            params = {"api_key": self.api_key} if self.api_key else None
+            self.BLOG_CACHE[blog] = blog = self._call(endpoint, params)["blog"]
+            return blog
 
     def avatar(self, blog, size="512"):
         """Retrieve a blog avatar"""
         if self.api_key:
-            url_fmt = "https://api.tumblr.com/v2/blog/{}/avatar/{}?api_key={}"
-            return url_fmt.format(blog, size, self.api_key)
+            return "{}/v2/blog/{}/avatar/{}?api_key={}".format(
+                self.ROOT, blog, size, self.api_key)
+        endpoint = "/v2/blog/{}/avatar".format(blog)
         params = {"size": size}
-        data = self._call(blog, "avatar", params, allow_redirects=False)
-        return data["avatar_url"]
+        return self._call(
+            endpoint, params, allow_redirects=False)["avatar_url"]
 
     def posts(self, blog, params):
         """Retrieve published posts"""
-        params["offset"] = self.extractor.config("offset") or 0
-        params["limit"] = 50
+        params["offset"] = self.extractor.config("offset")
+        params["limit"] = "50"
         params["reblog_info"] = "true"
+        params["type"] = self.posts_type
+        params["before"] = self.before
 
-        if self.posts_type:
-            params["type"] = self.posts_type
-        if self.before:
-            params["before"] = self.before
+        if self.before and params["offset"]:
+            self.log.warning("'offset' and 'date-max' cannot be used together")
 
-        while True:
-            data = self._call(blog, "posts", params)
-            self.BLOG_CACHE[blog] = data["blog"]
-            yield from data["posts"]
-            params["offset"] += params["limit"]
-            if params["offset"] >= data["total_posts"]:
-                return
+        return self._pagination(blog, "/posts", params, cache=True)
 
     def likes(self, blog):
         """Retrieve liked posts"""
         params = {"limit": "50", "before": self.before}
-        while True:
-            posts = self._call(blog, "likes", params)["liked_posts"]
-            if not posts:
-                return
-            yield from posts
-            params["before"] = posts[-1]["liked_timestamp"]
+        return self._pagination(blog, "/likes", params, key="liked_posts")
 
-    def _call(self, blog, endpoint, params, **kwargs):
-        if self.api_key:
-            params["api_key"] = self.api_key
-        url = "https://api.tumblr.com/v2/blog/{}/{}".format(
-            blog, endpoint)
+    def _call(self, endpoint, params, **kwargs):
+        url = self.ROOT + endpoint
+        kwargs["params"] = params
 
-        response = self.request(url, params=params, **kwargs)
+        response = self.request(url, **kwargs)
 
         try:
             data = response.json()
@@ -535,7 +569,7 @@ class TumblrAPI(oauth.OAuth1API):
 
             if self.extractor.config("ratelimit") == "wait":
                 self.extractor.wait(seconds=reset)
-                return self._call(blog, endpoint, params)
+                return self._call(endpoint, params, **kwargs)
 
             t = (datetime.now() + timedelta(seconds=float(reset))).time()
             raise exception.StopExtraction(
@@ -547,6 +581,29 @@ class TumblrAPI(oauth.OAuth1API):
             if reset:
                 self.log.info("Hourly API rate limit exceeded")
                 self.extractor.wait(seconds=reset)
-                return self._call(blog, endpoint, params)
+                return self._call(endpoint, params, **kwargs)
 
         raise exception.StopExtraction(data)
+
+    def _pagination(self, blog, endpoint, params, key="posts", cache=False):
+        endpoint = "/v2/blog/{}{}".format(blog, endpoint)
+        if self.api_key:
+            params["api_key"] = self.api_key
+
+        while True:
+            data = self._call(endpoint, params)
+
+            if cache:
+                self.BLOG_CACHE[blog] = data["blog"]
+                cache = False
+
+            yield from data[key]
+
+            try:
+                endpoint = data["_links"]["next"]["href"]
+            except KeyError:
+                return
+
+            params = None
+            if self.api_key:
+                endpoint += "&api_key=" + self.api_key
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 2ccc7e5..5e68f13 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -45,7 +45,8 @@ class TwitterExtractor(Extractor):
         if not self.config("transform", True):
             self._transform_user = util.identity
             self._transform_tweet = util.identity
-        self._user = self._user_obj = None
+        self._user = None
+        self._user_obj = None
         self._user_cache = {}
         self._init_sizes()
 
@@ -769,6 +770,13 @@ class TwitterTweetExtractor(TwitterExtractor):
             "pattern": r"https://pbs\.twimg\.com/media/EaK.+=jpg",
             "count": 4,
         }),
+        # different 'user' and 'author' in quoted Tweet (#3922)
+        ("https://twitter.com/web/status/1644907989109751810", {
+            "keyword": {
+                "author": {"id": 321629993 , "name": "Cakes_Comics"},
+                "user"  : {"id": 718928225360080897, "name": "StobiesGalaxy"},
+            },
+        }),
         # TwitPic embeds (#579)
         ("https://twitter.com/i/web/status/112900228289540096", {
             "options": (("twitpic", True), ("cards", False)),
@@ -897,7 +905,8 @@ Your reaction.""",
         for tweet in self.api.tweet_detail(tweet_id):
             if tweet["rest_id"] == tweet_id or \
                     tweet.get("_retweet_id_str") == tweet_id:
-                self._assign_user(tweet["core"]["user_results"]["result"])
+                if self._user_obj is None:
+                    self._assign_user(tweet["core"]["user_results"]["result"])
                 tweets.append(tweet)
 
                 tweet_id = tweet["legacy"].get("quoted_status_id_str")
@@ -1561,9 +1570,9 @@ class TwitterAPI():
 
             if esw("tweet-"):
                 tweets.append(entry)
-            elif esw("homeConversation-"):
-                tweets.extend(entry["content"]["items"])
-            elif esw("conversationthread-"):
+            elif esw(("homeConversation-",
+                      "profile-conversation-",
+                      "conversationthread-")):
                 tweets.extend(entry["content"]["items"])
             elif esw("tombstone-"):
                 item = entry["content"]["itemContent"]
diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py
new file mode 100644
index 0000000..1cebdf7
--- /dev/null
+++ b/gallery_dl/extractor/vipergirls.py
@@ -0,0 +1,108 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2023 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://vipergirls.to/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?vipergirls\.to"
+
+
+class VipergirlsExtractor(Extractor):
+    """Base class for vipergirls extractors"""
+    category = "vipergirls"
+    root = "https://vipergirls.to"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.session.headers["Referer"] = self.root
+
+    def items(self):
+        for html in self.posts():
+
+            pos = html.find('")[2].strip()),
+            }
+
+            yield Message.Directory, data
+            for href in text.extract_iter(html, '', '')
+
+        url = text.extr(page, '