Diffstat (limited to 'gallery_dl/extractor')
 gallery_dl/extractor/__init__.py    |   3
 gallery_dl/extractor/booru.py       |   6
 gallery_dl/extractor/bunkr.py       |   3
 gallery_dl/extractor/common.py      |  23
 gallery_dl/extractor/danbooru.py    |  25
 gallery_dl/extractor/deviantart.py  | 185
 gallery_dl/extractor/fantia.py      |  17
 gallery_dl/extractor/generic.py     |   4
 gallery_dl/extractor/hiperdex.py    |  27
 gallery_dl/extractor/hotleak.py     |  13
 gallery_dl/extractor/instagram.py   |  47
 gallery_dl/extractor/kemonoparty.py |  23
 gallery_dl/extractor/lexica.py      | 104
 gallery_dl/extractor/mastodon.py    |  19
 gallery_dl/extractor/nudecollect.py | 142
 gallery_dl/extractor/oauth.py       | 163
 gallery_dl/extractor/philomena.py   |  11
 gallery_dl/extractor/sankaku.py     |  10
 gallery_dl/extractor/twitter.py     | 127
 gallery_dl/extractor/wikifeet.py    | 118
 20 files changed, 844 insertions(+), 226 deletions(-)
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index f26f6a9..6140c2c 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -77,6 +77,7 @@ modules = [
     "kemonoparty",
     "khinsider",
     "komikcast",
+    "lexica",
     "lightroom",
     "lineblog",
     "livedoor",
@@ -102,6 +103,7 @@ modules = [
     "nitter",
     "nozomi",
     "nsfwalbum",
+    "nudecollect",
     "paheal",
     "patreon",
     "philomena",
@@ -158,6 +160,7 @@ modules = [
     "webtoons",
     "weibo",
     "wikiart",
+    "wikifeet",
     "xhamster",
     "xvideos",
     "zerochan",
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index 0d7d13d..cbd0e07 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2015-2022 Mike Fährmann
+# Copyright 2015-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -27,6 +27,10 @@ class BooruExtractor(BaseExtractor):
         notes = self.config("notes", False)
         fetch_html = tags or notes
 
+        url_key = self.config("url")
+        if url_key:
+            self._file_url = operator.itemgetter(url_key)
+
         for post in self.posts():
             try:
                 url = self._file_url(post)
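A note on the `url` option introduced in booru.py above: when set, per-post file URLs are looked up with `operator.itemgetter` instead of the extractor's `_file_url` method. A minimal sketch of the mechanism, with a made-up post object:

```python
import operator

# hypothetical post dict, as a booru API might return it
post = {"file_url": "https://example.org/full.png",
        "sample_url": "https://example.org/sample.jpg"}

# with "url": "sample_url" in the configuration, the extractor's
# _file_url effectively becomes a plain key lookup:
_file_url = operator.itemgetter("sample_url")
print(_file_url(post))  # https://example.org/sample.jpg
```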
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 8283fbc..1c339a9 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -75,7 +75,8 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
         headers = {"Referer": root.replace("://", "://stream.", 1) + "/"}
         for file in files:
             if file["file"].endswith(
-                    (".mp4", ".m4v", ".mov", ".webm", ".zip", ".rar", ".7z")):
+                    (".mp4", ".m4v", ".mov", ".webm", ".mkv", ".ts",
+                     ".zip", ".rar", ".7z")):
                 file["_http_headers"] = headers
                 file["file"] = file["file"].replace(
                     "://cdn", "://media-files", 1)
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index ad766da..4cefa1c 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2014-2022 Mike Fährmann
+# Copyright 2014-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -36,6 +36,7 @@ class Extractor():
     browser = None
     root = ""
     test = None
+    finalize = None
     request_interval = 0.0
     request_interval_min = 0.0
     request_timestamp = 0.0
@@ -44,7 +45,6 @@ class Extractor():
     def __init__(self, match):
         self.log = logging.getLogger(self.category)
         self.url = match.string
-        self.finalize = None
 
         if self.basecategory:
             self.config = self._config_shared
@@ -53,6 +53,7 @@ class Extractor():
         self._parentdir = ""
 
         self._write_pages = self.config("write-pages", False)
+        self._retry_codes = self.config("retry-codes")
        self._retries = self.config("retries", 4)
         self._timeout = self.config("timeout", 30)
         self._verify = self.config("verify", True)
@@ -64,6 +65,8 @@ class Extractor():
 
         if self._retries < 0:
             self._retries = float("inf")
+        if not self._retry_codes:
+            self._retry_codes = ()
 
         self._init_session()
         self._init_cookies()
@@ -103,12 +106,15 @@ class Extractor():
             values[:0] = config.accumulate((self.subcategory,), key, conf=conf)
         return values
 
-    def request(self, url, *, method="GET", session=None, retries=None,
-                encoding=None, fatal=True, notfound=None, **kwargs):
+    def request(self, url, *, method="GET", session=None,
+                retries=None, retry_codes=None, encoding=None,
+                fatal=True, notfound=None, **kwargs):
         if session is None:
             session = self.session
         if retries is None:
             retries = self._retries
+        if retry_codes is None:
+            retry_codes = self._retry_codes
         if "proxies" not in kwargs:
             kwargs["proxies"] = self._proxies
         if "timeout" not in kwargs:
@@ -153,12 +159,12 @@ class Extractor():
                         code in (403, 503):
                     content = response.content
                     if b"_cf_chl_opt" in content or b"jschl-answer" in content:
-                        self.log.warning("Cloudflare IUAM challenge")
+                        self.log.warning("Cloudflare challenge")
                         break
                     if b'name="captcha-bypass"' in content:
                         self.log.warning("Cloudflare CAPTCHA")
                         break
-                if code < 500 and code != 429 and code != 430:
+                if code not in retry_codes and code < 500:
                     break
 
             finally:
@@ -501,7 +507,10 @@ class Extractor():
             try:
                 with open(path + ".txt", 'wb') as fp:
                     util.dump_response(
-                        response, fp, headers=(self._write_pages == "all"))
+                        response, fp,
+                        headers=(self._write_pages in ("all", "ALL")),
+                        hide_auth=(self._write_pages != "ALL")
+                    )
             except Exception as e:
                 self.log.warning("Failed to dump HTTP request (%s: %s)",
                                  e.__class__.__name__, e)
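The reworked retry condition in common.py above drops the hard-coded 429/430 checks: a failed request is now retried when its status code is 500 or higher, or when the code appears in the user-supplied `retry-codes` option. A standalone sketch of that decision logic (illustrative only, not the extractor's actual API):

```python
def should_retry(code, retry_codes=()):
    # inverse of the loop's exit condition in Extractor.request():
    # "if code not in retry_codes and code < 500: break"
    return code in retry_codes or code >= 500

print(should_retry(429))                          # False - not retried by default anymore
print(should_retry(429, retry_codes=(429, 430)))  # True  - opted back in via config
print(should_retry(503))                          # True  - server errors still retried
```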
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 4c93604..7b0e572 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2014-2022 Mike Fährmann
+# Copyright 2014-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -40,7 +40,17 @@ class DanbooruExtractor(BaseExtractor):
 
         self.ugoira = self.config("ugoira", False)
         self.external = self.config("external", False)
-        self.extended_metadata = self.config("metadata", False)
+
+        metadata = self.config("metadata", False)
+        if metadata:
+            if isinstance(metadata, (list, tuple)):
+                metadata = ",".join(metadata)
+            elif not isinstance(metadata, str):
+                metadata = "artist_commentary,children,notes,parent,uploader"
+            self.metadata_includes = metadata
+        else:
+            self.metadata_includes = None
 
         threshold = self.config("threshold")
         if isinstance(threshold, int):
             self.threshold = 1 if threshold < 1 else threshold
@@ -99,13 +109,10 @@ class DanbooruExtractor(BaseExtractor):
                     url = post["large_file_url"]
                     post["extension"] = "webm"
 
-            if self.extended_metadata:
-                template = (
-                    "{}/posts/{}.json?only=artist_commentary,children,notes,"
-                    "parent,uploader"
-                )
-                resp = self.request(template.format(self.root, post["id"]))
-                post.update(resp.json())
+            if self.metadata_includes:
+                meta_url = "{}/posts/{}.json?only={}".format(
+                    self.root, post["id"], self.metadata_includes)
+                post.update(self.request(meta_url).json())
 
             if url[0] == "/":
                 url = self.root + url
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index aeb2d0a..a3187fa 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -118,11 +118,18 @@ class DeviantartExtractor(Extractor):
             if "flash" in deviation:
                 yield self.commit(deviation, deviation["flash"])
 
-            if "excerpt" in deviation and self.commit_journal:
-                journal = self.api.deviation_content(deviation["deviationid"])
-                if self.extra:
-                    deviation["_journal"] = journal["html"]
-                yield self.commit_journal(deviation, journal)
+            if self.commit_journal:
+                if "excerpt" in deviation:
+                    journal = self.api.deviation_content(
+                        deviation["deviationid"])
+                elif "body" in deviation:
+                    journal = {"html": deviation.pop("body")}
+                else:
+                    journal = None
+                if journal:
+                    if self.extra:
+                        deviation["_journal"] = journal["html"]
+                    yield self.commit_journal(deviation, journal)
 
             if not self.extra:
                 continue
@@ -150,10 +157,19 @@ class DeviantartExtractor(Extractor):
         """Adjust the contents of a Deviation-object"""
         if "index" not in deviation:
             try:
-                deviation["index"] = text.parse_int(
-                    deviation["url"].rpartition("-")[2])
+                if deviation["url"].startswith("https://sta.sh"):
+                    filename = deviation["content"]["src"].split("/")[5]
+                    deviation["index_base36"] = filename.partition("-")[0][1:]
+                    deviation["index"] = id_from_base36(
+                        deviation["index_base36"])
+                else:
+                    deviation["index"] = text.parse_int(
+                        deviation["url"].rpartition("-")[2])
             except KeyError:
                 deviation["index"] = 0
+                deviation["index_base36"] = "0"
+        if "index_base36" not in deviation:
+            deviation["index_base36"] = base36_from_id(deviation["index"])
 
         if self.user:
             deviation["username"] = self.user
@@ -170,13 +186,11 @@ class DeviantartExtractor(Extractor):
 
         if self.comments:
             deviation["comments"] = (
-                self.api.comments_deviation(deviation["deviationid"])
+                self.api.comments(deviation["deviationid"], target="deviation")
                 if deviation["stats"]["comments"] else ()
             )
 
         # filename metadata
-        alphabet = "0123456789abcdefghijklmnopqrstuvwxyz"
-        deviation["index_base36"] = util.bencode(deviation["index"], alphabet)
         sub = re.compile(r"\W").sub
         deviation["filename"] = "".join((
             sub("_", deviation["title"].lower()), "_by_",
@@ -253,9 +267,10 @@ class DeviantartExtractor(Extractor):
             html = journal["html"]
             if html.startswith("<style"):
                 html = html.partition("</style>")[2]
+            head, _, tail = html.rpartition("<script")
             content = "\n".join(
                 text.unescape(text.remove_html(txt))
-                for txt in html.rpartition("<script")[0].split("<br />")
+                for txt in (head or tail).split("<br />")
             )
             txt = JOURNAL_TEMPLATE_TEXT.format(
                 title=deviation["title"],
@@ -402,8 +417,9 @@ class DeviantartUserExtractor(DeviantartExtractor):
         }),
         ("https://www.deviantart.com/shimoda7", {
             "options": (("include", "all"),),
-            "pattern": r"/shimoda7/(gallery(/scraps)?|posts|favourites)$",
-            "count": 4,
+            "pattern": r"/shimoda7/"
+                       r"(gallery(/scraps)?|posts(/statuses)?|favourites)$",
+            "count": 5,
         }),
         ("https://shimoda7.deviantart.com/"),
     )
@@ -414,6 +430,7 @@ class DeviantartUserExtractor(DeviantartExtractor):
             (DeviantartGalleryExtractor , base + "gallery"),
             (DeviantartScrapsExtractor  , base + "gallery/scraps"),
             (DeviantartJournalExtractor , base + "posts"),
+            (DeviantartStatusExtractor  , base + "posts/statuses"),
             (DeviantartFavoriteExtractor, base + "favourites"),
         ), ("gallery",))
@@ -746,6 +763,97 @@ class DeviantartJournalExtractor(DeviantartExtractor):
         return self.api.browse_user_journals(self.user, self.offset)
 
 
+class DeviantartStatusExtractor(DeviantartExtractor):
+    """Extractor for an artist's status updates"""
+    subcategory = "status"
+    directory_fmt = ("{category}", "{username}", "Status")
+    filename_fmt = "{category}_{index}_{title}_{date}.{extension}"
+    archive_fmt = "S_{_username}_{index}.{extension}"
+    pattern = BASE_PATTERN + r"/posts/statuses"
+    test = (
+        ("https://www.deviantart.com/t1na/posts/statuses", {
+            "count": 0,
+        }),
+        ("https://www.deviantart.com/justgalym/posts/statuses", {
+            "count": 4,
+            "url": "bf4c44c0c60ff2648a880f4c3723464ad3e7d074",
+        }),
+        # shared deviation
+        ("https://www.deviantart.com/justgalym/posts/statuses", {
+            "options": (("journals", "none"),),
+            "count": 1,
+            "pattern": r"https://images-wixmp-\w+\.wixmp\.com/f"
+                       r"/[^/]+/[^.]+\.jpg\?token=",
+        }),
+        # shared sta.sh item
+        ("https://www.deviantart.com/vanillaghosties/posts/statuses", {
+            "options": (("journals", "none"), ("original", False)),
+            "range": "5-",
+            "count": 1,
+            "keyword": {
+                "index"       : int,
+                "index_base36": "re:^[0-9a-z]+$",
+                "url"         : "re:^https://sta.sh",
+            },
+        }),
+        ("https://www.deviantart.com/justgalym/posts/statuses", {
+            "options": (("journals", "text"),),
+            "url": "c8744f7f733a3029116607b826321233c5ca452d",
+        }),
+    )
+
+    def deviations(self):
+        for status in self.api.user_statuses(self.user, self.offset):
+            yield from self.status(status)
+
+    def status(self, status):
+        for item in status.get("items") or ():  # do not trust is_share
+            # shared deviations/statuses
+            if "deviation" in item:
+                yield item["deviation"].copy()
+            if "status" in item:
+                yield from self.status(item["status"].copy())
+        # assume is_deleted == true means necessary fields are missing
+        if status["is_deleted"]:
+            self.log.warning(
+                "Skipping status %s (deleted)", status.get("statusid"))
+            return
+        yield status
+
+    def prepare(self, deviation):
+        if "deviationid" in deviation:
+            return DeviantartExtractor.prepare(self, deviation)
+
+        try:
+            path = deviation["url"].split("/")
+            deviation["index"] = text.parse_int(path[-1] or path[-2])
+        except KeyError:
+            deviation["index"] = 0
+
+        if self.user:
+            deviation["username"] = self.user
+            deviation["_username"] = self.user.lower()
+        else:
+            deviation["username"] = deviation["author"]["username"]
+            deviation["_username"] = deviation["username"].lower()
+
+        deviation["date"] = dt = text.parse_datetime(deviation["ts"])
+        deviation["published_time"] = int(util.datetime_to_timestamp(dt))
+
+        deviation["da_category"] = "Status"
+        deviation["category_path"] = "status"
+        deviation["is_downloadable"] = False
+        deviation["title"] = "Status Update"
+
+        comments_count = deviation.pop("comments_count", 0)
+        deviation["stats"] = {"comments": comments_count}
+        if self.comments:
+            deviation["comments"] = (
+                self.api.comments(deviation["statusid"], target="status")
+                if comments_count else ()
+            )
+
+
 class DeviantartPopularExtractor(DeviantartExtractor):
     """Extractor for popular deviations"""
     subcategory = "popular"
@@ -867,7 +975,9 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
     archive_fmt = "g_{_username}_{index}.{extension}"
     pattern = (BASE_PATTERN + r"/(art|journal)/(?:[^/?#]+-)?(\d+)"
                r"|(?:https?://)?(?:www\.)?deviantart\.com/"
-               r"(?:view/|view(?:-full)?\.php/*\?(?:[^#]+&)?id=)(\d+)")
+               r"(?:view/|deviation/|view(?:-full)?\.php/*\?(?:[^#]+&)?id=)"
+               r"(\d+)"  # bare deviation ID without slug
+               r"|(?:https?://)?fav\.me/d([0-9a-z]+)")  # base36
     test = (
         (("https://www.deviantart.com/shimoda7/art/For-the-sake-10073852"), {
             "options": (("original", 0),),
@@ -940,6 +1050,15 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
         ("https://www.deviantart.com/view/1", {
             "exception": exception.NotFoundError,
         }),
+        # /deviation/ (#3558)
+        ("https://www.deviantart.com/deviation/817215762"),
+        # fav.me (#3558)
+        ("https://fav.me/ddijrpu", {
+            "count": 1,
+        }),
+        ("https://fav.me/dddd", {
+            "exception": exception.NotFoundError,
+        }),
         # old-style URLs
         ("https://shimoda7.deviantart.com"
          "/art/For-the-sake-of-a-memory-10073852"),
@@ -956,7 +1075,8 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
     def __init__(self, match):
         DeviantartExtractor.__init__(self, match)
         self.type = match.group(3)
-        self.deviation_id = match.group(4) or match.group(5)
+        self.deviation_id = \
+            match.group(4) or match.group(5) or id_from_base36(match.group(6))
 
     def deviations(self):
         url = "{}/{}/{}/{}".format(
@@ -1149,9 +1269,9 @@ class DeviantartOAuthAPI():
                   "mature_content": self.mature}
         return self._pagination_list(endpoint, params)
 
-    def comments_deviation(self, deviation_id, offset=0):
-        """Fetch comments posted on a deviation"""
-        endpoint = "/comments/deviation/" + deviation_id
+    def comments(self, id, target, offset=0):
+        """Fetch comments posted on a target"""
+        endpoint = "/comments/{}/{}".format(target, id)
         params = {"maxdepth": "5", "offset": offset, "limit": 50,
                   "mature_content": self.mature}
         return self._pagination_list(endpoint, params=params, key="thread")
@@ -1187,8 +1307,6 @@ class DeviantartOAuthAPI():
 
     def deviation_metadata(self, deviations):
         """ Fetch deviation metadata for a set of deviations"""
-        if not deviations:
-            return []
         endpoint = "/deviation/metadata?" + "&".join(
             "deviationids[{}]={}".format(num, deviation["deviationid"])
             for num, deviation in enumerate(deviations)
@@ -1224,6 +1342,12 @@ class DeviantartOAuthAPI():
         endpoint = "/user/profile/" + username
         return self._call(endpoint, fatal=False)
 
+    def user_statuses(self, username, offset=0):
+        """Yield status updates of a specific user"""
+        endpoint = "/user/statuses/"
+        params = {"username": username, "offset": offset, "limit": 50}
+        return self._pagination(endpoint, params)
+
     def user_friends_watch(self, username):
         """Watch a user"""
         endpoint = "/user/friends/watch/" + username
@@ -1350,10 +1474,12 @@ class DeviantartOAuthAPI():
                         "Private deviations detected! Run 'gallery-dl "
                         "oauth:deviantart' and follow the instructions to "
                         "be able to access them.")
-            if self.metadata:
-                self._metadata(results)
-            if self.folders:
-                self._folders(results)
+            # "statusid" cannot be used instead
+            if results and "deviationid" in results[0]:
+                if self.metadata:
+                    self._metadata(results)
+                if self.folders:
+                    self._folders(results)
             yield from results
 
             if not data["has_more"] and (
@@ -1561,6 +1687,17 @@ def _login_impl(extr, username, password):
     }
 
 
+def id_from_base36(base36):
+    return util.bdecode(base36, _ALPHABET)
+
+
+def base36_from_id(deviation_id):
+    return util.bencode(int(deviation_id), _ALPHABET)
+
+
+_ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyz"
+
+
 ###############################################################################
 # Journal Formats #############################################################
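The new `id_from_base36()`/`base36_from_id()` helpers in deviantart.py above delegate to `util.bdecode()`/`util.bencode()`; the round trip can be illustrated without them, since base36 decoding is exactly what Python's `int()` does. The fav.me test URL ties the two ID forms together: the leading "d" of "ddijrpu" is part of every DeviantArt short link rather than of the ID itself, so the pattern captures only "dijrpu". A standalone sketch:

```python
_ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyz"

def base36_from_id(deviation_id):
    # standalone equivalent of util.bencode(deviation_id, _ALPHABET)
    result = ""
    while deviation_id:
        deviation_id, digit = divmod(deviation_id, 36)
        result = _ALPHABET[digit] + result
    return result or "0"

print(int("dijrpu", 36))          # 817215762 - the /deviation/ test ID above
print(base36_from_id(817215762))  # dijrpu
```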
diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py
index c05ec39..476fdeb 100644
--- a/gallery_dl/extractor/fantia.py
+++ b/gallery_dl/extractor/fantia.py
@@ -21,6 +21,10 @@ class FantiaExtractor(Extractor):
     _warning = True
 
     def items(self):
+        self.headers = {
+            "Accept" : "application/json, text/plain, */*",
+            "Referer": self.root,
+        }
 
         if self._warning:
             if not self._check_cookies(("_session_id",)):
@@ -43,10 +47,11 @@ class FantiaExtractor(Extractor):
 
     def _pagination(self, url):
         params = {"page": 1}
-        headers = {"Referer": self.root}
+        headers = self.headers
 
         while True:
             page = self.request(url, params=params, headers=headers).text
+            self._csrf_token(page)
 
             post_id = None
             for post_id in text.extract_iter(
@@ -57,11 +62,16 @@ class FantiaExtractor(Extractor):
                 return
             params["page"] += 1
 
+    def _csrf_token(self, page=None):
+        if not page:
+            page = self.request(self.root + "/").text
+        self.headers["X-CSRF-Token"] = text.extr(
+            page, 'name="csrf-token" content="', '"')
+
     def _get_post_data(self, post_id):
         """Fetch and process post data"""
-        headers = {"Referer": self.root}
         url = self.root+"/api/v1/posts/"+post_id
-        resp = self.request(url, headers=headers).json()["post"]
+        resp = self.request(url, headers=self.headers).json()["post"]
         post = {
             "post_id": resp["id"],
             "post_url": self.root + "/posts/" + str(resp["id"]),
@@ -173,4 +183,5 @@ class FantiaPostExtractor(FantiaExtractor):
         self.post_id = match.group(1)
 
     def posts(self):
+        self._csrf_token()
         return (self.post_id,)
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py
index 10c7295..9292da3 100644
--- a/gallery_dl/extractor/generic.py
+++ b/gallery_dl/extractor/generic.py
@@ -150,7 +150,7 @@ class GenericExtractor(Extractor):
         https://en.wikipedia.org/wiki/List_of_file_formats
 
         Compared to the "pattern" class variable, here we must exclude also
-        other special characters (space, ", ', >), since we are looking for
+        other special characters (space, ", ', <, >), since we are looking for
        urls in html tags.
         """
 
@@ -158,7 +158,7 @@ class GenericExtractor(Extractor):
             (?:[^?&#"'>\s]+)  # anything until dot+extension
             \.(?:jpe?g|jpe|png|gif
                |web[mp]|mp4|mkv|og[gmv]|opus)  # dot + image/video extensions
-            (?:[^"'>\s]*)?  # optional query and fragment
+            (?:[^"'<>\s]*)?  # optional query and fragment
         """
 
         imageurls_src = re.findall(imageurl_pattern_src, page)
diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py
index adee94a..d61c139 100644
--- a/gallery_dl/extractor/hiperdex.py
+++ b/gallery_dl/extractor/hiperdex.py
@@ -1,25 +1,26 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2020-2021 Mike Fährmann
+# Copyright 2020-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extractors for https://hiperdex.com/"""
+"""Extractors for https://1sthiperdex.com/"""
 
 from .common import ChapterExtractor, MangaExtractor
 from .. import text
 from ..cache import memcache
 import re
 
-BASE_PATTERN = r"((?:https?://)?(?:www\.)?hiperdex\d?\.(?:com|net|info))"
+BASE_PATTERN = (r"((?:https?://)?(?:www\.)?"
+                r"(?:1st)?hiperdex\d?\.(?:com|net|info))")
 
 
 class HiperdexBase():
     """Base class for hiperdex extractors"""
     category = "hiperdex"
-    root = "https://hiperdex.com"
+    root = "https://1sthiperdex.com"
 
     @memcache(keyarg=1)
     def manga_data(self, manga, page=None):
@@ -52,6 +53,8 @@ class HiperdexBase():
         }
 
     def chapter_data(self, chapter):
+        if chapter.startswith("chapter-"):
+            chapter = chapter[8:]
         chapter, _, minor = chapter.partition("-")
         data = {
             "chapter" : text.parse_int(chapter),
@@ -62,12 +65,13 @@ class HiperdexBase():
 
 
 class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
-    """Extractor for manga chapters from hiperdex.com"""
+    """Extractor for manga chapters from 1sthiperdex.com"""
     pattern = BASE_PATTERN + r"(/manga/([^/?#]+)/([^/?#]+))"
     test = (
-        ("https://hiperdex.com/manga/domestic-na-kanojo/154-5/", {
-            "pattern": r"https://hiperdex\d?.(com|net|info)/wp-content/uploads"
-                       r"/WP-manga/data/manga_\w+/[0-9a-f]{32}/\d+\.webp",
+        ("https://1sthiperdex.com/manga/domestic-na-kanojo/154-5/", {
+            "pattern": r"https://(1st)?hiperdex\d?.(com|net|info)"
+                       r"/wp-content/uploads/WP-manga/data"
+                       r"/manga_\w+/[0-9a-f]{32}/\d+\.webp",
             "count": 9,
             "keyword": {
                 "artist" : "Sasuga Kei",
@@ -82,6 +86,7 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
                 "type"   : "Manga",
             },
         }),
+        ("https://hiperdex.com/manga/domestic-na-kanojo/154-5/"),
         ("https://hiperdex2.com/manga/domestic-na-kanojo/154-5/"),
         ("https://hiperdex.net/manga/domestic-na-kanojo/154-5/"),
         ("https://hiperdex.info/manga/domestic-na-kanojo/154-5/"),
@@ -104,11 +109,11 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
 
 
 class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):
-    """Extractor for manga from hiperdex.com"""
+    """Extractor for manga from 1sthiperdex.com"""
     chapterclass = HiperdexChapterExtractor
     pattern = BASE_PATTERN + r"(/manga/([^/?#]+))/?$"
     test = (
-        ("https://hiperdex.com/manga/youre-not-that-special/", {
+        ("https://1sthiperdex.com/manga/youre-not-that-special/", {
             "count": 51,
             "pattern": HiperdexChapterExtractor.pattern,
             "keyword": {
@@ -125,6 +130,7 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):
                 "type"   : "Manhwa",
             },
         }),
+        ("https://hiperdex.com/manga/youre-not-that-special/"),
         ("https://hiperdex2.com/manga/youre-not-that-special/"),
         ("https://hiperdex.net/manga/youre-not-that-special/"),
         ("https://hiperdex.info/manga/youre-not-that-special/"),
@@ -166,6 +172,7 @@ class HiperdexArtistExtractor(HiperdexBase, MangaExtractor):
     reverse = False
     pattern = BASE_PATTERN + r"(/manga-a(?:rtist|uthor)/(?:[^/?#]+))"
     test = (
+        ("https://1sthiperdex.com/manga-artist/beck-ho-an/"),
         ("https://hiperdex.net/manga-artist/beck-ho-an/"),
         ("https://hiperdex2.com/manga-artist/beck-ho-an/"),
         ("https://hiperdex.info/manga-artist/beck-ho-an/"),
diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py
index eb64db0..7c656be 100644
--- a/gallery_dl/extractor/hotleak.py
+++ b/gallery_dl/extractor/hotleak.py
@@ -8,6 +8,7 @@
 
 from .common import Extractor, Message
 from .. import text, exception
+import binascii
 
 BASE_PATTERN = r"(?:https?://)?(?:www\.)?hotleak\.vip"
 
@@ -49,6 +50,11 @@ class HotleakExtractor(Extractor):
             params["page"] += 1
 
 
+def decode_video_url(url):
+    # cut first and last 16 characters, reverse, base64 decode
+    return binascii.a2b_base64(url[-17:15:-1]).decode()
+
+
 class HotleakPostExtractor(HotleakExtractor):
     """Extractor for individual posts on hotleak"""
     subcategory = "post"
@@ -100,8 +106,8 @@ class HotleakPostExtractor(HotleakExtractor):
             text.nameext_from_url(data["url"], data)
 
         elif self.type == "video":
-            data["url"] = "ytdl:" + text.extr(
-                text.unescape(page), '"src":"', '"')
+            data["url"] = "ytdl:" + decode_video_url(text.extr(
+                text.unescape(page), '"src":"', '"'))
             text.nameext_from_url(data["url"], data)
             data["extension"] = "mp4"
 
@@ -163,7 +169,8 @@ class HotleakCreatorExtractor(HotleakExtractor):
 
                 elif post["type"] == 1:
                     data["type"] = "video"
-                    data["url"] = "ytdl:" + post["stream_url_play"]
+                    data["url"] = "ytdl:" + decode_video_url(
+                        post["stream_url_play"])
                     text.nameext_from_url(data["url"], data)
                     data["extension"] = "mp4"
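On the `decode_video_url()` helper added to hotleak.py above: the slice `url[-17:15:-1]` walks backwards from the 17th-from-last character down to index 16, which drops 16 characters of padding on each end and reverses the remainder in a single step before base64-decoding. A round-trip sketch with made-up padding and URL:

```python
import binascii

plain = "https://example.org/video.mp4"  # hypothetical stream URL
core = binascii.b2a_base64(plain.encode()).decode().strip()
token = "A" * 16 + core[::-1] + "B" * 16  # obfuscated form

def decode_video_url(url):
    # cut first and last 16 characters, reverse, base64 decode
    return binascii.a2b_base64(url[-17:15:-1]).decode()

print(decode_video_url(token))  # https://example.org/video.mp4
```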
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index db9f3fb..deb31a0 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 # Copyright 2018-2020 Leonardo Taccari
-# Copyright 2018-2022 Mike Fährmann
+# Copyright 2018-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -90,6 +90,11 @@ class InstagramExtractor(Extractor):
     def posts(self):
         return ()
 
+    def finalize(self):
+        if self._cursor:
+            self.log.info("Use '-o cursor=%s' to continue downloading "
+                          "from the current position", self._cursor)
+
     def request(self, url, **kwargs):
         response = Extractor.request(self, url, **kwargs)
 
@@ -104,9 +109,6 @@ class InstagramExtractor(Extractor):
                 page = None
 
             if page:
-                if self._cursor:
-                    self.log.info("Use '-o cursor=%s' to continue downloading "
-                                  "from the current position", self._cursor)
                 raise exception.StopExtraction("HTTP redirect to %s page (%s)",
                                                page, url.partition("?")[0])
 
@@ -114,6 +116,10 @@ class InstagramExtractor(Extractor):
         if www_claim is not None:
             self.www_claim = www_claim
 
+        csrf_token = response.cookies.get("csrftoken")
+        if csrf_token:
+            self.csrf_token = csrf_token
+
         return response
 
     def login(self):
@@ -794,7 +800,12 @@ class InstagramRestAPI():
 
     def user_clips(self, user_id):
         endpoint = "/v1/clips/user/"
-        data = {"target_user_id": user_id, "page_size": "50"}
+        data = {
+            "target_user_id": user_id,
+            "page_size": "50",
+            "max_id": None,
+            "include_feed_video": "true",
+        }
         return self._pagination_post(endpoint, data)
 
     def user_collection(self, collection_id):
@@ -820,19 +831,18 @@ class InstagramRestAPI():
 
     def _call(self, endpoint, **kwargs):
         extr = self.extractor
-        url = "https://i.instagram.com/api" + endpoint
+        url = "https://www.instagram.com/api" + endpoint
         kwargs["headers"] = {
+            "Accept"          : "*/*",
             "X-CSRFToken"     : extr.csrf_token,
             "X-Instagram-AJAX": "1006242110",
             "X-IG-App-ID"     : "936619743392459",
             "X-ASBD-ID"       : "198387",
             "X-IG-WWW-Claim"  : extr.www_claim,
-            "Origin"          : extr.root,
+            "X-Requested-With": "XMLHttpRequest",
+            "Alt-Used"        : "www.instagram.com",
             "Referer"         : extr.root + "/",
         }
-        kwargs["cookies"] = {
-            "csrftoken": extr.csrf_token,
-        }
         return extr.request(url, **kwargs).json()
 
     def _pagination(self, endpoint, params=None, media=False):
@@ -851,7 +861,7 @@ class InstagramRestAPI():
                 yield from data["items"]
 
             if not data.get("more_available"):
-                return
+                return extr._update_cursor(None)
             params["max_id"] = extr._update_cursor(data["next_max_id"])
 
     def _pagination_post(self, endpoint, params):
@@ -866,7 +876,7 @@ class InstagramRestAPI():
 
             info = data["paging_info"]
             if not info.get("more_available"):
-                return
+                return extr._update_cursor(None)
             params["max_id"] = extr._update_cursor(info["max_id"])
 
     def _pagination_sections(self, endpoint, params):
@@ -879,7 +889,7 @@ class InstagramRestAPI():
             yield from info["sections"]
 
             if not info.get("more_available"):
-                return
+                return extr._update_cursor(None)
             params["page"] = info["next_page"]
             params["max_id"] = extr._update_cursor(info["next_max_id"])
@@ -894,7 +904,7 @@ class InstagramRestAPI():
                 yield from item["media_items"]
 
             if "next_max_id" not in data:
-                return
+                return extr._update_cursor(None)
             params["max_id"] = extr._update_cursor(data["next_max_id"])
 
@@ -982,12 +992,7 @@ class InstagramGraphqlAPI():
             "X-Requested-With": "XMLHttpRequest",
             "Referer"         : extr.root + "/",
         }
-        cookies = {
-            "csrftoken": extr.csrf_token,
-        }
-        return extr.request(
-            url, params=params, headers=headers, cookies=cookies,
-        ).json()["data"]
+        return extr.request(url, params=params, headers=headers).json()["data"]
 
     def _pagination(self, query_hash, variables,
                     key_data="user", key_edge=None):
@@ -1003,7 +1008,7 @@ class InstagramGraphqlAPI():
 
             info = data["page_info"]
             if not info["has_next_page"]:
-                return
+                return extr._update_cursor(None)
             elif not data["edges"]:
                 s = "" if self.item.endswith("s") else "s"
                 raise exception.StopExtraction(
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 63e3084..33e8370 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -16,6 +16,7 @@ import re
 
 BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.party"
 USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)"
+HASH_PATTERN = r"/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})"
 
 
 class KemonopartyExtractor(Extractor):
@@ -41,7 +42,7 @@ class KemonopartyExtractor(Extractor):
         self._find_inline = re.compile(
             r'src="(?:https?://(?:kemono|coomer)\.party)?(/inline/[^"]+'
             r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
-        find_hash = re.compile("/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})").match
+        find_hash = re.compile(HASH_PATTERN).match
         generators = self._build_file_generators(self.config("files"))
         duplicates = self.config("duplicates")
         comments = self.config("comments")
@@ -89,10 +90,11 @@ class KemonopartyExtractor(Extractor):
                 match = find_hash(url)
                 if match:
                     file["hash"] = hash = match.group(1)
-                    if hash in hashes and not duplicates:
-                        self.log.debug("Skipping %s (duplicate)", url)
-                        continue
-                    hashes.add(hash)
+                    if not duplicates:
+                        if hash in hashes:
+                            self.log.debug("Skipping %s (duplicate)", url)
+                            continue
+                        hashes.add(hash)
                 else:
                     file["hash"] = ""
 
@@ -362,14 +364,17 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
             "pattern": r"https://kemono\.party/data/("
                        r"e3/77/e377e3525164559484ace2e64425b0cec1db08.*\.png|"
                        r"51/45/51453640a5e0a4d23fbf57fb85390f9c5ec154.*\.gif)",
+            "keyword": {"hash": "re:e377e3525164559484ace2e64425b0cec1db08"
+                                "|51453640a5e0a4d23fbf57fb85390f9c5ec154"},
             "count": ">= 2",
         }),
         # 'inline' files
         (("https://kemono.party/discord"
           "/server/315262215055736843/channel/315262215055736843#general"), {
             "pattern": r"https://cdn\.discordapp\.com/attachments/\d+/\d+/.+$",
-            "range": "1-5",
             "options": (("image-filter", "type == 'inline'"),),
+            "keyword": {"hash": ""},
+            "range": "1-5",
         }),
     )
 
@@ -383,6 +388,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
         find_inline = re.compile(
             r"https?://(?:cdn\.discordapp.com|media\.discordapp\.net)"
             r"(/[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]+)").findall
+        find_hash = re.compile(HASH_PATTERN).match
 
         posts = self.posts()
         max_posts = self.config("max-posts")
@@ -393,11 +399,13 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
             files = []
             append = files.append
             for attachment in post["attachments"]:
+                match = find_hash(attachment["path"])
+                attachment["hash"] = match.group(1) if match else ""
                 attachment["type"] = "attachment"
                 append(attachment)
             for path in find_inline(post["content"] or ""):
                 append({"path": "https://cdn.discordapp.com" + path,
-                        "name": path, "type": "inline"})
+                        "name": path, "type": "inline", "hash": ""})
 
             post["channel_name"] = self.channel_name
             post["date"] = text.parse_datetime(
@@ -406,6 +414,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
             yield Message.Directory, post
 
             for post["num"], file in enumerate(files, 1):
+                post["hash"] = file["hash"]
                 post["type"] = file["type"]
                 url = file["path"]
+ r"workers.dev/full_jpg/[0-9a-f-]{36}$", + "range": "1-80", + "count": 80, + "keyword": { + "height": int, + "id": str, + "upscaled_height": int, + "upscaled_width": int, + "userid": str, + "width": int, + "prompt": { + "c": int, + "grid": bool, + "height": int, + "id": str, + "images": list, + "initImage": None, + "initImageStrength": None, + "model": "lexica-aperture-v2", + "negativePrompt": str, + "prompt": str, + "seed": str, + "timestamp": r"re:\d{4}-\d\d-\d\dT\d\d:\d\d:\d\d.\d\d\dZ", + "width": int, + }, + }, + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.query = match.group(1) + self.text = text.unquote(self.query).replace("+", " ") + + def items(self): + base = ("https://lexica-serve-encoded-images2.sharif.workers.dev" + "/full_jpg/") + tags = self.text + + for image in self.posts(): + image["filename"] = image["id"] + image["extension"] = "jpg" + image["search_tags"] = tags + yield Message.Directory, image + yield Message.Url, base + image["id"], image + + def posts(self): + url = self.root + "/api/infinite-prompts" + headers = { + "Accept" : "application/json, text/plain, */*", + "Referer": "{}/?q={}".format(self.root, self.query), + } + json = { + "text" : self.text, + "searchMode": "images", + "source" : "search", + "cursor" : 0, + "model" : "lexica-aperture-v2", + } + + while True: + data = self.request( + url, method="POST", headers=headers, json=json).json() + + prompts = { + prompt["id"]: prompt + for prompt in data["prompts"] + } + + for image in data["images"]: + image["prompt"] = prompts[image["promptid"]] + del image["promptid"] + yield image + + cursor = data.get("nextCursor") + if not cursor: + return + + json["cursor"] = cursor diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index 049e0af..e49d29a 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2022 Mike Fährmann +# Copyright 2019-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -31,8 +31,8 @@ class MastodonExtractor(BaseExtractor): def items(self): for status in self.statuses(): - if self._check_move: - self._check_move(status["account"]) + if self._check_moved: + self._check_moved(status["account"]) if not self.reblogs and status["reblog"]: self.log.debug("Skipping %s (reblog)", status["id"]) continue @@ -48,12 +48,13 @@ class MastodonExtractor(BaseExtractor): status["instance_remote"] = \ acct.rpartition("@")[2] if "@" in acct else None + status["count"] = len(attachments) status["tags"] = [tag["name"] for tag in status["tags"]] status["date"] = text.parse_datetime( status["created_at"][:19], "%Y-%m-%dT%H:%M:%S") yield Message.Directory, status - for media in attachments: + for status["num"], media in enumerate(attachments, 1): status["media"] = media url = media["url"] yield Message.Url, url, text.nameext_from_url(url, status) @@ -62,8 +63,8 @@ class MastodonExtractor(BaseExtractor): """Return an iterable containing all relevant Status objects""" return () - def _check_move(self, account): - self._check_move = None + def _check_moved(self, account): + self._check_moved = None if "moved" in account: self.log.warning("Account '%s' moved to '%s'", account["acct"], account["moved"]["acct"]) @@ -181,6 +182,10 @@ class MastodonStatusExtractor(MastodonExtractor): test = ( ("https://mastodon.social/@jk/103794036899778366", { "count": 4, + 
"keyword": { + "count": 4, + "num": int, + }, }), ("https://pawoo.net/@yoru_nine/105038878897832922", { "content": "b52e807f8ab548d6f896b09218ece01eba83987a", @@ -229,7 +234,7 @@ class MastodonAPI(): for account in self.account_search(handle, 1): if account["acct"] == username: - self.extractor._check_move(account) + self.extractor._check_moved(account) return account["id"] raise exception.NotFoundError("account") diff --git a/gallery_dl/extractor/nudecollect.py b/gallery_dl/extractor/nudecollect.py new file mode 100644 index 0000000..3159919 --- /dev/null +++ b/gallery_dl/extractor/nudecollect.py @@ -0,0 +1,142 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://nudecollect.com/""" + +from .common import GalleryExtractor +from .. import text + + +class NudecollectExtractor(GalleryExtractor): + """Base class for Nudecollect extractors""" + category = "nudecollect" + directory_fmt = ("{category}", "{title}") + filename_fmt = "{slug}_{num:>03}.{extension}" + archive_fmt = "{slug}_{num}" + root = "https://www.nudecollect.com" + + def request(self, url, **kwargs): + kwargs["allow_redirects"] = False + return GalleryExtractor.request(self, url, **kwargs) + + @staticmethod + def get_title(page): + return text.unescape(text.extr(page, "<title>", "</title>"))[31:] + + @staticmethod + def get_image(page): + return text.extr(page, '<img src="', '"') + + +class NudecollectImageExtractor(NudecollectExtractor): + """Extractor for individual images from nudecollect.com""" + subcategory = "image" + pattern = (r"(?:https?://)?(?:www\.)?nudecollect\.com" + r"(/content/([^/?#]+)/image-(\d+)-pics-(\d+)" + r"-mirror-(\d+)\.html)") + test = ( + (("https://www.nudecollect.com/content/20201220_Teenpornstorage_" + "Patritcy_Vanessa_Lesbian_Lust/image-4-pics-108-mirror-43.html"), { + "pattern": (r"https://mirror\d+\.nudecollect\.com/showimage" + r"/nudecollect-8769086487/image00004-5896498214-43" + r"-9689595623/20201220_Teenpornstorage_Patritcy_Vaness" + r"a_Lesbian_Lust/9879560327/nudecollect\.com\.jpg"), + "keyword": { + "slug" : ("20201220_Teenpornstorage_Patritcy" + "_Vanessa_Lesbian_Lust"), + "title" : ("20201220 Teenpornstorage Patritcy" + " Vanessa Lesbian Lust"), + "num" : 4, + "count" : 108, + "mirror": 43, + }, + }), + (("https://www.nudecollect.com/content/20201220_Teenpornstorage_" + "Patritcy_Vanessa_Lesbian_Lust/image-10-pics-108-mirror-43.html")), + ) + + def __init__(self, match): + NudecollectExtractor.__init__(self, match) + _, self.slug, self.num, self.count, self.mirror = match.groups() + + def metadata(self, page): + return { + "slug" : self.slug, + "title" : self.get_title(page), + "count" : text.parse_int(self.count), + "mirror": text.parse_int(self.mirror), + } + + def images(self, page): + return ((self.get_image(page), {"num": text.parse_int(self.num)}),) + + +class NudecollectAlbumExtractor(NudecollectExtractor): + """Extractor for image albums on nudecollect.com""" + subcategory = "album" + pattern = (r"(?:https?://)?(?:www\.)?nudecollect\.com" + r"/content/([^/?#]+)/(?:index-mirror-(\d+)-(\d+)" + r"|page-\d+-pics-(\d+)-mirror-(\d+))\.html") + test = ( + (("https://www.nudecollect.com/content/20170219_TheWhiteBoxxx_" + "Caprice_Tracy_Loves_Hot_ass_fingering_and_sensual_lesbian_sex" + "_with_alluring_Czech_babes_x125_1080px/index-mirror-67-125.html"), { + "pattern": 
(r"https://mirror\d+\.nudecollect\.com/showimage" + r"/nudecollect-8769086487/image00\d\d\d-5896498214-67" + r"-9689595623/20170219_TheWhiteBoxxx_Caprice" + r"_Tracy_Loves_Hot_ass_fingering_and_sensual_" + r"lesbian_sex_with_alluring_Czech_babes_x125_1080px" + r"/9879560327/nudecollect\.com\.jpg"), + "count" : 125, + "keyword": { + "slug" : ("20170219_TheWhiteBoxxx_Caprice_Tracy_Loves_Hot_" + "ass_fingering_and_sensual_lesbian_sex_with_" + "alluring_Czech_babes_x125_1080px"), + "title" : ("20170219 TheWhiteBoxxx Caprice Tracy Loves Hot ass" + " fingering and sensual lesbian sex with alluring" + " Czech babes x125 1080px"), + "num" : int, + "mirror": 67, + }, + }), + (("https://www.nudecollect.com/content/20201220_Teenpornstorage_" + "Patritcy_Vanessa_Lesbian_Lust/page-1-pics-108-mirror-43.html"), { + "pattern": (r"https://mirror\d+\.nudecollect\.com/showimage" + r"/nudecollect-8769086487/image00\d\d\d-5896498214-43" + r"-9689595623/20201220_Teenpornstorage_Patritcy_Vaness" + r"a_Lesbian_Lust/9879560327/nudecollect\.com\.jpg"), + "count" : 108, + "keyword": { + "slug" : ("20201220_Teenpornstorage_Patritcy" + "_Vanessa_Lesbian_Lust"), + "title" : ("20201220 Teenpornstorage Patritcy" + " Vanessa Lesbian Lust"), + "num" : int, + "mirror": 43, + }, + }), + ) + + def __init__(self, match): + self.slug = match.group(1) + self.mirror = match.group(2) or match.group(5) + self.count = text.parse_int(match.group(3) or match.group(4)) + url = "{}/content/{}/image-1-pics-{}-mirror-{}.html".format( + self.root, self.slug, self.count, self.mirror) + NudecollectExtractor.__init__(self, match, url) + + def metadata(self, page): + return { + "slug" : self.slug, + "title" : self.get_title(page), + "mirror": text.parse_int(self.mirror), + } + + def images(self, page): + url = self.get_image(page) + p1, _, p2 = url.partition("/image0") + ufmt = p1 + "/image{:>05}" + p2[4:] + return [(ufmt.format(num), None) for num in range(1, self.count + 1)] diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index d6628c4..9270f33 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2022 Mike Fährmann +# Copyright 2017-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -68,11 +68,19 @@ class OAuthBase(Extractor): def open(self, url, params, recv=None): """Open 'url' in browser amd return response parameters""" - import webbrowser url += "?" 
+ urllib.parse.urlencode(params) - if not self.config("browser", True) or not webbrowser.open(url): - stdout_write( - "Please open this URL in your browser:\n\n" + url + "\n\n") + + browser = self.config("browser", True) + if browser: + import webbrowser + browser = webbrowser.get() + + if browser and browser.open(url): + self.log.info("Opening URL in %s:", browser.name.capitalize()) + else: + self.log.info("Please open this URL in your browser:") + + stdout_write("\n{}\n\n".format(url)) return (recv or self.recv)() def error(self, msg): @@ -80,8 +88,18 @@ class OAuthBase(Extractor): "Remote server reported an error:\n\n{}\n".format(msg)) def _oauth1_authorization_flow( - self, request_token_url, authorize_url, access_token_url): + self, default_key, default_secret, + request_token_url, authorize_url, access_token_url): """Perform the OAuth 1.0a authorization flow""" + + api_key = self.oauth_config("api-key") or default_key + api_secret = self.oauth_config("api-secret") or default_secret + self.session = oauth.OAuth1Session(api_key, api_secret) + + self.log.info("Using %s %s API key (%s)", + "default" if api_key == default_key else "custom", + self.subcategory, api_key) + # get a request token params = {"oauth_callback": self.redirect_uri} data = self.session.get(request_token_url, params=params).text @@ -112,11 +130,18 @@ class OAuthBase(Extractor): )) def _oauth2_authorization_code_grant( - self, client_id, client_secret, auth_url, token_url, *, - scope="read", key="refresh_token", auth=True, - cache=None, instance=None): + self, client_id, client_secret, default_id, default_secret, + auth_url, token_url, *, scope="read", duration="permanent", + key="refresh_token", auth=True, cache=None, instance=None): """Perform an OAuth2 authorization code grant""" + client_id = str(client_id) if client_id else default_id + client_secret = client_secret or default_secret + + self.log.info("Using %s %s client ID (%s)", + "default" if client_id == default_id else "custom", + instance or self.subcategory, client_id) + state = "gallery-dl_{}_{}".format( self.subcategory, oauth.nonce(8), @@ -127,7 +152,7 @@ class OAuthBase(Extractor): "response_type": "code", "state" : state, "redirect_uri" : self.redirect_uri, - "duration" : "permanent", + "duration" : duration, "scope" : scope, } @@ -137,13 +162,12 @@ class OAuthBase(Extractor): # check authorization response if state != params.get("state"): self.send("'state' mismatch: expected {}, got {}.\n".format( - state, params.get("state") - )) + state, params.get("state"))) return if "error" in params: return self.error(params) - # exchange the authorization code for a token + # exchange authorization code for a token data = { "grant_type" : "authorization_code", "code" : params["code"], @@ -208,81 +232,36 @@ class OAuthBase(Extractor): return msg -class OAuthDeviantart(OAuthBase): - subcategory = "deviantart" - pattern = "oauth:deviantart$" - redirect_uri = REDIRECT_URI_HTTPS - - def items(self): - yield Message.Version, 1 - - self._oauth2_authorization_code_grant( - self.oauth_config( - "client-id", deviantart.DeviantartOAuthAPI.CLIENT_ID), - self.oauth_config( - "client-secret", deviantart.DeviantartOAuthAPI.CLIENT_SECRET), - "https://www.deviantart.com/oauth2/authorize", - "https://www.deviantart.com/oauth2/token", - scope="browse user.manage", - cache=deviantart._refresh_token_cache, - ) - +# -------------------------------------------------------------------- +# OAuth 1.0a class OAuthFlickr(OAuthBase): subcategory = "flickr" pattern = "oauth:flickr$" 
redirect_uri = REDIRECT_URI_HTTPS - def __init__(self, match): - OAuthBase.__init__(self, match) - self.session = oauth.OAuth1Session( - self.oauth_config("api-key", flickr.FlickrAPI.API_KEY), - self.oauth_config("api-secret", flickr.FlickrAPI.API_SECRET), - ) - def items(self): yield Message.Version, 1 self._oauth1_authorization_flow( + flickr.FlickrAPI.API_KEY, + flickr.FlickrAPI.API_SECRET, "https://www.flickr.com/services/oauth/request_token", "https://www.flickr.com/services/oauth/authorize", "https://www.flickr.com/services/oauth/access_token", ) -class OAuthReddit(OAuthBase): - subcategory = "reddit" - pattern = "oauth:reddit$" - - def items(self): - yield Message.Version, 1 - - self.session.headers["User-Agent"] = reddit.RedditAPI.USER_AGENT - self._oauth2_authorization_code_grant( - self.oauth_config("client-id", reddit.RedditAPI.CLIENT_ID), - "", - "https://www.reddit.com/api/v1/authorize", - "https://www.reddit.com/api/v1/access_token", - scope="read history", - cache=reddit._refresh_token_cache, - ) - - class OAuthSmugmug(OAuthBase): subcategory = "smugmug" pattern = "oauth:smugmug$" - def __init__(self, match): - OAuthBase.__init__(self, match) - self.session = oauth.OAuth1Session( - self.oauth_config("api-key", smugmug.SmugmugAPI.API_KEY), - self.oauth_config("api-secret", smugmug.SmugmugAPI.API_SECRET), - ) - def items(self): yield Message.Version, 1 self._oauth1_authorization_flow( + smugmug.SmugmugAPI.API_KEY, + smugmug.SmugmugAPI.API_SECRET, "https://api.smugmug.com/services/oauth/1.0a/getRequestToken", "https://api.smugmug.com/services/oauth/1.0a/authorize", "https://api.smugmug.com/services/oauth/1.0a/getAccessToken", @@ -293,23 +272,61 @@ class OAuthTumblr(OAuthBase): subcategory = "tumblr" pattern = "oauth:tumblr$" - def __init__(self, match): - OAuthBase.__init__(self, match) - self.session = oauth.OAuth1Session( - self.oauth_config("api-key", tumblr.TumblrAPI.API_KEY), - self.oauth_config("api-secret", tumblr.TumblrAPI.API_SECRET), - ) - def items(self): yield Message.Version, 1 self._oauth1_authorization_flow( + tumblr.TumblrAPI.API_KEY, + tumblr.TumblrAPI.API_SECRET, "https://www.tumblr.com/oauth/request_token", "https://www.tumblr.com/oauth/authorize", "https://www.tumblr.com/oauth/access_token", ) +# -------------------------------------------------------------------- +# OAuth 2.0 + +class OAuthDeviantart(OAuthBase): + subcategory = "deviantart" + pattern = "oauth:deviantart$" + redirect_uri = REDIRECT_URI_HTTPS + + def items(self): + yield Message.Version, 1 + + self._oauth2_authorization_code_grant( + self.oauth_config("client-id"), + self.oauth_config("client-secret"), + deviantart.DeviantartOAuthAPI.CLIENT_ID, + deviantart.DeviantartOAuthAPI.CLIENT_SECRET, + "https://www.deviantart.com/oauth2/authorize", + "https://www.deviantart.com/oauth2/token", + scope="browse user.manage", + cache=deviantart._refresh_token_cache, + ) + + +class OAuthReddit(OAuthBase): + subcategory = "reddit" + pattern = "oauth:reddit$" + + def items(self): + yield Message.Version, 1 + + self.session.headers["User-Agent"] = reddit.RedditAPI.USER_AGENT + self._oauth2_authorization_code_grant( + self.oauth_config("client-id"), + "", + reddit.RedditAPI.CLIENT_ID, + "", + "https://www.reddit.com/api/v1/authorize", + "https://www.reddit.com/api/v1/access_token", + scope="read history", + cache=reddit._refresh_token_cache, + ) + + class OAuthMastodon(OAuthBase): subcategory = "mastodon" pattern = "oauth:mastodon:(?:https?://)?([^/?#]+)" @@ -330,6 +347,8 @@ class OAuthMastodon(OAuthBase): 
self._oauth2_authorization_code_grant( application["client-id"], application["client-secret"], + application["client-id"], + application["client-secret"], "https://{}/oauth/authorize".format(self.instance), "https://{}/oauth/token".format(self.instance), instance=self.instance, @@ -362,6 +381,8 @@ class OAuthMastodon(OAuthBase): return data +# -------------------------------------------------------------------- + class OAuthPixiv(OAuthBase): subcategory = "pixiv" pattern = "oauth:pixiv$" diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py index fc85125..df85b96 100644 --- a/gallery_dl/extractor/philomena.py +++ b/gallery_dl/extractor/philomena.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021-2022 Mike Fährmann +# Copyright 2021-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -56,12 +56,12 @@ class PhilomenaExtractor(BooruExtractor): INSTANCES = { "derpibooru": { "root": "https://derpibooru.org", - "pattern": r"derpibooru\.org", + "pattern": r"(?:www\.)?derpibooru\.org", "filter_id": "56027", }, "ponybooru": { "root": "https://ponybooru.org", - "pattern": r"ponybooru\.org", + "pattern": r"(?:www\.)?ponybooru\.org", "filter_id": "2", }, "furbooru": { @@ -128,9 +128,14 @@ class PhilomenaPostExtractor(PhilomenaExtractor): }, }), ("https://derpibooru.org/1"), + ("https://www.derpibooru.org/1"), + ("https://www.derpibooru.org/images/1"), + ("https://ponybooru.org/images/1", { "content": "bca26f58fafd791fe07adcd2a28efd7751824605", }), + ("https://www.ponybooru.org/images/1"), + ("https://furbooru.org/images/1", { "content": "9eaa1e1b32fa0f16520912257dbefaff238d5fd2", }), diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 7013f1b..ea4cf43 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2022 Mike Fährmann +# Copyright 2014-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -80,15 +80,19 @@ class SankakuTagExtractor(SankakuExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" - pattern = BASE_PATTERN + r"/\?([^#]*)" + pattern = BASE_PATTERN + r"/?\?([^#]*)" test = ( ("https://sankaku.app/?tags=bonocho", { "count": 5, "pattern": r"https://s\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}" - r"/[^/]{32}\.\w+\?e=\d+&(expires=\d+&)?m=[^&#]+", + r"/[0-9a-f]{32}\.\w+\?e=\d+&(expires=\d+&)?m=[^&#]+", }), ("https://beta.sankakucomplex.com/?tags=bonocho"), ("https://chan.sankakucomplex.com/?tags=bonocho"), + ("https://black.sankakucomplex.com/?tags=bonocho"), + ("https://white.sankakucomplex.com/?tags=bonocho"), + ("https://sankaku.app/ja?tags=order%3Apopularity"), + ("https://sankaku.app/no/?tags=order%3Apopularity"), # error on five or more tags ("https://chan.sankakucomplex.com/?tags=bonocho+a+b+c+d", { "options": (("username", None),), diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index c2d8247..17a2202 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -26,6 +26,7 @@ class TwitterExtractor(Extractor): cookiedomain = ".twitter.com" cookienames = ("auth_token",) root = "https://twitter.com" + browser = "firefox" def __init__(self, match): Extractor.__init__(self, match) @@ -945,16 +946,31 @@ 
class TwitterAPI(): def __init__(self, extractor): self.extractor = extractor - self.root = "https://twitter.com/i/api" + self.root = "https://api.twitter.com" + cookies = extractor.session.cookies + cookiedomain = extractor.cookiedomain + + csrf = extractor.config("csrf") + if csrf is None or csrf == "cookies": + csrf_token = cookies.get("ct0", domain=cookiedomain) + else: + csrf_token = None + if not csrf_token: + csrf_token = util.generate_token() + cookies.set("ct0", csrf_token, domain=cookiedomain) + + auth_token = cookies.get("auth_token", domain=cookiedomain) + self.headers = { "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR" "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu" "4FA33AGWWjCpTnA", "x-guest-token": None, - "x-twitter-auth-type": None, + "x-twitter-auth-type": "OAuth2Session" if auth_token else None, "x-twitter-client-language": "en", "x-twitter-active-user": "yes", - "x-csrf-token": None, + "x-csrf-token": csrf_token, + "Origin": "https://twitter.com", "Referer": "https://twitter.com/", } self.params = { @@ -967,24 +983,36 @@ class TwitterAPI(): "include_can_dm": "1", "include_can_media_tag": "1", "include_ext_has_nft_avatar": "1", + "include_ext_is_blue_verified": "1", + "include_ext_verified_type": "1", "skip_status": "1", "cards_platform": "Web-12", "include_cards": "1", "include_ext_alt_text": "true", + "include_ext_limited_action_results": "false", "include_quote_count": "true", "include_reply_count": "1", "tweet_mode": "extended", + "include_ext_collab_control": "true", + "include_ext_views": "true", "include_entities": "true", "include_user_entities": "true", "include_ext_media_color": "true", "include_ext_media_availability": "true", "include_ext_sensitive_media_warning": "true", + "include_ext_trusted_friends_metadata": "true", "send_error_codes": "true", "simple_quoted_tweet": "true", + "q": None, "count": "100", + "query_source": None, "cursor": None, - "ext": "mediaStats,highlightedLabel,hasNftAvatar," - "voiceInfo,superFollowMetadata", + "pc": None, + "spelling_corrections": None, + "include_ext_edit_control": "true", + "ext": "mediaStats,highlightedLabel,hasNftAvatar,voiceInfo," + "enrichments,superFollowMetadata,unmentionInfo,editControl," + "collab_control,vibe", } self.variables = { "includePromotedContent": False, @@ -1006,28 +1034,6 @@ class TwitterAPI(): self._syndication = self.extractor.syndication self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode - cookies = extractor.session.cookies - cookiedomain = extractor.cookiedomain - - csrf = extractor.config("csrf") - if csrf is None or csrf == "cookies": - csrf_token = cookies.get("ct0", domain=cookiedomain) - else: - csrf_token = None - if not csrf_token: - csrf_token = util.generate_token() - cookies.set("ct0", csrf_token, domain=cookiedomain) - self.headers["x-csrf-token"] = csrf_token - - if cookies.get("auth_token", domain=cookiedomain): - # logged in - self.headers["x-twitter-auth-type"] = "OAuth2Session" - else: - # guest - guest_token = self._guest_token() - cookies.set("gt", guest_token, domain=cookiedomain) - self.headers["x-guest-token"] = guest_token - def tweet_detail(self, tweet_id): endpoint = "/graphql/ItejhtHVxU7ksltgMmyaLA/TweetDetail" variables = { @@ -1183,17 +1189,26 @@ class TwitterAPI(): @cache(maxage=3600) def _guest_token(self): - root = "https://api.twitter.com" endpoint = "/1.1/guest/activate.json" - return str(self._call(endpoint, None, root, "POST")["guest_token"]) + self.extractor.log.info("Requesting guest token") + return 
@@ -1183,17 +1189,26 @@ class TwitterAPI():
 
     @cache(maxage=3600)
     def _guest_token(self):
-        root = "https://api.twitter.com"
         endpoint = "/1.1/guest/activate.json"
-        return str(self._call(endpoint, None, root, "POST")["guest_token"])
+        self.extractor.log.info("Requesting guest token")
+        return str(self._call(endpoint, None, "POST", False)["guest_token"])
+
+    def _authenticate_guest(self):
+        guest_token = self._guest_token()
+        if guest_token != self.headers["x-guest-token"]:
+            self.headers["x-guest-token"] = guest_token
+            self.extractor.session.cookies.set(
+                "gt", guest_token, domain=self.extractor.cookiedomain)
 
-    def _call(self, endpoint, params, root=None, method="GET"):
-        if root is None:
-            root = self.root
+    def _call(self, endpoint, params, method="GET", auth=True):
+        url = self.root + endpoint
         while True:
+            if not self.headers["x-twitter-auth-type"] and auth:
+                self._authenticate_guest()
+
             response = self.extractor.request(
-                root + endpoint, method=method, params=params,
+                url, method=method, params=params,
                 headers=self.headers, fatal=None)
 
             # update 'x-csrf-token' header (#1170)
@@ -1226,21 +1241,33 @@ class TwitterAPI():
 
     def _pagination_legacy(self, endpoint, params):
         original_retweets = (self.extractor.retweets == "original")
+        bottom = ("cursor-bottom-", "sq-cursor-bottom")
 
         while True:
             data = self._call(endpoint, params)
 
-            instr = data["timeline"]["instructions"]
-            if not instr:
+            instructions = data["timeline"]["instructions"]
+            if not instructions:
                 return
             tweets = data["globalObjects"]["tweets"]
             users = data["globalObjects"]["users"]
             tweet_id = cursor = None
             tweet_ids = []
+            entries = ()
+
+            # process instructions
+            for instr in instructions:
+                if "addEntries" in instr:
+                    entries = instr["addEntries"]["entries"]
+                elif "replaceEntry" in instr:
+                    entry = instr["replaceEntry"]["entry"]
+                    if entry["entryId"].startswith(bottom):
+                        cursor = (entry["content"]["operation"]
+                                  ["cursor"]["value"])
 
             # collect tweet IDs and cursor value
-            for entry in instr[0]["addEntries"]["entries"]:
+            for entry in entries:
                 entry_startswith = entry["entryId"].startswith
 
                 if entry_startswith(("tweet-", "sq-I-t-")):
@@ -1252,7 +1279,7 @@ class TwitterAPI():
                         entry["content"]["timelineModule"]["metadata"]
                         ["conversationMetadata"]["allTweetIds"][::-1])
 
-                elif entry_startswith(("cursor-bottom-", "sq-cursor-bottom")):
+                elif entry_startswith(bottom):
                     cursor = entry["content"]["operation"]["cursor"]
                     if not cursor.get("stopOnEmptyResponse", True):
                         # keep going even if there are no tweets
@@ -1300,11 +1327,7 @@ class TwitterAPI():
                     quoted["quoted_by_id_str"] = tweet["id_str"]
                     yield quoted
 
-            # update cursor value
-            if "replaceEntry" in instr[-1]:
-                cursor = (instr[-1]["replaceEntry"]["entry"]
-                          ["content"]["operation"]["cursor"]["value"])
-
+            # stop on empty response
             if not cursor or (not tweets and not tweet_id):
                 return
             params["cursor"] = cursor
@@ -1346,12 +1369,8 @@ class TwitterAPI():
                 if user.get("blocked_by"):
                     if self.headers["x-twitter-auth-type"] and \
                             extr.config("logout"):
-                        guest_token = self._guest_token()
-                        extr.session.cookies.set(
-                            "gt", guest_token, domain=extr.cookiedomain)
                         extr._cookiefile = None
                         del extr.session.cookies["auth_token"]
-                        self.headers["x-guest-token"] = guest_token
                         self.headers["x-twitter-auth-type"] = None
                         extr.log.info("Retrying API request as guest")
                         continue
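Note: with the hunks above, guest authentication becomes lazy. _call() invokes _authenticate_guest() only when no "x-twitter-auth-type" is set, _guest_token() is memoized via @cache(maxage=3600), and headers/cookies are rewritten only when the token actually changed. A rough illustration of that pattern; the decorator below is a hypothetical, simplified stand-in, not gallery-dl's real cache implementation:

    import time

    def cache(maxage):
        # simplified stand-in for gallery-dl's @cache decorator
        def decorator(func):
            state = {"value": None, "expires": 0.0}
            def wrapper():
                if time.time() >= state["expires"]:
                    state["value"] = func()
                    state["expires"] = time.time() + maxage
                return state["value"]
            return wrapper
        return decorator

    @cache(maxage=3600)
    def guest_token():
        print("requesting guest token")  # runs at most once per hour
        return "12345"

    headers = {"x-guest-token": None}
    for _ in range(3):
        token = guest_token()
        if token != headers["x-guest-token"]:
            headers["x-guest-token"] = token  # update only on change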
" "You need to provide your username or phone number instead.") - extr.log.info("Logging in as %s", username) - def process(response): try: data = response.json() @@ -1598,8 +1615,10 @@ def _login_impl(extr, username, password): extr.session.cookies.clear() api = TwitterAPI(extr) + api._authenticate_guest() headers = api.headers - headers["Referer"] = "https://twitter.com/i/flow/login" + + extr.log.info("Logging in as %s", username) # init data = { @@ -1653,7 +1672,7 @@ def _login_impl(extr, username, password): "web_modal": 1, }, } - url = "https://twitter.com/i/api/1.1/onboarding/task.json?flow_name=login" + url = "https://api.twitter.com/1.1/onboarding/task.json?flow_name=login" response = extr.request(url, method="POST", headers=headers, json=data) data = { @@ -1668,7 +1687,7 @@ def _login_impl(extr, username, password): }, ], } - url = "https://twitter.com/i/api/1.1/onboarding/task.json" + url = "https://api.twitter.com/1.1/onboarding/task.json" response = extr.request( url, method="POST", headers=headers, json=data, fatal=None) @@ -1692,7 +1711,7 @@ def _login_impl(extr, username, password): }, ], } - # url = "https://twitter.com/i/api/1.1/onboarding/task.json" + # url = "https://api.twitter.com/1.1/onboarding/task.json" extr.sleep(random.uniform(2.0, 4.0), "login (username)") response = extr.request( url, method="POST", headers=headers, json=data, fatal=None) @@ -1710,7 +1729,7 @@ def _login_impl(extr, username, password): }, ], } - # url = "https://twitter.com/i/api/1.1/onboarding/task.json" + # url = "https://api.twitter.com/1.1/onboarding/task.json" extr.sleep(random.uniform(2.0, 4.0), "login (password)") response = extr.request( url, method="POST", headers=headers, json=data, fatal=None) @@ -1727,7 +1746,7 @@ def _login_impl(extr, username, password): }, ], } - # url = "https://twitter.com/i/api/1.1/onboarding/task.json" + # url = "https://api.twitter.com/1.1/onboarding/task.json" response = extr.request( url, method="POST", headers=headers, json=data, fatal=None) process(response) diff --git a/gallery_dl/extractor/wikifeet.py b/gallery_dl/extractor/wikifeet.py new file mode 100644 index 0000000..70e9646 --- /dev/null +++ b/gallery_dl/extractor/wikifeet.py @@ -0,0 +1,118 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.wikifeet.com/""" + +from .common import GalleryExtractor +from .. 
diff --git a/gallery_dl/extractor/wikifeet.py b/gallery_dl/extractor/wikifeet.py
new file mode 100644
index 0000000..70e9646
--- /dev/null
+++ b/gallery_dl/extractor/wikifeet.py
@@ -0,0 +1,118 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.wikifeet.com/"""
+
+from .common import GalleryExtractor
+from .. import text
+import json
+
+
+class WikifeetGalleryExtractor(GalleryExtractor):
+    """Extractor for image galleries from wikifeet.com"""
+    category = "wikifeet"
+    directory_fmt = ("{category}", "{celebrity}")
+    filename_fmt = "{category}_{celeb}_{pid}.{extension}"
+    archive_fmt = "{type}_{celeb}_{pid}"
+    pattern = (r"(?:https?://)(?:(?:www\.)?wikifeetx?|"
+               r"men\.wikifeet)\.com/([^/?#]+)")
+    test = (
+        ("https://www.wikifeet.com/Madison_Beer", {
+            "pattern": (r"https://pics\.wikifeet\.com/Madison_Beer"
+                        r"-Feet-\d+\.jpg"),
+            "count"  : ">= 352",
+            "keyword": {
+                "celeb"     : "Madison_Beer",
+                "celebrity" : "Madison Beer",
+                "birthday"  : "dt:1999-03-05 00:00:00",
+                "birthplace": "United States",
+                "rating"    : float,
+                "pid"       : int,
+                "width"     : int,
+                "height"    : int,
+                "shoesize"  : "7.5 US",
+                "type"      : "women",
+                "tags"      : list,
+            },
+        }),
+        ("https://www.wikifeetx.com/Tifa_Quinn", {
+            "pattern": (r"https://pics\.wikifeet\.com/Tifa_Quinn"
+                        r"-Feet-\d+\.jpg"),
+            "count"  : ">= 9",
+            "keyword": {
+                "celeb"     : "Tifa_Quinn",
+                "celebrity" : "Tifa Quinn",
+                "birthday"  : "[NOT SET]",
+                "birthplace": "United States",
+                "rating"    : float,
+                "pid"       : int,
+                "width"     : int,
+                "height"    : int,
+                "shoesize"  : "[NOT SET]",
+                "type"      : "women",
+                "tags"      : list,
+            },
+        }),
+        ("https://men.wikifeet.com/Chris_Hemsworth", {
+            "pattern": (r"https://pics\.wikifeet\.com/Chris_Hemsworth"
+                        r"-Feet-\d+\.jpg"),
+            "count"  : ">= 860",
+            "keyword": {
+                "celeb"     : "Chris_Hemsworth",
+                "celebrity" : "Chris Hemsworth",
+                "birthday"  : "dt:1983-08-11 00:00:00",
+                "birthplace": "Australia",
+                "rating"    : float,
+                "pid"       : int,
+                "width"     : int,
+                "height"    : int,
+                "shoesize"  : "12.5 US",
+                "type"      : "men",
+                "tags"      : list,
+            },
+        }),
+    )
+
+    def __init__(self, match):
+        self.root = text.root_from_url(match.group(0))
+        if "wikifeetx.com" in self.root:
+            self.category = "wikifeetx"
+        self.type = "men" if "://men." in self.root else "women"
+        self.celeb = match.group(1)
+        GalleryExtractor.__init__(self, match, self.root + "/" + self.celeb)
+
+    def metadata(self, page):
+        extr = text.extract_from(page)
+        return {
+            "celeb"     : self.celeb,
+            "type"      : self.type,
+            "rating"    : text.parse_float(extr('"ratingValue": "', '"')),
+            "celebrity" : text.unescape(extr("times'>", "</h1>")),
+            "shoesize"  : text.remove_html(extr("Shoe Size:", "edit")),
+            "birthplace": text.remove_html(extr("Birthplace:", "edit")),
+            "birthday"  : text.parse_datetime(text.remove_html(
+                extr("Birth Date:", "edit")), "%Y-%m-%d"),
+        }
+
+    def images(self, page):
+        tagmap = {
+            "C": "Close-up",
+            "T": "Toenails",
+            "N": "Nylons",
+            "A": "Arches",
+            "S": "Soles",
+            "B": "Barefoot",
+        }
+        ufmt = "https://pics.wikifeet.com/" + self.celeb + "-Feet-{}.jpg"
+        return [
+            (ufmt.format(data["pid"]), {
+                "pid"   : data["pid"],
+                "width" : data["pw"],
+                "height": data["ph"],
+                "tags"  : [tagmap[tag] for tag in data["tags"]],
+            })
+            for data in json.loads(text.extr(page, "['gdata'] = ", ";"))
+        ]
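Note: the new wikifeet module covers www.wikifeet.com, wikifeetx.com, and men.wikifeet.com with a single class, adjusting category and type in __init__, so a URL such as https://www.wikifeet.com/Madison_Beer can be passed straight to gallery-dl. Its images() method parses the 'gdata' JSON array embedded in the page rather than scraping HTML; a minimal stand-alone sketch of that extraction, using a fabricated page snippet:

    import json

    # fabricated snippet; the real page embeds a much larger array
    page = """... ['gdata'] = [{"pid": 123, "pw": 800, "ph": 600,
    "tags": ["S", "B"]}]; ..."""

    start = page.index("['gdata'] = ") + len("['gdata'] = ")
    posts = json.loads(page[start:page.index(";", start)])
    for post in posts:
        print(post["pid"], post["pw"], post["ph"], post["tags"])
    # -> 123 800 600 ['S', 'B']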
