| field | value |
|---|---|
| author | 2023-03-13 02:07:49 -0400 |
| committer | 2023-03-13 02:07:49 -0400 |
| commit | 10987f08f8b6c510ba64f4b42d95ba67eec6e5b0 (patch) |
| tree | 1af82cad9ac859a70cafc976a980280b939cfcc7 /gallery_dl/extractor |
| parent | 919f8ba16a7b82ba1099bd25b2c61c7881a05aa2 (diff) |
New upstream version 1.25.0 (tag: upstream/1.25.0)
Diffstat (limited to 'gallery_dl/extractor')
67 files changed, 1732 insertions, 600 deletions
```diff
diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py
index b2ae963..1213194 100644
--- a/gallery_dl/extractor/500px.py
+++ b/gallery_dl/extractor/500px.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-# Copyright 2019-2022 Mike Fährmann
+# Copyright 2019-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -9,7 +9,7 @@
 """Extractors for https://500px.com/"""
 
 from .common import Extractor, Message
-import json
+from .. import util
 
 BASE_PATTERN = r"(?:https?://)?(?:web\.)?500px\.com"
@@ -86,7 +86,7 @@ class _500pxExtractor(Extractor):
         }
         data = {
             "operationName": opname,
-            "variables"    : json.dumps(variables),
+            "variables"    : util.json_dumps(variables),
             "query"        : QUERIES[opname],
         }
         return self.request(
diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py
index fed4991..26ac8b2 100644
--- a/gallery_dl/extractor/8muses.py
+++ b/gallery_dl/extractor/8muses.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-# Copyright 2019-2022 Mike Fährmann
+# Copyright 2019-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -9,8 +9,7 @@
 """Extractors for https://comics.8muses.com/"""
 
 from .common import Extractor, Message
-from .. import text
-import json
+from .. import text, util
 
 
 class _8musesAlbumExtractor(Extractor):
@@ -131,7 +130,7 @@ class _8musesAlbumExtractor(Extractor):
 
     @staticmethod
     def _unobfuscate(data):
-        return json.loads("".join([
+        return util.json_loads("".join([
             chr(33 + (ord(c) + 14) % 94) if "!" <= c <= "~" else c
             for c in text.unescape(data.strip("\t\n\r !"))
         ]))
```
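These two hunks (and most of the files below) swap the stdlib `json` module for `util.json_loads`/`util.json_dumps`. `util.py` itself is not part of this diff, so the following is only a plausible sketch of what such wrappers look like; the real helpers may differ in detail.

```python
# Hypothetical sketch of the util.json_loads/json_dumps helpers the
# extractors now call; util.py is not shown in this diff, so treat this
# as an assumption about their shape, not the actual implementation.
import json

def json_loads(s):
    # behaves like json.loads(); routing all decoding through one
    # wrapper lets the project swap in a stricter or faster decoder
    # in a single place
    return json.loads(s)

def json_dumps(obj):
    # compact separators keep serialized request payloads small
    return json.dumps(obj, separators=(",", ":"))
```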
```diff
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 6140c2c..3968d72 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -1,11 +1,12 @@
 # -*- coding: utf-8 -*-
-# Copyright 2015-2022 Mike Fährmann
+# Copyright 2015-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
+import sys
 import re
 
 modules = [
@@ -34,6 +35,7 @@ modules = [
     "desktopography",
     "deviantart",
     "dynastyscans",
+    "e621",
     "erome",
     "exhentai",
     "fallenangels",
@@ -92,6 +94,7 @@ modules = [
     "mangasee",
     "mangoxo",
     "mememuseum",
+    "misskey",
     "myhentaigallery",
     "myportfolio",
     "nana",
@@ -118,6 +121,7 @@ modules = [
     "plurk",
     "poipiku",
     "pornhub",
+    "pornpics",
     "pururin",
     "reactor",
     "readcomiconline",
@@ -137,6 +141,7 @@ modules = [
     "soundgasm",
     "speakerdeck",
     "subscribestar",
+    "szurubooru",
     "tapas",
     "tcbscans",
     "telegraph",
@@ -217,20 +222,33 @@ def extractors():
 # --------------------------------------------------------------------
 # internals
 
-_cache = []
-_module_iter = iter(modules)
-
 def _list_classes():
-    """Yield all available extractor classes"""
+    """Yield available extractor classes"""
     yield from _cache
 
-    globals_ = globals()
-    for module_name in _module_iter:
-        module = __import__(module_name, globals_, None, (), 1)
+    for module in _module_iter:
         yield from add_module(module)
 
-    globals_["_list_classes"] = lambda : _cache
+    globals()["_list_classes"] = lambda : _cache
+
+
+def _modules_internal():
+    globals_ = globals()
+    for module_name in modules:
+        yield __import__(module_name, globals_, None, (), 1)
+
+
+def _modules_path(path, files):
+    sys.path.insert(0, path)
+    try:
+        return [
+            __import__(name[:-3])
+            for name in files
+            if name.endswith(".py")
+        ]
+    finally:
+        del sys.path[0]
 
 
 def _get_classes(module):
@@ -240,3 +258,7 @@ def _get_classes(module):
         hasattr(cls, "pattern") and cls.__module__ == module.__name__
     )
     ]
+
+
+_cache = []
+_module_iter = _modules_internal()
diff --git a/gallery_dl/extractor/bbc.py b/gallery_dl/extractor/bbc.py
index 1b49d6a..638fedc 100644
--- a/gallery_dl/extractor/bbc.py
+++ b/gallery_dl/extractor/bbc.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-# Copyright 2021 Mike Fährmann
+# Copyright 2021-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -10,7 +10,6 @@
 from .common import GalleryExtractor, Extractor, Message
 from .. import text, util
-import json
 
 BASE_PATTERN = r"(?:https?://)?(?:www\.)?bbc\.co\.uk(/programmes/"
@@ -38,7 +37,7 @@ class BbcGalleryExtractor(GalleryExtractor):
     )
 
     def metadata(self, page):
-        data = json.loads(text.extr(
+        data = util.json_loads(text.extr(
             page, '<script type="application/ld+json">', '</script>'))
         return {
             "programme": self.gallery_url.split("/")[4],
```
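The `__init__.py` refactor above splits module discovery into `_modules_internal()` (the built-in `modules` list) and `_modules_path()` (arbitrary `.py` files from an external directory), while `_list_classes()` keeps its self-replacing cache. A minimal reproduction of that caching idiom, independent of gallery-dl:

```python
# Minimal reproduction of the _list_classes() caching idiom above: the
# generator yields cached results first, extends the cache as the lazy
# iterator advances, and finally rebinds its own name to a cheap lambda.
_cache = []
_source = iter(["mod_a", "mod_b", "mod_c"])  # stand-in for module imports

def list_items():
    yield from _cache
    for name in _source:
        _cache.append(name)  # add_module() plays this role in the real code
        yield name
    globals()["list_items"] = lambda: _cache

print(list(list_items()))  # first call walks the iterator once
print(list(list_items()))  # later calls hit only the cache
```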
```diff
diff --git a/gallery_dl/extractor/bcy.py b/gallery_dl/extractor/bcy.py
index 44d6065..d6adb4e 100644
--- a/gallery_dl/extractor/bcy.py
+++ b/gallery_dl/extractor/bcy.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-# Copyright 2020-2022 Mike Fährmann
+# Copyright 2020-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -9,8 +9,7 @@
 """Extractors for https://bcy.net/"""
 
 from .common import Extractor, Message
-from .. import text, exception
-import json
+from .. import text, util, exception
 import re
@@ -100,9 +99,9 @@ class BcyExtractor(Extractor):
                 .replace('\\\\u002F', '/')
                 .replace('\\"', '"'))
         try:
-            return json.loads(data)["detail"]
+            return util.json_loads(data)["detail"]
         except ValueError:
-            return json.loads(data.replace('\\"', '"'))["detail"]
+            return util.json_loads(data.replace('\\"', '"'))["detail"]
 
 
 class BcyUserExtractor(BcyExtractor):
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
index 6da6175..1469aad 100644
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@@ -9,8 +9,7 @@
 """Extractors for https://www.behance.net/"""
 
 from .common import Extractor, Message
-from .. import text
-import json
+from .. import text, util
 
 
 class BehanceExtractor(Extractor):
@@ -120,7 +119,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
         }
         page = self.request(url, cookies=cookies).text
 
-        data = json.loads(text.extr(
+        data = util.json_loads(text.extr(
             page, 'id="beconfig-store_state">', '</script>'))
         return self._update(data["project"]["project"])
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index 8a1a42e..56010c2 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-# Copyright 2019-2022 Mike Fährmann
+# Copyright 2019-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -9,8 +9,7 @@
 """Extractors for Blogger blogs"""
 
 from .common import Extractor, Message
-from .. import text
-import json
+from .. import text, util
 import re
 
 BASE_PATTERN = (
@@ -61,7 +60,7 @@ class BloggerExtractor(Extractor):
             page = self.request(post["url"]).text
             for url in findall_video(page):
                 page = self.request(url).text
-                video_config = json.loads(text.extr(
+                video_config = util.json_loads(text.extr(
                     page, 'var VIDEO_CONFIG =', '\n'))
                 files.append(max(
                     video_config["streams"],
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 1c339a9..17d066d 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -6,20 +6,19 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extractors for https://bunkr.ru/"""
+"""Extractors for https://bunkr.su/"""
 
 from .lolisafe import LolisafeAlbumExtractor
 from .. import text
-import json
 
 
 class BunkrAlbumExtractor(LolisafeAlbumExtractor):
-    """Extractor for bunkr.ru albums"""
+    """Extractor for bunkr.su albums"""
     category = "bunkr"
-    root = "https://bunkr.ru"
-    pattern = r"(?:https?://)?(?:app\.)?bunkr\.(?:ru|is|to)/a/([^/?#]+)"
+    root = "https://bunkr.su"
+    pattern = r"(?:https?://)?(?:app\.)?bunkr\.(?:[sr]u|is|to)/a/([^/?#]+)"
     test = (
-        ("https://bunkr.ru/a/Lktg9Keq", {
+        ("https://bunkr.su/a/Lktg9Keq", {
             "pattern": r"https://cdn\.bunkr\.ru/test-テスト-\"&>-QjgneIQv\.png",
             "content": "0c8768055e4e20e7c7259608b67799171b691140",
             "keyword": {
@@ -33,7 +32,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
             },
         }),
         # mp4 (#2239)
-        ("https://app.bunkr.is/a/ptRHaCn2", {
+        ("https://app.bunkr.ru/a/ptRHaCn2", {
             "pattern": r"https://media-files\.bunkr\.ru/_-RnHoW69L\.mp4",
             "content": "80e61d1dbc5896ae7ef9a28734c747b28b320471",
         }),
@@ -41,44 +40,57 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
         ("https://bunkr.is/a/iXTTc1o2", {
             "pattern": r"https://(cdn|media-files)4\.bunkr\.ru/",
             "content": "da29aae371b7adc8c5ef8e6991b66b69823791e8",
+            "keyword": {
+                "album_id": "iXTTc1o2",
+                "album_name": "test2",
+                "album_size": "691.1 KB",
+                "count": 2,
+                "description": "072022",
+                "filename": "re:video-wFO9FtxG|image-sZrQUeOx",
+                "id": "re:wFO9FtxG|sZrQUeOx",
+                "name": "re:video|image",
+                "num": int,
+            },
         }),
         ("https://bunkr.to/a/Lktg9Keq"),
     )
 
     def fetch_album(self, album_id):
-        root = self.root
+        # album metadata
+        page = self.request(self.root + "/a/" + self.album_id).text
+        info = text.split_html(text.extr(
+            page, "<h1", "</div>").partition(">")[2])
+        count, _, size = info[1].split(None, 2)
+
+        # files
+        cdn = None
+        files = []
+        append = files.append
+        headers = {"Referer": self.root.replace("://", "://stream.", 1) + "/"}
 
-        try:
-            data = json.loads(text.extr(
-                self.request(root + "/a/" + self.album_id).text,
-                'id="__NEXT_DATA__" type="application/json">', '<'))
-            album = data["props"]["pageProps"]["album"]
-            files = album["files"]
-        except Exception as exc:
-            self.log.debug("%s: %s", exc.__class__.__name__, exc)
-            self.log.debug("Falling back to lolisafe API")
-            self.root = root.replace("://", "://app.", 1)
-            files, data = LolisafeAlbumExtractor.fetch_album(self, album_id)
-            # fix file URLs (bunkr..ru -> bunkr.ru) (#3481)
-            for file in files:
-                file["file"] = file["file"].replace("bunkr..", "bunkr.", 1)
-        else:
-            for file in files:
-                file["file"] = file["cdn"] + "/" + file["name"]
-            data = {
-                "album_id"   : self.album_id,
-                "album_name" : text.unescape(album["name"]),
-                "description": text.unescape(album["description"]),
-                "count"      : len(files),
-            }
+        pos = page.index('class="grid-images')
+        for url in text.extract_iter(page, '<a href="', '"', pos):
+            if url.startswith("/"):
+                if not cdn:
+                    # fetch cdn root from download page
+                    durl = "{}/d/{}".format(self.root, url[3:])
+                    cdn = text.extr(self.request(
+                        durl).text, 'link.href = "', '"')
+                    cdn = cdn[:cdn.index("/", 8)]
+                url = cdn + url[2:]
 
-        headers = {"Referer": root.replace("://", "://stream.", 1) + "/"}
-        for file in files:
-            if file["file"].endswith(
-                    (".mp4", ".m4v", ".mov", ".webm", ".mkv", ".ts",
-                     ".zip", ".rar", ".7z")):
-                file["_http_headers"] = headers
-                file["file"] = file["file"].replace(
-                    "://cdn", "://media-files", 1)
+            url = text.unescape(url)
+            if url.endswith((".mp4", ".m4v", ".mov", ".webm", ".mkv", ".ts",
                             ".zip", ".rar", ".7z")):
+                append({"file": url.replace("://cdn", "://media-files", 1),
+                        "_http_headers": headers})
+            else:
+                append({"file": url})
 
-        return files, data
+        return files, {
+            "album_id"   : self.album_id,
+            "album_name" : text.unescape(info[0]),
+            "album_size" : size[1:-1],
+            "description": text.unescape(info[2]) if len(info) > 2 else "",
+            "count"      : len(files),
+        }
```
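The rewritten `fetch_album()` above tags video and archive files with a `Referer` header pointing at the stream subdomain and rewrites their host from `cdn` to `media-files`. A stand-alone sketch of downloading one such file with plain `requests`; the URLs are placeholders, not real files:

```python
# Sketch of the header handling the extractor sets up via "_http_headers";
# MEDIA_URL is a hypothetical example, not a live file.
import requests

MEDIA_URL = "https://media-files.bunkr.ru/example.mp4"
headers = {"Referer": "https://stream.bunkr.su/"}  # stream subdomain + "/"

response = requests.get(MEDIA_URL, headers=headers, timeout=30)
response.raise_for_status()
with open("example.mp4", "wb") as fp:
    fp.write(response.content)
```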
```diff
diff --git a/gallery_dl/extractor/catbox.py b/gallery_dl/extractor/catbox.py
index 509108f..7a21d2a 100644
--- a/gallery_dl/extractor/catbox.py
+++ b/gallery_dl/extractor/catbox.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-# Copyright 2022 Mike Fährmann
+# Copyright 2022-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@
 """Extractors for https://catbox.moe/"""
 
-from .common import GalleryExtractor
+from .common import GalleryExtractor, Extractor, Message
 from .. import text
 
@@ -54,3 +54,26 @@ class CatboxAlbumExtractor(GalleryExtractor):
             for path in text.extract_iter(
                 page, ">https://files.catbox.moe/", "<")
         ]
+
+
+class CatboxFileExtractor(Extractor):
+    """Extractor for catbox files"""
+    category = "catbox"
+    subcategory = "file"
+    archive_fmt = "{filename}"
+    pattern = r"(?:https?://)?(?:files|litter|de)\.catbox\.moe/([^/?#]+)"
+    test = (
+        ("https://files.catbox.moe/8ih3y7.png", {
+            "pattern": r"^https://files\.catbox\.moe/8ih3y7\.png$",
+            "content": "0c8768055e4e20e7c7259608b67799171b691140",
+            "count": 1,
+        }),
+        ("https://litter.catbox.moe/t8v3n9.png"),
+        ("https://de.catbox.moe/bjdmz1.jpg"),
+    )
+
+    def items(self):
+        url = text.ensure_http_scheme(self.url)
+        file = text.nameext_from_url(url, {"url": url})
+        yield Message.Directory, file
+        yield Message.Url, url, file
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 4cefa1c..8024be9 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -791,15 +791,21 @@ HTTP_HEADERS = {
         ("TE", "trailers"),
     ),
     "chrome": (
+        ("Connection", "keep-alive"),
         ("Upgrade-Insecure-Requests", "1"),
         ("User-Agent", "Mozilla/5.0 ({}) AppleWebKit/537.36 (KHTML, "
-                       "like Gecko) Chrome/92.0.4515.131 Safari/537.36"),
+                       "like Gecko) Chrome/111.0.0.0 Safari/537.36"),
         ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
-                   "image/webp,image/apng,*/*;q=0.8"),
+                   "image/avif,image/webp,image/apng,*/*;q=0.8,"
+                   "application/signed-exchange;v=b3;q=0.7"),
         ("Referer", None),
+        ("Sec-Fetch-Site", "same-origin"),
+        ("Sec-Fetch-Mode", "no-cors"),
+        ("Sec-Fetch-Dest", "empty"),
         ("Accept-Encoding", None),
         ("Accept-Language", "en-US,en;q=0.9"),
-        ("Cookie", None),
+        ("cookie", None),
+        ("content-length", None),
     ),
 }
 
@@ -838,8 +844,7 @@ SSL_CIPHERS = {
         "AES128-GCM-SHA256:"
         "AES256-GCM-SHA384:"
         "AES128-SHA:"
-        "AES256-SHA:"
-        "DES-CBC3-SHA"
+        "AES256-SHA"
     ),
 }
```
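In `common.py`, `HTTP_HEADERS` stores browser profiles as ordered `(name, value)` tuples; the Chrome profile is bumped to Chrome 111 and gains `Sec-Fetch-*` headers. Assuming a `None` value means "keep the slot but omit the header unless a value is supplied later" (an inference, not stated in the diff), building a header mapping from such a profile could look like this:

```python
# Sketch: turning ordered (name, value) pairs into request headers while
# preserving their order. The treatment of None values is an assumption.
from collections import OrderedDict

CHROME_HEADERS = (
    ("Connection", "keep-alive"),
    ("Upgrade-Insecure-Requests", "1"),
    ("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                   "(KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"),
    ("Referer", None),
)

def build_headers(pairs, **values):
    headers = OrderedDict()
    for name, value in pairs:
        value = values.get(name, value)
        if value is not None:
            headers[name] = value
    return headers

print(build_headers(CHROME_HEADERS, Referer="https://example.org/"))
```

Header order matters for sites that fingerprint clients, which is presumably why the profile is kept as a tuple rather than a plain dict.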
```diff
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 7b0e572..f104556 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -9,8 +9,7 @@
 """Extractors for https://danbooru.donmai.us/ and other Danbooru instances"""
 
 from .common import BaseExtractor, Message
-from ..version import __version__
-from .. import text
+from .. import text, util
 import datetime
@@ -21,36 +20,13 @@ class DanbooruExtractor(BaseExtractor):
     page_limit = 1000
     page_start = None
     per_page = 200
+    request_interval = 1.0
 
     def __init__(self, match):
-        self._init_category(match)
-
-        instance = INSTANCES.get(self.category) or {}
-        iget = instance.get
-
-        self.headers = iget("headers")
-        self.page_limit = iget("page-limit", 1000)
-        self.page_start = iget("page-start")
-        self.per_page = iget("per-page", 200)
-        self.request_interval_min = iget("request-interval-min", 0.0)
-        self._pools = iget("pools")
-        self._popular_endpoint = iget("popular", "/explore/posts/popular.json")
-
         BaseExtractor.__init__(self, match)
-
         self.ugoira = self.config("ugoira", False)
         self.external = self.config("external", False)
-        metadata = self.config("metadata", False)
-        if metadata:
-            if isinstance(metadata, (list, tuple)):
-                metadata = ",".join(metadata)
-            elif not isinstance(metadata, str):
-                metadata = "artist_commentary,children,notes,parent,uploader"
-            self.metadata_includes = metadata
-        else:
-            self.metadata_includes = None
 
         threshold = self.config("threshold")
         if isinstance(threshold, int):
             self.threshold = 1 if threshold < 1 else threshold
@@ -62,10 +38,6 @@ class DanbooruExtractor(BaseExtractor):
             self.log.debug("Using HTTP Basic Auth for user '%s'", username)
             self.session.auth = (username, api_key)
 
-    def request(self, url, **kwargs):
-        kwargs["headers"] = self.headers
-        return BaseExtractor.request(self, url, **kwargs)
-
     def skip(self, num):
         pages = num // self.per_page
         if pages >= self.page_limit:
@@ -74,32 +46,28 @@ class DanbooruExtractor(BaseExtractor):
         return pages * self.per_page
 
     def items(self):
+        self.session.headers["User-Agent"] = util.USERAGENT
+
+        includes = self.config("metadata")
+        if includes:
+            if isinstance(includes, (list, tuple)):
+                includes = ",".join(includes)
+            elif not isinstance(includes, str):
+                includes = "artist_commentary,children,notes,parent,uploader"
+
         data = self.metadata()
         for post in self.posts():
-            file = post.get("file")
-            if file:
-                url = file["url"]
-                if not url:
-                    md5 = file["md5"]
-                    url = file["url"] = (
-                        "https://static1.{}/data/{}/{}/{}.{}".format(
-                            self.root[8:], md5[0:2], md5[2:4], md5, file["ext"]
-                        ))
-                post["filename"] = file["md5"]
-                post["extension"] = file["ext"]
+            try:
+                url = post["file_url"]
+            except KeyError:
+                if self.external and post["source"]:
+                    post.update(data)
+                    yield Message.Directory, post
+                    yield Message.Queue, post["source"], post
+                continue
 
-            else:
-                try:
-                    url = post["file_url"]
-                except KeyError:
-                    if self.external and post["source"]:
-                        post.update(data)
-                        yield Message.Directory, post
-                        yield Message.Queue, post["source"], post
-                    continue
-
-                text.nameext_from_url(url, post)
+            text.nameext_from_url(url, post)
 
             if post["extension"] == "zip":
                 if self.ugoira:
@@ -109,9 +77,9 @@ class DanbooruExtractor(BaseExtractor):
                     url = post["large_file_url"]
                     post["extension"] = "webm"
 
-            if self.metadata_includes:
+            if includes:
                 meta_url = "{}/posts/{}.json?only={}".format(
-                    self.root, post["id"], self.metadata_includes)
+                    self.root, post["id"], includes)
                 post.update(self.request(meta_url).json())
 
             if url[0] == "/":
@@ -127,7 +95,7 @@ class DanbooruExtractor(BaseExtractor):
     def posts(self):
         return ()
 
-    def _pagination(self, endpoint, params, pagenum=False):
+    def _pagination(self, endpoint, params, pages=False):
         url = self.root + endpoint
         params["limit"] = self.per_page
         params["page"] = self.page_start
@@ -141,7 +109,7 @@ class DanbooruExtractor(BaseExtractor):
             if len(posts) < self.threshold:
                 return
 
-            if pagenum:
+            if pages:
                 params["page"] += 1
             else:
                 for post in reversed(posts):
@@ -163,34 +131,20 @@ class DanbooruExtractor(BaseExtractor):
                 for index, delay in enumerate(delays)]
 
 
-INSTANCES = {
+BASE_PATTERN = DanbooruExtractor.update({
     "danbooru": {
         "root": None,
         "pattern": r"(?:danbooru|hijiribe|sonohara|safebooru)\.donmai\.us",
     },
-    "e621": {
-        "root": None,
-        "pattern": r"e(?:621|926)\.net",
-        "headers": {"User-Agent": "gallery-dl/{} (by mikf)".format(
-            __version__)},
-        "pools": "sort",
-        "popular": "/popular.json",
-        "page-limit": 750,
-        "per-page": 320,
-        "request-interval-min": 1.0,
-    },
     "atfbooru": {
         "root": "https://booru.allthefallen.moe",
         "pattern": r"booru\.allthefallen\.moe",
-        "page-limit": 5000,
     },
     "aibooru": {
         "root": None,
         "pattern": r"(?:safe.)?aibooru\.online",
     }
-}
-
-BASE_PATTERN = DanbooruExtractor.update(INSTANCES)
+})
 
 
 class DanbooruTagExtractor(DanbooruExtractor):
@@ -213,10 +167,6 @@ class DanbooruTagExtractor(DanbooruExtractor):
             "pattern": r"https://i\.pximg\.net/img-original/img"
                        r"/2008/08/28/02/35/48/1476533_p0\.jpg",
         }),
-        ("https://e621.net/posts?tags=anry", {
-            "url": "8021e5ea28d47c474c1ffc9bd44863c4d45700ba",
-            "content": "501d1e5d922da20ee8ff9806f5ed3ce3a684fd58",
-        }),
         ("https://booru.allthefallen.moe/posts?tags=yume_shokunin", {
             "count": 12,
         }),
@@ -228,7 +178,6 @@ class DanbooruTagExtractor(DanbooruExtractor):
         ("https://hijiribe.donmai.us/posts?tags=bonocho"),
         ("https://sonohara.donmai.us/posts?tags=bonocho"),
         ("https://safebooru.donmai.us/posts?tags=bonocho"),
-        ("https://e926.net/posts?tags=anry"),
         ("https://safe.aibooru.online/posts?tags=center_frills"),
     )
 
@@ -254,23 +203,17 @@ class DanbooruPoolExtractor(DanbooruExtractor):
         ("https://danbooru.donmai.us/pools/7659", {
             "content": "b16bab12bea5f7ea9e0a836bf8045f280e113d99",
         }),
-        ("https://e621.net/pools/73", {
-            "url": "1bd09a72715286a79eea3b7f09f51b3493eb579a",
-            "content": "91abe5d5334425d9787811d7f06d34c77974cd22",
-        }),
         ("https://booru.allthefallen.moe/pools/9", {
             "url": "902549ffcdb00fe033c3f63e12bc3cb95c5fd8d5",
             "count": 6,
         }),
         ("https://aibooru.online/pools/1"),
         ("https://danbooru.donmai.us/pool/show/7659"),
-        ("https://e621.net/pool/show/73"),
     )
 
     def __init__(self, match):
         DanbooruExtractor.__init__(self, match)
         self.pool_id = match.group(match.lastindex)
-        self.post_ids = ()
 
     def metadata(self):
         url = "{}/pools/{}.json".format(self.root, self.pool_id)
@@ -280,29 +223,8 @@ class DanbooruPoolExtractor(DanbooruExtractor):
         return {"pool": pool}
 
     def posts(self):
-        if self._pools == "sort":
-            self.log.info("Fetching posts of pool %s", self.pool_id)
-
-            id_to_post = {
-                post["id"]: post
-                for post in self._pagination(
-                    "/posts.json", {"tags": "pool:" + self.pool_id})
-            }
-
-            posts = []
-            append = posts.append
-            for num, pid in enumerate(self.post_ids, 1):
-                if pid in id_to_post:
-                    post = id_to_post[pid]
-                    post["num"] = num
-                    append(post)
-                else:
-                    self.log.warning("Post %s is unavailable", pid)
-            return posts
-
-        else:
-            params = {"tags": "pool:" + self.pool_id}
-            return self._pagination("/posts.json", params)
+        params = {"tags": "pool:" + self.pool_id}
+        return self._pagination("/posts.json", params)
 
 
 class DanbooruPostExtractor(DanbooruExtractor):
@@ -318,10 +240,6 @@ class DanbooruPostExtractor(DanbooruExtractor):
             "pattern": r"https?://.+\.zip$",
             "options": (("ugoira", True),)
         }),
-        ("https://e621.net/posts/535", {
-            "url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529",
-            "content": "66f46e96a893fba8e694c4e049b23c2acc9af462",
-        }),
         ("https://booru.allthefallen.moe/posts/22", {
             "content": "21dda68e1d7e0a554078e62923f537d8e895cac8",
         }),
@@ -329,7 +247,6 @@ class DanbooruPostExtractor(DanbooruExtractor):
             "content": "54d548743cd67799a62c77cbae97cfa0fec1b7e9",
         }),
         ("https://danbooru.donmai.us/post/show/294929"),
-        ("https://e621.net/post/show/535"),
     )
 
     def __init__(self, match):
@@ -338,8 +255,7 @@ class DanbooruPostExtractor(DanbooruExtractor):
 
     def posts(self):
         url = "{}/posts/{}.json".format(self.root, self.post_id)
-        post = self.request(url).json()
-        return (post["post"] if "post" in post else post,)
+        return (self.request(url).json(),)
 
 
 class DanbooruPopularExtractor(DanbooruExtractor):
@@ -355,12 +271,6 @@ class DanbooruPopularExtractor(DanbooruExtractor):
             "range": "1-120",
             "count": 120,
         }),
-        ("https://e621.net/popular"),
-        (("https://e621.net/explore/posts/popular"
-          "?date=2019-06-01&scale=month"), {
-            "pattern": r"https://static\d.e621.net/data/../../[0-9a-f]+",
-            "count": ">= 70",
-        }),
         ("https://booru.allthefallen.moe/explore/posts/popular"),
         ("https://aibooru.online/explore/posts/popular"),
     )
@@ -385,31 +295,5 @@ class DanbooruPopularExtractor(DanbooruExtractor):
     def posts(self):
         if self.page_start is None:
             self.page_start = 1
-        return self._pagination(self._popular_endpoint, self.params, True)
-
-
-class DanbooruFavoriteExtractor(DanbooruExtractor):
-    """Extractor for e621 favorites"""
-    subcategory = "favorite"
-    directory_fmt = ("{category}", "Favorites", "{user_id}")
-    archive_fmt = "f_{user_id}_{id}"
-    pattern = BASE_PATTERN + r"/favorites(?:\?([^#]*))?"
-    test = (
-        ("https://e621.net/favorites"),
-        ("https://e621.net/favorites?page=2&user_id=53275", {
-            "pattern": r"https://static\d.e621.net/data/../../[0-9a-f]+",
-            "count": "> 260",
-        }),
-    )
-
-    def __init__(self, match):
-        DanbooruExtractor.__init__(self, match)
-        self.query = text.parse_query(match.group(match.lastindex))
-
-    def metadata(self):
-        return {"user_id": self.query.get("user_id", "")}
-
-    def posts(self):
-        if self.page_start is None:
-            self.page_start = 1
-        return self._pagination("/favorites.json", self.query, True)
+        return self._pagination(
+            "/explore/posts/popular.json", self.params, True)
```
```diff
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index a3187fa..37475df 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -6,7 +6,7 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extract images from https://www.deviantart.com/"""
+"""Extractors for https://www.deviantart.com/"""
 
 from .common import Extractor, Message
 from .. import text, util, exception
@@ -21,29 +21,30 @@ import re
 
 BASE_PATTERN = (
     r"(?:https?://)?(?:"
-    r"(?:www\.)?deviantart\.com/(?!watch/)([\w-]+)|"
-    r"(?!www\.)([\w-]+)\.deviantart\.com)"
+    r"(?:www\.)?(?:fx)?deviantart\.com/(?!watch/)([\w-]+)|"
+    r"(?!www\.)([\w-]+)\.(?:fx)?deviantart\.com)"
 )
 
 
 class DeviantartExtractor(Extractor):
     """Base class for deviantart extractors"""
     category = "deviantart"
+    root = "https://www.deviantart.com"
     directory_fmt = ("{category}", "{username}")
     filename_fmt = "{category}_{index}_{title}.{extension}"
     cookiedomain = None
-    root = "https://www.deviantart.com"
+    cookienames = ("auth", "auth_secure", "userinfo")
     _last_request = 0
 
     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.offset = 0
         self.flat = self.config("flat", True)
         self.extra = self.config("extra", False)
         self.original = self.config("original", True)
         self.comments = self.config("comments", False)
         self.user = match.group(1) or match.group(2)
         self.group = False
+        self.offset = 0
         self.api = None
 
         unwatch = self.config("auto-unwatch")
@@ -69,6 +70,14 @@ class DeviantartExtractor(Extractor):
         self.offset += num
         return num
 
+    def login(self):
+        if not self._check_cookies(self.cookienames):
+            username, password = self._get_auth_info()
+            if not username:
+                return False
+            self._update_cookies(_login_impl(self, username, password))
+        return True
+
     def items(self):
         self.api = DeviantartOAuthAPI(self)
 
@@ -87,6 +96,13 @@ class DeviantartExtractor(Extractor):
                 yield Message.Queue, url, data
                 continue
 
+            if deviation["is_deleted"]:
+                # prevent crashing in case the deviation really is
+                # deleted
+                self.log.debug(
+                    "Skipping %s (deleted)", deviation["deviationid"])
+                continue
+
             if "premium_folder_data" in deviation:
                 data = self._fetch_premium(deviation)
                 if not data:
@@ -346,9 +362,7 @@ class DeviantartExtractor(Extractor):
         kwargs["fatal"] = None
         diff = time.time() - DeviantartExtractor._last_request
         if diff < 2.0:
-            delay = 2.0 - diff
-            self.log.debug("Sleeping %.2f seconds", delay)
-            time.sleep(delay)
+            self.sleep(2.0 - diff, "request")
 
         while True:
             response = self.request(url, **kwargs)
@@ -406,6 +420,16 @@ class DeviantartExtractor(Extractor):
             self.log.info("Unwatching %s", username)
             self.api.user_friends_unwatch(username)
 
+    def _eclipse_to_oauth(self, eclipse_api, deviations):
+        for obj in deviations:
+            deviation = obj["deviation"] if "deviation" in obj else obj
+            deviation_uuid = eclipse_api.deviation_extended_fetch(
+                deviation["deviationId"],
+                deviation["author"]["username"],
+                "journal" if deviation["isJournal"] else "art",
+            )["deviation"]["extended"]["deviationUuid"]
+            yield self.api.deviation(deviation_uuid)
+
 
 class DeviantartUserExtractor(DeviantartExtractor):
     """Extractor for an artist's user profile"""
@@ -676,15 +700,9 @@ class DeviantartFavoriteExtractor(DeviantartExtractor):
     )
 
     def deviations(self):
-        folders = self.api.collections_folders(self.user)
         if self.flat:
-            deviations = itertools.chain.from_iterable(
-                self.api.collections(self.user, folder["folderid"])
-                for folder in folders
-            )
-            if self.offset:
-                deviations = util.advance(deviations, self.offset)
-            return deviations
+            return self.api.collections_all(self.user, self.offset)
+        folders = self.api.collections_folders(self.user)
         return self._folder_urls(
             folders, "favourites", DeviantartCollectionExtractor)
 
@@ -796,6 +814,14 @@ class DeviantartStatusExtractor(DeviantartExtractor):
                 "url" : "re:^https://sta.sh",
             },
         }),
+        # "deleted" deviations in 'items'
+        ("https://www.deviantart.com/AndrejSKalin/posts/statuses", {
+            "options": (("journals", "none"), ("original", 0),
+                        ("image-filter", "deviationid[:8] == '147C8B03'")),
+            "count": 2,
+            "archive": False,
+            "keyword": {"deviationid": "147C8B03-7D34-AE93-9241-FA3C6DBBC655"}
+        }),
         ("https://www.deviantart.com/justgalym/posts/statuses", {
             "options": (("journals", "text"),),
             "url": "c8744f7f733a3029116607b826321233c5ca452d",
@@ -861,8 +887,7 @@ class DeviantartPopularExtractor(DeviantartExtractor):
                      "{popular[range]}", "{popular[search]}")
     archive_fmt = "P_{popular[range]}_{popular[search]}_{index}.{extension}"
     pattern = (r"(?:https?://)?www\.deviantart\.com/(?:"
-               r"search(?:/deviations)?"
-               r"|(?:deviations/?)?\?order=(popular-[^/?#]+)"
+               r"(?:deviations/?)?\?order=(popular-[^/?#]+)"
                r"|((?:[\w-]+/)*)(popular-[^/?#]+)"
                r")/?(?:\?([^#]*))?")
     test = (
@@ -876,8 +901,6 @@ class DeviantartPopularExtractor(DeviantartExtractor):
             "range": "1-30",
             "count": 30,
         }),
-        ("https://www.deviantart.com/search?q=tree"),
-        ("https://www.deviantart.com/search/deviations?order=popular-1-week"),
         ("https://www.deviantart.com/artisan/popular-all-time/?q=tree"),
     )
 
@@ -974,7 +997,7 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
     subcategory = "deviation"
     archive_fmt = "g_{_username}_{index}.{extension}"
     pattern = (BASE_PATTERN + r"/(art|journal)/(?:[^/?#]+-)?(\d+)"
-               r"|(?:https?://)?(?:www\.)?deviantart\.com/"
+               r"|(?:https?://)?(?:www\.)?(?:fx)?deviantart\.com/"
               r"(?:view/|deviation/|view(?:-full)?\.php/*\?(?:[^#]+&)?id=)"
               r"(\d+)"  # bare deviation ID without slug
               r"|(?:https?://)?fav\.me/d([0-9a-z]+)")  # base36
@@ -1068,6 +1091,9 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
         # old /view/ URLs from the Wayback Machine
         ("https://www.deviantart.com/view.php?id=14864502"),
         ("http://www.deviantart.com/view-full.php?id=100842"),
+
+        ("https://www.fxdeviantart.com/zzz/art/zzz-1234567890"),
+        ("https://www.fxdeviantart.com/view/1234567890"),
     )
 
     skip = Extractor.skip
@@ -1094,6 +1120,7 @@ class DeviantartScrapsExtractor(DeviantartExtractor):
     subcategory = "scraps"
     directory_fmt = ("{category}", "{username}", "Scraps")
     archive_fmt = "s_{_username}_{index}.{extension}"
+    cookiedomain = ".deviantart.com"
     pattern = BASE_PATTERN + r"/gallery/(?:\?catpath=)?scraps\b"
     test = (
         ("https://www.deviantart.com/shimoda7/gallery/scraps", {
@@ -1102,34 +1129,109 @@ class DeviantartScrapsExtractor(DeviantartExtractor):
         ("https://www.deviantart.com/shimoda7/gallery/?catpath=scraps"),
         ("https://shimoda7.deviantart.com/gallery/?catpath=scraps"),
     )
+
+    def deviations(self):
+        self.login()
+
+        eclipse_api = DeviantartEclipseAPI(self)
+        return self._eclipse_to_oauth(
+            eclipse_api, eclipse_api.gallery_scraps(self.user, self.offset))
+
+
+class DeviantartSearchExtractor(DeviantartExtractor):
+    """Extractor for deviantart search results"""
+    subcategory = "search"
+    directory_fmt = ("{category}", "Search", "{search_tags}")
+    archive_fmt = "Q_{search_tags}_{index}.{extension}"
     cookiedomain = ".deviantart.com"
-    cookienames = ("auth", "auth_secure", "userinfo")
-    _warning = True
+    pattern = (r"(?:https?://)?www\.deviantart\.com"
+               r"/search(?:/deviations)?/?\?([^#]+)")
+    test = (
+        ("https://www.deviantart.com/search?q=tree"),
+        ("https://www.deviantart.com/search/deviations?order=popular-1-week"),
+    )
+
+    skip = Extractor.skip
+
+    def __init__(self, match):
+        DeviantartExtractor.__init__(self, match)
+        self.query = text.parse_query(self.user)
+        self.search = self.query.get("q", "")
+        self.user = ""
+
+    def deviations(self):
+        logged_in = self.login()
+
+        eclipse_api = DeviantartEclipseAPI(self)
+        search = (eclipse_api.search_deviations
+                  if logged_in else self._search_html)
+        return self._eclipse_to_oauth(eclipse_api, search(self.query))
+
+    def prepare(self, deviation):
+        DeviantartExtractor.prepare(self, deviation)
+        deviation["search_tags"] = self.search
+
+    def _search_html(self, params):
+        url = self.root + "/search"
+        deviation = {
+            "deviationId": None,
+            "author": {"username": "u"},
+            "isJournal": False,
+        }
+
+        while True:
+            page = self.request(url, params=params).text
+
+            items , pos = text.rextract(page, r'\"items\":[', ']')
+            cursor, pos = text.extract(page, r'\"cursor\":\"', '\\', pos)
+
+            for deviation_id in items.split(","):
+                deviation["deviationId"] = deviation_id
+                yield deviation
+
+            if not cursor:
+                return
+            params["cursor"] = cursor
+
+
+class DeviantartGallerySearchExtractor(DeviantartExtractor):
+    """Extractor for deviantart gallery searches"""
+    subcategory = "gallery-search"
+    archive_fmt = "g_{_username}_{index}.{extension}"
+    cookiedomain = ".deviantart.com"
+    pattern = BASE_PATTERN + r"/gallery/?\?(q=[^#]+)"
+    test = (
+        ("https://www.deviantart.com/shimoda7/gallery?q=memory", {
+            "options": (("original", 0),),
+            "content": "6a7c74dc823ebbd457bdd9b3c2838a6ee728091e",
+        }),
+        ("https://www.deviantart.com/shimoda7/gallery?q=memory&sort=popular"),
+    )
+
+    def __init__(self, match):
+        DeviantartExtractor.__init__(self, match)
+        self.query = match.group(3)
 
     def deviations(self):
         self.login()
 
         eclipse_api = DeviantartEclipseAPI(self)
-        for obj in eclipse_api.gallery_scraps(self.user, self.offset):
-            deviation = obj["deviation"]
-            deviation_uuid = eclipse_api.deviation_extended_fetch(
-                deviation["deviationId"],
-                deviation["author"]["username"],
-                "journal" if deviation["isJournal"] else "art",
-            )["deviation"]["extended"]["deviationUuid"]
+        info = eclipse_api.user_info(self.user)
 
-            yield self.api.deviation(deviation_uuid)
+        query = text.parse_query(self.query)
+        self.search = query["q"]
 
-    def login(self):
-        """Login and obtain session cookies"""
-        if not self._check_cookies(self.cookienames):
-            username, password = self._get_auth_info()
-            if username:
-                self._update_cookies(_login_impl(self, username, password))
-            elif self._warning:
-                self.log.warning(
-                    "No session cookies set: Unable to fetch mature scraps.")
-                DeviantartScrapsExtractor._warning = False
+        return self._eclipse_to_oauth(
+            eclipse_api, eclipse_api.galleries_search(
+                info["user"]["userId"],
+                self.search,
+                self.offset,
+                query.get("sort", "most-recent"),
+            ))
+
+    def prepare(self, deviation):
+        DeviantartExtractor.prepare(self, deviation)
+        deviation["search_tags"] = self.search
 
 
 class DeviantartFollowingExtractor(DeviantartExtractor):
@@ -1261,6 +1363,13 @@ class DeviantartOAuthAPI():
                   "mature_content": self.mature}
         return self._pagination(endpoint, params)
 
+    def collections_all(self, username, offset=0):
+        """Yield all deviations in a user's collection"""
+        endpoint = "/collections/all"
+        params = {"username": username, "offset": offset, "limit": 24,
+                  "mature_content": self.mature}
+        return self._pagination(endpoint, params)
+
     @memcache(keyarg=1)
     def collections_folders(self, username, offset=0):
         """Yield all collection folders of a specific user"""
@@ -1411,7 +1520,7 @@ class DeviantartOAuthAPI():
 
         while True:
             if self.delay:
-                time.sleep(self.delay)
+                self.extractor.sleep(self.delay, "api")
 
             self.authenticate(None if public else self.refresh_token_key)
             kwargs["headers"] = self.headers
@@ -1480,6 +1589,15 @@ class DeviantartOAuthAPI():
                     self._metadata(results)
                 if self.folders:
                     self._folders(results)
+            else:  # attempt to fix "deleted" deviations
+                for dev in self._shared_content(results):
+                    if not dev["is_deleted"]:
+                        continue
+                    patch = self._call(
+                        "/deviation/" + dev["deviationid"], fatal=False)
+                    if patch:
+                        dev.update(patch)
+
             yield from results
 
             if not data["has_more"] and (
@@ -1497,6 +1615,14 @@ class DeviantartOAuthAPI():
                 return
             params["offset"] = int(params["offset"]) + len(results)
 
+    @staticmethod
+    def _shared_content(results):
+        """Return an iterable of shared deviations in 'results'"""
+        for result in results:
+            for item in result.get("items") or ():
+                if "deviation" in item:
+                    yield item["deviation"]
+
     def _pagination_list(self, endpoint, params, key="results"):
         result = []
         result.extend(self._pagination(endpoint, params, False, key=key))
@@ -1585,6 +1711,29 @@ class DeviantartEclipseAPI():
         }
         return self._pagination(endpoint, params)
 
+    def galleries_search(self, user_id, query,
+                         offset=None, order="most-recent"):
+        endpoint = "/shared_api/galleries/search"
+        params = {
+            "userid": user_id,
+            "order" : order,
+            "q"     : query,
+            "offset": offset,
+            "limit" : 24,
+        }
+        return self._pagination(endpoint, params)
+
+    def search_deviations(self, params):
+        endpoint = "/da-browse/api/networkbar/search/deviations"
+        return self._pagination(endpoint, params, key="deviations")
+
+    def user_info(self, user, expand=False):
+        endpoint = "/shared_api/user/info"
+        params = {"username": user}
+        if expand:
+            params["expand"] = "user.stats,user.profile,user.watch"
+        return self._call(endpoint, params)
+
     def user_watching(self, user, offset=None):
         endpoint = "/da-user-profile/api/module/watching"
         params = {
@@ -1611,23 +1760,37 @@ class DeviantartEclipseAPI():
         except Exception:
             return {"error": response.text}
 
-    def _pagination(self, endpoint, params):
+    def _pagination(self, endpoint, params, key="results"):
+        limit = params.get("limit", 24)
+        warn = True
+
         while True:
             data = self._call(endpoint, params)
 
-            results = data.get("results")
+            results = data.get(key)
             if results is None:
                 return
+            if len(results) < limit and warn and data.get("hasMore"):
+                warn = False
+                self.log.warning(
+                    "Private deviations detected! "
+                    "Provide login credentials or session cookies "
+                    "to be able to access them.")
             yield from results
 
             if not data.get("hasMore"):
                 return
 
-            next_offset = data.get("nextOffset")
-            if next_offset:
-                params["offset"] = next_offset
+            if "nextCursor" in data:
+                params["offset"] = None
+                params["cursor"] = data["nextCursor"]
+            elif "nextOffset" in data:
+                params["offset"] = data["nextOffset"]
+                params["cursor"] = None
+            elif params.get("offset") is None:
+                return
            else:
-                params["offset"] += params["limit"]
+                params["offset"] = int(params["offset"]) + len(results)
 
     def _module_id_watching(self, user):
         url = "{}/{}/about".format(self.extractor.root, user)
diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py
index 8b90250..e85eb8d 100644
--- a/gallery_dl/extractor/directlink.py
+++ b/gallery_dl/extractor/directlink.py
@@ -44,6 +44,11 @@ class DirectlinkExtractor(Extractor):
         ("https://post-phinf.pstatic.net/MjAxOTA1MjlfMTQ4/MDAxNTU5MTI2NjcyNTkw"
          ".JUzkGb4V6dj9DXjLclrOoqR64uDxHFUO5KDriRdKpGwg.88mCtd4iT1NHlpVKSCaUpP"
          "mZPiDgT8hmQdQ5K_gYyu0g.JPEG/2.JPG"),
+        # internationalized domain name
+        ("https://räksmörgås.josefsson.org/raksmorgas.jpg", {
+            "url": "a65667f670b194afbd1e3ea5e7a78938d36747da",
+            "keyword": "fd5037fe86eebd4764e176cbaf318caec0f700be",
+        }),
     )
 
     def __init__(self, match):
diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py
index d78f25b..59e8c90 100644
--- a/gallery_dl/extractor/dynastyscans.py
+++ b/gallery_dl/extractor/dynastyscans.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-# Copyright 2015-2022 Mike Fährmann
+# Copyright 2015-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -9,8 +9,7 @@
 """Extractors for https://dynasty-scans.com/"""
 
 from .common import ChapterExtractor, MangaExtractor, Extractor, Message
-from .. import text
-import json
+from .. import text, util
 import re
 
 BASE_PATTERN = r"(?:https?://)?(?:www\.)?dynasty-scans\.com"
@@ -86,7 +85,7 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
         data = text.extr(page, "var pages = ", ";\n")
         return [
             (self.root + img["image"], None)
-            for img in json.loads(data)
+            for img in util.json_loads(data)
         ]
```
```diff
diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py
new file mode 100644
index 0000000..8f2994e
--- /dev/null
+++ b/gallery_dl/extractor/e621.py
@@ -0,0 +1,254 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2023 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://e621.net/ and other e621 instances"""
+
+from .common import Message
+from . import danbooru
+from .. import text, util
+
+
+class E621Extractor(danbooru.DanbooruExtractor):
+    """Base class for e621 extractors"""
+    basecategory = "E621"
+    page_limit = 750
+    page_start = None
+    per_page = 320
+    request_interval_min = 1.0
+
+    def items(self):
+        self.session.headers["User-Agent"] = util.USERAGENT + " (by mikf)"
+
+        includes = self.config("metadata") or ()
+        if includes:
+            if isinstance(includes, str):
+                includes = includes.split(",")
+            elif not isinstance(includes, (list, tuple)):
+                includes = ("notes", "pools")
+
+        notes = ("notes" in includes)
+        pools = ("pools" in includes)
+
+        data = self.metadata()
+        for post in self.posts():
+            file = post["file"]
+
+            if not file["url"]:
+                md5 = file["md5"]
+                file["url"] = "https://static1.{}/data/{}/{}/{}.{}".format(
+                    self.root[8:], md5[0:2], md5[2:4], md5, file["ext"])
+
+            if notes and post.get("has_notes"):
+                url = "{}/notes.json?search[post_id]={}".format(
+                    self.root, post["id"])
+                post["notes"] = self.request(url).json()
+
+            if pools and post["pools"]:
+                url = "{}/pools.json?search[id]={}".format(
+                    self.root, ",".join(map(str, post["pools"])))
+                post["pools"] = _pools = self.request(url).json()
+                for pool in _pools:
+                    pool["name"] = pool["name"].replace("_", " ")
+
+            post["filename"] = file["md5"]
+            post["extension"] = file["ext"]
+
+            post.update(data)
+            yield Message.Directory, post
+            yield Message.Url, file["url"], post
+
+
+BASE_PATTERN = E621Extractor.update({
+    "e621": {
+        "root": "https://e621.net",
+        "pattern": r"e621\.net",
+    },
+    "e926": {
+        "root": "https://e926.net",
+        "pattern": r"e926\.net",
+    },
+})
+
+
+class E621TagExtractor(E621Extractor, danbooru.DanbooruTagExtractor):
+    """Extractor for e621 posts from tag searches"""
+    pattern = BASE_PATTERN + r"/posts?(?:\?.*?tags=|/index/\d+/)([^&#]+)"
+    test = (
+        ("https://e621.net/posts?tags=anry", {
+            "url": "8021e5ea28d47c474c1ffc9bd44863c4d45700ba",
+            "content": "501d1e5d922da20ee8ff9806f5ed3ce3a684fd58",
+        }),
+        ("https://e621.net/post/index/1/anry"),
+        ("https://e621.net/post?tags=anry"),
+
+        ("https://e926.net/posts?tags=anry", {
+            "url": "12198b275c62ffe2de67cca676c8e64de80c425d",
+            "content": "501d1e5d922da20ee8ff9806f5ed3ce3a684fd58",
+        }),
+        ("https://e926.net/post/index/1/anry"),
+        ("https://e926.net/post?tags=anry"),
+    )
+
+
+class E621PoolExtractor(E621Extractor, danbooru.DanbooruPoolExtractor):
+    """Extractor for e621 pools"""
+    pattern = BASE_PATTERN + r"/pool(?:s|/show)/(\d+)"
+    test = (
+        ("https://e621.net/pools/73", {
+            "url": "1bd09a72715286a79eea3b7f09f51b3493eb579a",
+            "content": "91abe5d5334425d9787811d7f06d34c77974cd22",
+        }),
+        ("https://e621.net/pool/show/73"),
+
+        ("https://e926.net/pools/73", {
+            "url": "6936f1b6a18c5c25bee7cad700088dbc2503481b",
+            "content": "91abe5d5334425d9787811d7f06d34c77974cd22",
+        }),
+        ("https://e926.net/pool/show/73"),
+    )
+
+    def posts(self):
+        self.log.info("Fetching posts of pool %s", self.pool_id)
+
+        id_to_post = {
+            post["id"]: post
+            for post in self._pagination(
+                "/posts.json", {"tags": "pool:" + self.pool_id})
+        }
+
+        posts = []
+        append = posts.append
+        for num, pid in enumerate(self.post_ids, 1):
+            if pid in id_to_post:
+                post = id_to_post[pid]
+                post["num"] = num
+                append(post)
+            else:
+                self.log.warning("Post %s is unavailable", pid)
+        return posts
+
+
+class E621PostExtractor(E621Extractor, danbooru.DanbooruPostExtractor):
+    """Extractor for single e621 posts"""
+    pattern = BASE_PATTERN + r"/post(?:s|/show)/(\d+)"
+    test = (
+        ("https://e621.net/posts/535", {
+            "url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529",
+            "content": "66f46e96a893fba8e694c4e049b23c2acc9af462",
+        }),
+        ("https://e621.net/posts/3181052", {
+            "options": (("metadata", "notes,pools"),),
+            "pattern": r"https://static\d\.e621\.net/data/c6/8c"
+                       r"/c68cca0643890b615f75fb2719589bff\.png",
+            "keyword": {
+                "notes": [
+                    {
+                        "body": "Little Legends 2",
+                        "created_at": "2022-05-16T13:58:38.877-04:00",
+                        "creator_id": 517450,
+                        "creator_name": "EeveeCuddler69",
+                        "height": 475,
+                        "id": 321296,
+                        "is_active": True,
+                        "post_id": 3181052,
+                        "updated_at": "2022-05-16T13:59:02.050-04:00",
+                        "version": 3,
+                        "width": 809,
+                        "x": 83,
+                        "y": 117,
+                    },
+                ],
+                "pools": [
+                    {
+                        "category": "series",
+                        "created_at": "2022-02-17T00:29:22.669-05:00",
+                        "creator_id": 1077440,
+                        "creator_name": "Yeetus90",
+                        "description": "* \"Little Legends\":/pools/27971\r\n"
+                                       "* Little Legends 2\r\n"
+                                       "* \"Little Legends 3\":/pools/27481",
+                        "id": 27492,
+                        "is_active": False,
+                        "name": "Little Legends 2",
+                        "post_count": 39,
+                        "post_ids": list,
+                        "updated_at": "2022-03-27T06:30:03.382-04:00"
+                    },
+                ],
+            },
+        }),
+        ("https://e621.net/post/show/535"),
+
+        ("https://e926.net/posts/535", {
+            "url": "17aec8ebd8fab098d321adcb62a2db59dab1f4bf",
+            "content": "66f46e96a893fba8e694c4e049b23c2acc9af462",
+        }),
+        ("https://e926.net/post/show/535"),
+    )
+
+    def posts(self):
+        url = "{}/posts/{}.json".format(self.root, self.post_id)
+        return (self.request(url).json()["post"],)
+
+
+class E621PopularExtractor(E621Extractor, danbooru.DanbooruPopularExtractor):
+    """Extractor for popular images from e621"""
+    pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?([^#]*))?"
+    test = (
+        ("https://e621.net/explore/posts/popular"),
+        (("https://e621.net/explore/posts/popular"
+          "?date=2019-06-01&scale=month"), {
+            "pattern": r"https://static\d.e621.net/data/../../[0-9a-f]+",
+            "count": ">= 70",
+        }),
+
+        ("https://e926.net/explore/posts/popular"),
+        (("https://e926.net/explore/posts/popular"
+          "?date=2019-06-01&scale=month"), {
+            "pattern": r"https://static\d.e926.net/data/../../[0-9a-f]+",
+            "count": ">= 70",
+        }),
+    )
+
+    def posts(self):
+        if self.page_start is None:
+            self.page_start = 1
+        return self._pagination("/popular.json", self.params, True)
+
+
+class E621FavoriteExtractor(E621Extractor):
+    """Extractor for e621 favorites"""
+    subcategory = "favorite"
+    directory_fmt = ("{category}", "Favorites", "{user_id}")
+    archive_fmt = "f_{user_id}_{id}"
+    pattern = BASE_PATTERN + r"/favorites(?:\?([^#]*))?"
+    test = (
+        ("https://e621.net/favorites"),
+        ("https://e621.net/favorites?page=2&user_id=53275", {
+            "pattern": r"https://static\d.e621.net/data/../../[0-9a-f]+",
+            "count": "> 260",
+        }),
+
+        ("https://e926.net/favorites"),
+        ("https://e926.net/favorites?page=2&user_id=53275", {
+            "pattern": r"https://static\d.e926.net/data/../../[0-9a-f]+",
+            "count": "> 260",
+        }),
+    )
+
+    def __init__(self, match):
+        E621Extractor.__init__(self, match)
+        self.query = text.parse_query(match.group(match.lastindex))
+
+    def metadata(self):
+        return {"user_id": self.query.get("user_id", "")}
+
+    def posts(self):
+        if self.page_start is None:
+            self.page_start = 1
+        return self._pagination("/favorites.json", self.query, True)
```
```diff
diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py
index ad3f16b..03307f8 100644
--- a/gallery_dl/extractor/erome.py
+++ b/gallery_dl/extractor/erome.py
@@ -12,7 +12,6 @@
 from .common import Extractor, Message
 from .. import text, util, exception
 from ..cache import cache
 import itertools
-import time
 
 BASE_PATTERN = r"(?:https?://)?(?:www\.)?erome\.com"
@@ -75,7 +74,7 @@ class EromeExtractor(Extractor):
             if response.content.find(
                     b"<title>Please wait a few moments</title>", 0, 600) < 0:
                 return response
-            time.sleep(5)
+            self.sleep(5.0, "check")
 
     def _pagination(self, url, params):
         for params["page"] in itertools.count(1):
diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py
index 57587b6..0503dcf 100644
--- a/gallery_dl/extractor/fallenangels.py
+++ b/gallery_dl/extractor/fallenangels.py
@@ -6,11 +6,10 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extract manga-chapters from https://www.fascans.com/"""
+"""Extractors for https://www.fascans.com/"""
 
 from .common import ChapterExtractor, MangaExtractor
 from .. import text, util
-import json
 
 
 class FallenangelsChapterExtractor(ChapterExtractor):
@@ -56,7 +55,7 @@ class FallenangelsChapterExtractor(ChapterExtractor):
     def images(page):
         return [
             (img["page_image"], None)
-            for img in json.loads(
+            for img in util.json_loads(
                 text.extr(page, "var pages = ", ";")
             )
         ]
diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py
index 41431dc..57c4333 100644
--- a/gallery_dl/extractor/fanbox.py
+++ b/gallery_dl/extractor/fanbox.py
@@ -89,6 +89,7 @@ class FanboxExtractor(Extractor):
             content_body["imageMap"] = {
                 image_id: image_map[image_id]
                 for image_id in images
+                if image_id in image_map
             }
 
         post["content"] = "\n".join(content)
@@ -256,7 +257,6 @@ class FanboxCreatorExtractor(FanboxExtractor):
 
     def posts(self):
         url = "https://api.fanbox.cc/post.listCreator?creatorId={}&limit=10"
-
         return self._pagination(url.format(self.creator_id))
diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py
index 476fdeb..13dfead 100644
--- a/gallery_dl/extractor/fantia.py
+++ b/gallery_dl/extractor/fantia.py
@@ -7,8 +7,7 @@
 """Extractors for https://fantia.jp/"""
 
 from .common import Extractor, Message
-from .. import text
-import json
+from .. import text, util
 
 
 class FantiaExtractor(Extractor):
@@ -117,7 +116,7 @@ class FantiaExtractor(Extractor):
                 yield self.root+"/"+content["download_uri"], post
 
             if content["category"] == "blog" and "comment" in content:
-                comment_json = json.loads(content["comment"])
+                comment_json = util.json_loads(content["comment"])
                 ops = comment_json.get("ops", ())
 
                 # collect blogpost text first
```
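The erome change above replaces `time.sleep(5)` with the extractor's own `self.sleep()`; the surrounding loop keeps re-requesting until the "Please wait a few moments" interstitial page disappears. A simplified model of that polling pattern outside the extractor framework:

```python
# Simplified model of the erome retry loop, using plain requests and
# time.sleep in place of the extractor's self.sleep(); the retry cap is
# an addition for safety, not part of the original code.
import time
import requests

def fetch_until_ready(url, tries=5):
    for _ in range(tries):
        response = requests.get(url, timeout=30)
        # the interstitial page carries this <title> near the top
        if response.content.find(
                b"<title>Please wait a few moments</title>", 0, 600) < 0:
            return response
        time.sleep(5.0)
    raise RuntimeError("still rate-limited after %d tries" % tries)
```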
```diff
diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py
index 2290cc2..4a38fb4 100644
--- a/gallery_dl/extractor/foolslide.py
+++ b/gallery_dl/extractor/foolslide.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-# Copyright 2016-2022 Mike Fährmann
+# Copyright 2016-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -10,7 +10,6 @@
 from .common import BaseExtractor, Message
 from .. import text, util
-import json
 
 
 class FoolslideExtractor(BaseExtractor):
@@ -106,7 +105,7 @@ class FoolslideChapterExtractor(FoolslideExtractor):
         })
 
     def images(self, page):
-        return json.loads(text.extr(page, "var pages = ", ";"))
+        return util.json_loads(text.extr(page, "var pages = ", ";"))
 
 
 class FoolslideMangaExtractor(FoolslideExtractor):
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index 8d73949..80b0ae1 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -13,6 +13,8 @@
 from . import gelbooru_v02
 from .. import text, exception
 import binascii
 
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?\?"
+
 
 class GelbooruBase():
     """Base class for gelbooru extractors"""
@@ -53,6 +55,23 @@ class GelbooruBase():
             del params["pid"]
             params["tags"] = "{} id:<{}".format(self.tags, post["id"])
 
+    def _pagination_html(self, params):
+        url = self.root + "/index.php"
+        params["pid"] = self.page_start * self.per_page
+
+        data = {}
+        while True:
+            num_ids = 0
+            page = self.request(url, params=params).text
+
+            for data["id"] in text.extract_iter(page, '" id="p', '"'):
+                num_ids += 1
+                yield from self._api_request(data)
+
+            if num_ids < self.per_page:
+                return
+            params["pid"] += self.per_page
+
     @staticmethod
     def _file_url(post):
         url = post["file_url"]
@@ -88,8 +107,7 @@ class GelbooruBase():
 class GelbooruTagExtractor(GelbooruBase,
                            gelbooru_v02.GelbooruV02TagExtractor):
     """Extractor for images from gelbooru.com based on search-tags"""
-    pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
-               r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")
+    pattern = BASE_PATTERN + r"page=post&s=list&tags=([^&#]+)"
     test = (
         ("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", {
             "count": 5,
@@ -108,8 +126,7 @@ class GelbooruPoolExtractor(GelbooruBase,
                             gelbooru_v02.GelbooruV02PoolExtractor):
     """Extractor for gelbooru pools"""
     per_page = 45
-    pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
-               r"\?page=pool&s=show&id=(?P<pool>\d+)")
+    pattern = BASE_PATTERN + r"page=pool&s=show&id=(\d+)"
     test = (
         ("https://gelbooru.com/index.php?page=pool&s=show&id=761", {
             "count": 6,
@@ -124,9 +141,9 @@ class GelbooruPoolExtractor(GelbooruBase,
             "id"  : self.pool_id,
             "pid" : self.page_start,
         }
-        self._page = self.request(url, params=self._params).text
+        page = self.request(url, params=self._params).text
 
-        name, pos = text.extract(self._page, "<h3>Now Viewing: ", "</h3>")
+        name, pos = text.extract(page, "<h3>Now Viewing: ", "</h3>")
         if not name:
             raise exception.NotFoundError("pool")
 
@@ -136,29 +153,19 @@ class GelbooruPoolExtractor(GelbooruBase,
         }
 
     def posts(self):
-        url = self.root + "/index.php"
-        params = self._params
+        return self._pagination_html(self._params)
 
-        page = self._page
-        del self._page
-        data = {}
-
-        while True:
-            num_ids = 0
-            for data["id"] in text.extract_iter(page, '" id="p', '"'):
-                num_ids += 1
-                yield from self._api_request(data)
 
-            if num_ids < self.per_page:
-                return
-            params["pid"] += self.per_page
-            page = self.request(url, params=params).text
+class GelbooruFavoriteExtractor(GelbooruBase,
+                                gelbooru_v02.GelbooruV02FavoriteExtractor):
+    pattern = BASE_PATTERN + r"page=favorites&s=view&id=(\d+)"
+    test = ("https://gelbooru.com/index.php?page=favorites&s=view&id=12345",)
 
 
 class GelbooruPostExtractor(GelbooruBase,
                             gelbooru_v02.GelbooruV02PostExtractor):
     """Extractor for single images from gelbooru.com"""
-    pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?\?"
+    pattern = (BASE_PATTERN
               r"(?=(?:[^#]+&)?page=post(?:&|#|$))"
               r"(?=(?:[^#]+&)?s=view(?:&|#|$))"
               r"(?:[^#]+&)?id=(\d+)")
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py
index 9292da3..9999283 100644
--- a/gallery_dl/extractor/generic.py
+++ b/gallery_dl/extractor/generic.py
@@ -32,6 +32,28 @@ class GenericExtractor(Extractor):
         (?:\#(?P<fragment>.*))?         # optional fragment
         """
 
+    test = (
+        ("generic:https://www.nongnu.org/lzip/", {
+            "count": 1,
+            "content": "40be5c77773d3e91db6e1c5df720ee30afb62368",
+            "keyword": {
+                "description": "Lossless data compressor",
+                "imageurl": "https://www.nongnu.org/lzip/lzip.png",
+                "keywords": "lzip, clzip, plzip, lzlib, LZMA, bzip2, "
+                            "gzip, data compression, GNU, free software",
+                "pageurl": "https://www.nongnu.org/lzip/",
+            },
+        }),
+        # internationalized domain name
+        ("generic:https://räksmörgås.josefsson.org/", {
+            "count": 2,
+            "pattern": "^https://räksmörgås.josefsson.org/",
+        }),
+        ("generic:https://en.wikipedia.org/Main_Page"),
+        ("generic:https://example.org/path/to/file?que=1?&ry=2/#fragment"),
+        ("generic:https://example.org/%27%3C%23/%23%3E%27.htm?key=%3C%26%3E"),
+    )
+
     def __init__(self, match):
         """Init."""
         Extractor.__init__(self, match)
@@ -56,7 +78,7 @@ class GenericExtractor(Extractor):
         self.root = self.scheme + match.group('domain')
 
     def items(self):
-        """Get page, extract metadata & images, yield them in suitable messages.
+        """Get page, extract metadata & images, yield them in suitable messages
 
         Adapted from common.GalleryExtractor.items()
diff --git a/gallery_dl/extractor/hbrowse.py b/gallery_dl/extractor/hbrowse.py
index 43479c6..5b561ea 100644
--- a/gallery_dl/extractor/hbrowse.py
+++ b/gallery_dl/extractor/hbrowse.py
@@ -1,16 +1,15 @@
 # -*- coding: utf-8 -*-
-# Copyright 2015-2019 Mike Fährmann
+# Copyright 2015-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extract images from https://www.hbrowse.com/"""
+"""Extractors for https://www.hbrowse.com/"""
 
 from .common import ChapterExtractor, MangaExtractor
-from .. import text, exception
-import json
+from .. import text, util, exception
 
 
 class HbrowseBase():
@@ -68,7 +67,7 @@ class HbrowseChapterExtractor(HbrowseBase, ChapterExtractor):
     def images(self, page):
         base = self.root + "/data" + self.path
         json_data = text.extract(page, ';list = ', ',"zzz"')[0] + "]"
-        return [(base + name, None) for name in json.loads(json_data)]
+        return [(base + name, None) for name in util.json_loads(json_data)]
 
 
 class HbrowseMangaExtractor(HbrowseBase, MangaExtractor):
```
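Gelbooru's new `_pagination_html()` (a few hunks up) scrapes post ids from HTML listing pages and resolves each id through the JSON API; the pool and new favorites extractors both ride on it. A schematic version with stand-in request helpers:

```python
# Schematic version of GelbooruBase._pagination_html(); fetch_page and
# fetch_post stand in for the extractor's request/_api_request helpers.
import re

def paginate_html(fetch_page, fetch_post, params, per_page=45):
    params["pid"] = 0
    while True:
        page = fetch_page(params)                 # returns HTML text
        # mirrors text.extract_iter(page, '" id="p', '"')
        ids = re.findall(r'" id="p(\d+)"', page)
        for post_id in ids:
            yield fetch_post(post_id)             # one API lookup per id
        if len(ids) < per_page:
            return
        params["pid"] += per_page                 # pid is an item offset
```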
import text, util import re @@ -78,7 +77,7 @@ class Hentai2readChapterExtractor(Hentai2readBase, ChapterExtractor): images = text.extract(page, "'images' : ", ",\n")[0] return [ ("https://hentaicdn.com/hentai" + part, None) - for part in json.loads(images) + for part in util.json_loads(images) ] diff --git a/gallery_dl/extractor/hentaifox.py b/gallery_dl/extractor/hentaifox.py index 0327f56..ed8576f 100644 --- a/gallery_dl/extractor/hentaifox.py +++ b/gallery_dl/extractor/hentaifox.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2021 Mike Fährmann +# Copyright 2019-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,8 +9,7 @@ """Extractors for https://hentaifox.com/""" from .common import GalleryExtractor, Extractor, Message -from .. import text -import json +from .. import text, util class HentaifoxBase(): @@ -90,7 +89,7 @@ class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor): server1 = "https://i.hentaifox.com" server2 = "https://i2.hentaifox.com" - for num, image in json.loads(data).items(): + for num, image in util.json_loads(data).items(): ext, width, height = image.split(",") path = urlfmt(num, extmap[ext]) append((server1 + path, { diff --git a/gallery_dl/extractor/hentaihand.py b/gallery_dl/extractor/hentaihand.py index bf9e464..0617330 100644 --- a/gallery_dl/extractor/hentaihand.py +++ b/gallery_dl/extractor/hentaihand.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020-2022 Mike Fährmann +# Copyright 2020-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,7 +10,6 @@ from .common import GalleryExtractor, Extractor, Message from .. import text, util -import json class HentaihandGalleryExtractor(GalleryExtractor): @@ -46,7 +45,7 @@ class HentaihandGalleryExtractor(GalleryExtractor): GalleryExtractor.__init__(self, match, url) def metadata(self, page): - info = json.loads(page) + info = util.json_loads(page) data = { "gallery_id" : text.parse_int(info["id"]), "title" : info["title"], diff --git a/gallery_dl/extractor/hentaihere.py b/gallery_dl/extractor/hentaihere.py index 38ec77c..2297cc0 100644 --- a/gallery_dl/extractor/hentaihere.py +++ b/gallery_dl/extractor/hentaihere.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2022 Mike Fährmann +# Copyright 2016-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,8 +9,7 @@ """Extractors for https://hentaihere.com/""" from .common import ChapterExtractor, MangaExtractor -from .. import text -import json +from .. 
import text, util import re @@ -80,7 +79,7 @@ class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor): images = text.extr(page, "var rff_imageList = ", ";") return [ ("https://hentaicdn.com/hentai" + part, None) - for part in json.loads(images) + for part in util.json_loads(images) ] diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index 44459ce..4e8d1ca 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2022 Mike Fährmann +# Copyright 2015-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -13,7 +13,6 @@ from .nozomi import decode_nozomi from ..cache import memcache from .. import text, util import string -import json import re @@ -75,7 +74,7 @@ class HitomiGalleryExtractor(GalleryExtractor): self.root, gid) def metadata(self, page): - self.info = info = json.loads(page.partition("=")[2]) + self.info = info = util.json_loads(page.partition("=")[2]) iget = info.get language = iget("language") diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index 1efbbf0..497f1ef 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -9,8 +9,7 @@ """Extractors for https://www.imagefap.com/""" from .common import Extractor, Message -from .. import text, exception -import json +from .. import text, util, exception BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?imagefap\.com" @@ -47,7 +46,7 @@ class ImagefapGalleryExtractor(ImagefapExtractor): pattern = BASE_PATTERN + r"/(?:gallery\.php\?gid=|gallery/|pictures/)(\d+)" test = ( - ("https://www.imagefap.com/pictures/7102714", { + ("https://www.imagefap.com/gallery/7102714", { "pattern": r"https://cdnh?\.imagefap\.com" r"/images/full/\d+/\d+/\d+\.jpg", "keyword": "2ba96e84c2952c4750e9fa94a3f2b1f965cec2f3", @@ -68,6 +67,7 @@ class ImagefapGalleryExtractor(ImagefapExtractor): }, "count": 44, }), + ("https://www.imagefap.com/pictures/7102714"), ("https://www.imagefap.com/gallery.php?gid=7102714"), ("https://beta.imagefap.com/gallery.php?gid=7102714"), ) @@ -78,7 +78,7 @@ class ImagefapGalleryExtractor(ImagefapExtractor): self.image_id = "" def items(self): - url = "{}/pictures/{}/".format(self.root, self.gid) + url = "{}/gallery/{}".format(self.root, self.gid) page = self.request(url).text data = self.get_job_metadata(page) yield Message.Directory, data @@ -88,22 +88,21 @@ class ImagefapGalleryExtractor(ImagefapExtractor): def get_job_metadata(self, page): """Collect metadata for extractor-job""" - descr, pos = text.extract( - page, '<meta name="description" content="Browse ', '"') - count, pos = text.extract(page, ' 1 of ', ' pics"', pos) - self.image_id = text.extract(page, 'id="img_ed_', '"', pos)[0] - - title, _, descr = descr.partition(" porn picture gallery by ") - uploader, _, tags = descr.partition(" to see hottest ") - self._count = text.parse_int(count) - return { + extr = text.extract_from(page) + + data = { "gallery_id": text.parse_int(self.gid), - "title": text.unescape(title), - "uploader": uploader, - "tags": tags[:-11].split(", "), - "count": self._count, + "tags": extr('name="keywords" content="', '"').split(", "), + "uploader": extr("porn picture gallery by ", " to see hottest"), + "title": text.unescape(extr("<title>", "<")), + "count": text.parse_int(extr(' 1 of ', ' pics"')), } + self.image_id = extr('id="img_ed_', '"') + self._count = 
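
The imagefap metadata rewrite above leans on text.extract_from, which keeps a cursor into the page so that each call scans forward from the previous match; the order of extr() calls therefore has to follow document order. A simplified stand-in for how such a helper behaves (not the real gallery_dl.text implementation):

    def extract_from(page):
        pos = [0]

        def extr(begin, end):
            try:
                i = page.index(begin, pos[0]) + len(begin)
                j = page.index(end, i)
            except ValueError:
                return ""
            pos[0] = j + len(end)   # advance the cursor past the match
            return page[i:j]

        return extr

    extr = extract_from('<title>A</title><p id="x">B</p>')
    assert extr("<title>", "<") == "A"
    assert extr('id="', '"') == "x"
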
data["count"] + + return data + def get_images(self): """Collect image-urls and -metadata""" url = "{}/photo/{}/".format(self.root, self.image_id) @@ -128,7 +127,7 @@ class ImagefapGalleryExtractor(ImagefapExtractor): data["image_id"] = text.parse_int(data["filename"]) yield image_url, data - if cnt < 24 and num >= total: + if not cnt or cnt < 24 and num >= total: return params["idx"] += cnt @@ -173,7 +172,7 @@ class ImagefapImageExtractor(ImagefapExtractor): page, 'id="imageid_input" value="', '"', pos) gallery_id, pos = text.extract( page, 'id="galleryid_input" value="', '"', pos) - info = json.loads(info) + info = util.json_loads(info) url = info["contentUrl"] return url, text.nameext_from_url(url, { diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index 207562a..d57ec89 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -187,12 +187,19 @@ class ImagevenueImageExtractor(ImagehostImageExtractor): class ImagetwistImageExtractor(ImagehostImageExtractor): """Extractor for single images from imagetwist.com""" category = "imagetwist" - pattern = r"(?:https?://)?((?:www\.)?imagetwist\.com/([a-z0-9]{12}))" - test = ("https://imagetwist.com/f1i2s4vhvbrq/test.png", { - "url": "8d5e168c0bee30211f821c6f3b2116e419d42671", - "keyword": "d1060a4c2e3b73b83044e20681712c0ffdd6cfef", - "content": "0c8768055e4e20e7c7259608b67799171b691140", - }) + pattern = (r"(?:https?://)?((?:www\.|phun\.)?" + r"image(?:twist|haha)\.com/([a-z0-9]{12}))") + test = ( + ("https://imagetwist.com/f1i2s4vhvbrq/test.png", { + "url": "8d5e168c0bee30211f821c6f3b2116e419d42671", + "keyword": "d1060a4c2e3b73b83044e20681712c0ffdd6cfef", + "content": "0c8768055e4e20e7c7259608b67799171b691140", + }), + ("https://www.imagetwist.com/f1i2s4vhvbrq/test.png"), + ("https://phun.imagetwist.com/f1i2s4vhvbrq/test.png"), + ("https://imagehaha.com/f1i2s4vhvbrq/test.png"), + ("https://www.imagehaha.com/f1i2s4vhvbrq/test.png"), + ) @property @memcache(maxage=3*3600) diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py index 49082d8..a221075 100644 --- a/gallery_dl/extractor/imgbb.py +++ b/gallery_dl/extractor/imgbb.py @@ -9,9 +9,8 @@ """Extractors for https://imgbb.com/""" from .common import Extractor, Message -from .. import text, exception +from .. 
import text, util, exception from ..cache import cache -import json class ImgbbExtractor(Extractor): @@ -98,7 +97,7 @@ class ImgbbExtractor(Extractor): while True: for img in text.extract_iter(page, "data-object='", "'"): - yield json.loads(text.unquote(img)) + yield util.json_loads(text.unquote(img)) if data: if params["seek"] == data["seekEnd"]: return diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index deb31a0..4c1be0f 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -40,6 +40,7 @@ class InstagramExtractor(Extractor): self._logged_in = True self._find_tags = re.compile(r"#\w+").findall self._cursor = None + self._user = None def items(self): self.login() @@ -60,6 +61,8 @@ class InstagramExtractor(Extractor): post = self._parse_post_graphql(post) else: post = self._parse_post_rest(post) + if self._user: + post["user"] = self._user post.update(data) files = post.pop("_files") @@ -363,6 +366,22 @@ class InstagramExtractor(Extractor): self._cursor = cursor return cursor + def _assign_user(self, user): + self._user = user + + for key, old in ( + ("count_media" , "edge_owner_to_timeline_media"), + ("count_video" , "edge_felix_video_timeline"), + ("count_saved" , "edge_saved_media"), + ("count_mutual" , "edge_mutual_followed_by"), + ("count_follow" , "edge_follow"), + ("count_followed" , "edge_followed_by"), + ("count_collection", "edge_media_collections")): + try: + user[key] = user.pop(old)["count"] + except Exception: + user[key] = 0 + class InstagramUserExtractor(InstagramExtractor): """Extractor for an Instagram user profile""" @@ -796,6 +815,7 @@ class InstagramRestAPI(): name = user["username"] s = "" if name.endswith("s") else "s" raise exception.StopExtraction("%s'%s posts are private", name, s) + self.extractor._assign_user(user) return user["id"] def user_clips(self, user_id): diff --git a/gallery_dl/extractor/issuu.py b/gallery_dl/extractor/issuu.py index 8067f63..c0a1de1 100644 --- a/gallery_dl/extractor/issuu.py +++ b/gallery_dl/extractor/issuu.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2022 Mike Fährmann +# Copyright 2019-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,8 +9,7 @@ """Extractors for https://issuu.com/""" from .common import GalleryExtractor, Extractor, Message -from .. import text -import json +from .. import text, util class IssuuBase(): @@ -54,7 +53,7 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor): }) def metadata(self, page): - data = json.loads(text.extr( + data = util.json_loads(text.extr( page, '<script data-json="', '"').replace(""", '"')) doc = data["initialDocumentData"]["document"] diff --git a/gallery_dl/extractor/lightroom.py b/gallery_dl/extractor/lightroom.py index d202e20..783473d 100644 --- a/gallery_dl/extractor/lightroom.py +++ b/gallery_dl/extractor/lightroom.py @@ -7,8 +7,7 @@ """Extractors for https://lightroom.adobe.com/""" from .common import Extractor, Message -from .. import text -import json +from .. 
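
The new Instagram _assign_user helper above flattens GraphQL edge objects into plain count_* fields, defaulting to 0 whenever an edge is missing. The same logic in isolation:

    user = {"edge_followed_by": {"count": 42}}

    for key, old in (("count_followed", "edge_followed_by"),
                     ("count_follow", "edge_follow")):
        try:
            user[key] = user.pop(old)["count"]
        except Exception:
            user[key] = 0            # absent edge: default to zero

    assert user == {"count_followed": 42, "count_follow": 0}
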
import text, util class LightroomGalleryExtractor(Extractor): @@ -46,7 +45,7 @@ class LightroomGalleryExtractor(Extractor): # Get config url = "https://lightroom.adobe.com/shares/" + self.href response = self.request(url) - album = json.loads( + album = util.json_loads( text.extr(response.text, "albumAttributes: ", "\n") ) @@ -75,7 +74,7 @@ class LightroomGalleryExtractor(Extractor): url = base_url + next_url page = self.request(url).text # skip 1st line as it's a JS loop - data = json.loads(page[page.index("\n") + 1:]) + data = util.json_loads(page[page.index("\n") + 1:]) base_url = data["base"] for res in data["resources"]: diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index dae203e..409483b 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2022 Mike Fährmann +# Copyright 2018-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -11,7 +11,6 @@ from .common import Extractor, Message from .. import text, util, exception from ..cache import cache, memcache -from ..version import __version__ from collections import defaultdict BASE_PATTERN = r"(?:https?://)?(?:www\.)?mangadex\.(?:org|cc)" @@ -28,10 +27,10 @@ class MangadexExtractor(Extractor): archive_fmt = "{chapter_id}_{page}" root = "https://mangadex.org" _cache = {} - _headers = {"User-Agent": "gallery-dl/" + __version__} def __init__(self, match): Extractor.__init__(self, match) + self.session.headers["User-Agent"] = util.USERAGENT self.api = MangadexAPI(self) self.uuid = match.group(1) @@ -127,7 +126,6 @@ class MangadexChapterExtractor(MangadexExtractor): data["chapter"], data["chapter_minor"], data["_external_url"]) yield Message.Directory, data - data["_http_headers"] = self._headers server = self.api.athome_server(self.uuid) chapter = server["chapter"] @@ -192,7 +190,7 @@ class MangadexAPI(): def __init__(self, extr): self.extractor = extr - self.headers = extr._headers.copy() + self.headers = {} self.username, self.password = self.extractor._get_auth_info() if not self.username: diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py index 14a542b..5ba18a3 100644 --- a/gallery_dl/extractor/manganelo.py +++ b/gallery_dl/extractor/manganelo.py @@ -10,51 +10,33 @@ from .common import ChapterExtractor, MangaExtractor from .. 
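
As the "skip 1st line" comment in the Lightroom hunk notes, paginated responses carry a line of JavaScript before the JSON body (an anti-JSON-hijacking guard in the style of while(1); prefixes), so parsing starts after the first newline. With a hypothetical payload:

    import json

    payload = 'while (1) {}\n{"base": "https://example.net/", "resources": []}'
    data = json.loads(payload[payload.index("\n") + 1:])
    assert data["resources"] == []
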
import text import re -BASE_PATTERN = \ - r"(?:https?://)?((?:(?:chap|read)?manganato|(?:www\.)?manganelo)\.com)" +BASE_PATTERN = r"(?:https?://)?((?:chap|read|www\.|m\.)?mangan(?:at|el)o\.com)" -class ManganeloChapterExtractor(ChapterExtractor): - """Extractor for manga-chapters from manganelo.com""" +class ManganeloBase(): category = "manganelo" root = "https://chapmanganato.com" - pattern = BASE_PATTERN + r"(/(?:manga-\w+|chapter/\w+)/chapter[-_][^/?#]+)" - test = ( - ("https://chapmanganato.com/manga-gn983696/chapter-23", { - "pattern": r"https://v\d+\.mkklcdnv6tempv5\.com/img/tab_17/03/23" - r"/39/gn983696/vol_3_chapter_23_24_yen/\d+-[no]\.jpg", - "keyword": "2c5cd59342f149375df9bcb50aa416b4d04a43cf", - "count": 25, - }), - ("https://readmanganato.com/manga-gn983696/chapter-23"), - ("https://manganelo.com/chapter/gamers/chapter_15"), - ("https://manganelo.com/chapter/gq921227/chapter_23"), - ) def __init__(self, match): domain, path = match.groups() - ChapterExtractor.__init__(self, match, "https://" + domain + path) + super().__init__(match, "https://" + domain + path) self.session.headers['Referer'] = self.root - def metadata(self, page): - _ , pos = text.extract(page, '<a class="a-h" ', '/a>') - manga , pos = text.extract(page, '<a class="a-h" ', '/a>', pos) - info , pos = text.extract(page, '<a class="a-h" ', '/a>', pos) - author, pos = text.extract(page, '- Author(s) : ', '</p>', pos) - - manga, _ = text.extract(manga, '">', '<') - info , _ = text.extract(info , '">', '<') - match = re.match( - r"(?:[Vv]ol\. *(\d+) )?" - r"[Cc]hapter *([^:]*)" - r"(?:: *(.+))?", info) + self._match_chapter = re.compile( + r"(?:[Vv]ol\.?\s*(\d+)\s?)?" + r"[Cc]hapter\s*([^:]+)" + r"(?::\s*(.+))?").match + + def _parse_chapter(self, info, manga, author, date=None): + match = self._match_chapter(info) volume, chapter, title = match.groups() if match else ("", "", info) chapter, sep, minor = chapter.partition(".") return { - "manga" : text.unescape(manga), + "manga" : manga, + "author" : author, + "date" : date, "title" : text.unescape(title) if title else "", - "author" : text.unescape(author) if author else "", "volume" : text.parse_int(volume), "chapter" : text.parse_int(chapter), "chapter_minor": sep + minor, @@ -62,19 +44,53 @@ class ManganeloChapterExtractor(ChapterExtractor): "language" : "English", } + +class ManganeloChapterExtractor(ManganeloBase, ChapterExtractor): + """Extractor for manga chapters from manganelo.com""" + pattern = BASE_PATTERN + r"(/(?:manga-\w+|chapter/\w+)/chapter[-_][^/?#]+)" + test = ( + ("https://chapmanganato.com/manga-gn983696/chapter-23", { + "pattern": r"https://v\d+\.mkklcdnv6tempv5\.com/img/tab_17/03/23" + r"/39/gn983696/vol_3_chapter_23_24_yen/\d+-[no]\.jpg", + "keyword": "17faaea7f0fb8c2675a327bf3aa0bcd7a6311d68", + "count": 25, + }), + ("https://chapmanganelo.com/manga-ti107776/chapter-4", { + "pattern": r"https://v\d+\.mkklcdnv6tempv5\.com/img/tab_17/01/92" + r"/08/ti970565/chapter_4_caster/\d+-o\.jpg", + "keyword": "06e01fa9b3fc9b5b954c0d4a98f0153b40922ded", + "count": 45, + }), + ("https://readmanganato.com/manga-gn983696/chapter-23"), + ("https://manganelo.com/chapter/gamers/chapter_15"), + ("https://manganelo.com/chapter/gq921227/chapter_23"), + ) + + def metadata(self, page): + extr = text.extract_from(page) + extr('class="a-h"', ">") + manga = extr('title="', '"') + info = extr('title="', '"') + author = extr("- Author(s) : ", "</p>") + + return self._parse_chapter( + info, text.unescape(manga), text.unescape(author)) + def images(self, page): page = 
text.extr( page, 'class="container-chapter-reader', '\n<div') return [ (url, None) for url in text.extract_iter(page, '<img src="', '"') + ] or [ + (url, None) + for url in text.extract_iter( + page, '<img class="reader-content" src="', '"') ] -class ManganeloMangaExtractor(MangaExtractor): +class ManganeloMangaExtractor(ManganeloBase, MangaExtractor): """Extractor for manga from manganelo.com""" - category = "manganelo" - root = "https://chapmanganato.com" chapterclass = ManganeloChapterExtractor pattern = BASE_PATTERN + r"(/(?:manga[-/]|read_)\w+)/?$" test = ( @@ -82,40 +98,28 @@ class ManganeloMangaExtractor(MangaExtractor): "pattern": ManganeloChapterExtractor.pattern, "count": ">= 25", }), + ("https://m.manganelo.com/manga-ti107776", { + "pattern": ManganeloChapterExtractor.pattern, + "count": ">= 12", + }), ("https://readmanganato.com/manga-gn983696"), ("https://manganelo.com/manga/read_otome_no_teikoku"), ("https://manganelo.com/manga/ol921234/"), ) - def __init__(self, match): - domain, path = match.groups() - MangaExtractor.__init__(self, match, "https://" + domain + path) - self.session.headers['Referer'] = self.root - def chapters(self, page): results = [] - data = self.parse_page(page, {"lang": "en", "language": "English"}) + append = results.append + + extr = text.extract_from(page) + manga = text.unescape(extr("<h1>", "<")) + author = text.remove_html(extr("</i>Author(s) :</td>", "</tr>")) - needle = 'class="chapter-name text-nowrap" href="' - pos = page.index('<ul class="row-content-chapter">') + extr('class="row-content-chapter', '') while True: - url, pos = text.extract(page, needle, '"', pos) + url = extr('class="chapter-name text-nowrap" href="', '"') if not url: return results - data["title"], pos = text.extract(page, '>', '</a>', pos) - data["date"] , pos = text.extract( - page, 'class="chapter-time text-nowrap" title="', '">', pos) - chapter, sep, minor = url.rpartition("/chapter_")[2].partition(".") - data["chapter"] = text.parse_int(chapter) - data["chapter_minor"] = sep + minor - results.append((url, data.copy())) - - @staticmethod - def parse_page(page, data): - """Parse metadata on 'page' and add it to 'data'""" - text.extract_all(page, ( - ("manga" , '<h1>', '</h1>'), - ('author' , '</i>Author(s) :</td>', '</tr>'), - ), values=data) - data["author"] = text.remove_html(data["author"]) - return data + info = extr(">", "<") + date = extr('class="chapter-time text-nowrap" title="', '"') + append((url, self._parse_chapter(info, manga, author, date))) diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py index dcf1972..168fbe8 100644 --- a/gallery_dl/extractor/mangapark.py +++ b/gallery_dl/extractor/mangapark.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2022 Mike Fährmann +# Copyright 2015-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,8 +9,7 @@ """Extractors for https://mangapark.net/""" from .common import ChapterExtractor, MangaExtractor -from .. import text, exception -import json +from .. 
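
The _match_chapter regex introduced in ManganeloBase above accepts strings like "Vol.3 Chapter 23.5: Title", and _parse_chapter then splits a fractional chapter number into chapter and chapter_minor. Its behavior on a representative, made-up chapter string:

    import re

    match_chapter = re.compile(
        r"(?:[Vv]ol\.?\s*(\d+)\s?)?"
        r"[Cc]hapter\s*([^:]+)"
        r"(?::\s*(.+))?").match

    volume, chapter, title = match_chapter("Vol.3 Chapter 23.5: Leftovers").groups()
    assert (volume, chapter, title) == ("3", "23.5", "Leftovers")

    chapter, sep, minor = chapter.partition(".")
    assert (chapter, sep + minor) == ("23", ".5")
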
import text, util, exception import re @@ -104,7 +103,7 @@ class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor): return data def images(self, page): - data = json.loads(text.extr(page, "var _load_pages =", ";")) + data = util.json_loads(text.extr(page, "var _load_pages =", ";")) return [ (text.urljoin(self.root, item["u"]), { "width": text.parse_int(item["w"]), diff --git a/gallery_dl/extractor/mangasee.py b/gallery_dl/extractor/mangasee.py index 5fa5631..b7070f2 100644 --- a/gallery_dl/extractor/mangasee.py +++ b/gallery_dl/extractor/mangasee.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021-2022 Mike Fährmann +# Copyright 2021-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,7 +10,6 @@ from .common import ChapterExtractor, MangaExtractor from .. import text, util -import json class MangaseeBase(): @@ -43,6 +42,7 @@ class MangaseeChapterExtractor(MangaseeBase, ChapterExtractor): "pattern": r"https://[^/]+/manga/Tokyo-Innocent/0004\.5-00\d\.png", "count": 8, "keyword": { + "author": ["NARUMI Naru"], "chapter": 4, "chapter_minor": ".5", "chapter_string": "100045", @@ -50,6 +50,8 @@ class MangaseeChapterExtractor(MangaseeBase, ChapterExtractor): "date": "dt:2020-01-20 21:52:53", "extension": "png", "filename": r"re:0004\.5-00\d", + "genre": ["Comedy", "Fantasy", "Harem", "Romance", "Shounen", + "Supernatural"], "index": "1", "lang": "en", "language": "English", @@ -63,6 +65,7 @@ class MangaseeChapterExtractor(MangaseeBase, ChapterExtractor): "pattern": r"https://[^/]+/manga/One-Piece/1063-0\d\d\.png", "count": 13, "keyword": { + "author": ["ODA Eiichiro"], "chapter": 1063, "chapter_minor": "", "chapter_string": "110630", @@ -70,6 +73,8 @@ class MangaseeChapterExtractor(MangaseeBase, ChapterExtractor): "date": "dt:2022-10-16 17:32:54", "extension": "png", "filename": r"re:1063-0\d\d", + "genre": ["Action", "Adventure", "Comedy", "Drama", "Fantasy", + "Shounen"], "index": "1", "lang": "en", "language": "English", @@ -94,12 +99,16 @@ class MangaseeChapterExtractor(MangaseeBase, ChapterExtractor): def metadata(self, page): extr = text.extract_from(page) - self.chapter = data = json.loads(extr("vm.CurChapter =", ";\r\n")) + author = util.json_loads(extr('"author":', '],') + "]") + genre = util.json_loads(extr('"genre":', '],') + "]") + self.chapter = data = util.json_loads(extr("vm.CurChapter =", ";\r\n")) self.domain = extr('vm.CurPathName = "', '"') self.slug = extr('vm.IndexName = "', '"') data = self._transform_chapter(data) data["manga"] = text.unescape(extr('vm.SeriesName = "', '"')) + data["author"] = author + data["genre"] = genre return data def images(self, page): @@ -128,10 +137,38 @@ class MangaseeMangaExtractor(MangaseeBase, MangaExtractor): "/Nakamura-Koedo-To-Daizu-Keisuke-Wa-Umaku-Ikanai"), { "pattern": MangaseeChapterExtractor.pattern, "count": ">= 17", + "keyword": { + "author": ["TAKASE Masaya"], + "chapter": int, + "chapter_minor": r"re:^|\.5$", + "chapter_string": r"re:100\d\d\d", + "date": "type:datetime", + "genre": ["Comedy", "Romance", "School Life", "Shounen", + "Slice of Life"], + "index": "1", + "lang": "en", + "language": "English", + "manga": "Nakamura-Koedo-To-Daizu-Keisuke-Wa-Umaku-Ikanai", + "title": "", + }, }), ("https://manga4life.com/manga/Ano-Musume-Ni-Kiss-To-Shirayuri-O", { "pattern": MangaseeChapterExtractor.pattern, "count": ">= 50", + "keyword": { + "author": ["Canno"], + "chapter": int, + "chapter_minor": 
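
The mangasee metadata hunk above cuts JSON arrays out of inline JavaScript using '],' as the end delimiter; since the delimiter (including the closing bracket) is consumed, a "]" is appended before parsing. The same trick with str.partition standing in for the extractor (the page snippet is fabricated):

    import json

    page = 'vm.Series = {"author":["ODA Eiichiro"],"genre":["Action","Adventure"],'

    chunk = page.partition('"author":')[2].partition('],')[0]
    author = json.loads(chunk + "]")
    assert author == ["ODA Eiichiro"]
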
r"re:^|\.5$", + "chapter_string": r"re:100\d\d\d", + "date": "type:datetime", + "genre": ["Comedy", "Romance", "School Life", "Seinen", + "Shoujo Ai"], + "index": "1", + "lang": "en", + "language": "English", + "manga": "Ano-Musume-Ni-Kiss-To-Shirayuri-O", + "title": "" + }, }), ) @@ -142,9 +179,11 @@ class MangaseeMangaExtractor(MangaseeBase, MangaExtractor): MangaExtractor.__init__(self, match, self.root + match.group(2)) def chapters(self, page): - slug, pos = text.extract(page, 'vm.IndexName = "', '"') - chapters = json.loads(text.extract( - page, "vm.Chapters = ", ";\r\n", pos)[0]) + extr = text.extract_from(page) + author = util.json_loads(extr('"author":', '],') + "]") + genre = util.json_loads(extr('"genre":', '],') + "]") + slug = extr('vm.IndexName = "', '"') + chapters = util.json_loads(extr("vm.Chapters = ", ";\r\n")) result = [] for data in map(self._transform_chapter, chapters): @@ -155,5 +194,7 @@ class MangaseeMangaExtractor(MangaseeBase, MangaExtractor): url += "-page-1.html" data["manga"] = slug + data["author"] = author + data["genre"] = genre result.append((url, data)) return result diff --git a/gallery_dl/extractor/misskey.py b/gallery_dl/extractor/misskey.py new file mode 100644 index 0000000..03e9104 --- /dev/null +++ b/gallery_dl/extractor/misskey.py @@ -0,0 +1,202 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for Misskey instances""" + +from .common import BaseExtractor, Message +from .. import text + + +class MisskeyExtractor(BaseExtractor): + """Base class for Misskey extractors""" + basecategory = "misskey" + directory_fmt = ("misskey", "{instance}", "{user[username]}") + filename_fmt = "{category}_{id}_{file[id]}.{extension}" + archive_fmt = "{id}_{file[id]}" + + def __init__(self, match): + BaseExtractor.__init__(self, match) + self.api = MisskeyAPI(self) + self.instance = self.root.rpartition("://")[2] + self.item = match.group(match.lastindex) + self.renotes = self.config("renotes", False) + self.replies = self.config("replies", True) + + def items(self): + for note in self.notes(): + files = note.pop("files") or [] + renote = note.get("renote") + if renote: + if not self.renotes: + self.log.debug("Skipping %s (renote)", note["id"]) + continue + files.extend(renote.get("files") or ()) + + reply = note.get("reply") + if reply: + if not self.replies: + self.log.debug("Skipping %s (reply)", note["id"]) + continue + files.extend(reply.get("files") or ()) + + note["instance"] = self.instance + note["instance_remote"] = note["user"]["host"] + note["count"] = len(files) + note["date"] = text.parse_datetime( + note["createdAt"], "%Y-%m-%dT%H:%M:%S.%f%z") + + yield Message.Directory, note + for note["num"], file in enumerate(files, 1): + file["date"] = text.parse_datetime( + file["createdAt"], "%Y-%m-%dT%H:%M:%S.%f%z") + note["file"] = file + url = file["url"] + yield Message.Url, url, text.nameext_from_url(url, note) + + def notes(self): + """Return an iterable containing all relevant Note objects""" + return () + + +BASE_PATTERN = MisskeyExtractor.update({ + "misskey.io": { + "root": "https://misskey.io", + "pattern": r"misskey\.io", + }, + "lesbian.energy": { + "root": "https://lesbian.energy", + "pattern": r"lesbian\.energy" + }, + "sushi.ski": { + "root": "https://sushi.ski", + "pattern": r"sushi\.ski", + }, +}) + + +class MisskeyUserExtractor(MisskeyExtractor): + """Extractor 
for all images of a Misskey user""" + subcategory = "user" + pattern = BASE_PATTERN + r"/@([^/?#]+)/?$" + test = ( + ("https://misskey.io/@lithla", { + "pattern": r"https://s\d+\.arkjp\.net/misskey/[\w-]+\.\w+", + "range": "1-50", + "count": 50, + }), + ("https://misskey.io/@blooddj@pawoo.net", { + "range": "1-50", + "count": 50, + }), + ("https://lesbian.energy/@rerorero", { + "pattern": r"https://lesbian.energy/files/\w+", + "range": "1-50", + "count": 50, + }), + ("https://lesbian.energy/@nano@mk.yopo.work"), + ("https://sushi.ski/@ui@misskey.04.si"), + ) + + def notes(self): + return self.api.users_notes(self.api.user_id_by_username(self.item)) + + +class MisskeyFollowingExtractor(MisskeyExtractor): + """Extractor for followed Misskey users""" + subcategory = "following" + pattern = BASE_PATTERN + r"/@([^/?#]+)/following" + test = ( + ("https://misskey.io/@blooddj@pawoo.net/following", { + "extractor": False, + "count": ">= 6", + }), + ("https://sushi.ski/@hatusimo_sigure/following"), + ) + + def items(self): + user_id = self.api.user_id_by_username(self.item) + for user in self.api.users_following(user_id): + user = user["followee"] + url = self.root + "/@" + user["username"] + host = user["host"] + if host is not None: + url += "@" + host + user["_extractor"] = MisskeyUserExtractor + yield Message.Queue, url, user + + +class MisskeyNoteExtractor(MisskeyExtractor): + """Extractor for images from a Note""" + subcategory = "note" + pattern = BASE_PATTERN + r"/notes/(\w+)" + test = ( + ("https://misskey.io/notes/9bhqfo835v", { + "pattern": r"https://s\d+\.arkjp\.net/misskey/[\w-]+\.\w+", + "count": 4, + }), + ("https://misskey.io/notes/9brq7z1re6"), + ("https://sushi.ski/notes/9bm3x4ksqw", { + "pattern": r"https://media\.sushi\.ski/files/[\w-]+\.png", + "count": 1, + }), + ("https://lesbian.energy/notes/995ig09wqy", { + "count": 1, + }), + ("https://lesbian.energy/notes/96ynd9w5kc"), + ) + + def notes(self): + return (self.api.notes_show(self.item),) + + +class MisskeyAPI(): + """Interface for Misskey API + + https://github.com/misskey-dev/misskey + https://misskey-hub.net/en/docs/api/ + https://misskey-hub.net/docs/api/endpoints.html + """ + + def __init__(self, extractor): + self.root = extractor.root + self.extractor = extractor + self.headers = {"Content-Type": "application/json"} + + def user_id_by_username(self, username): + endpoint = "/users/show" + data = {"username": username} + if "@" in username: + data["username"], _, data["host"] = username.partition("@") + return self._call(endpoint, data)["id"] + + def users_following(self, user_id): + endpoint = "/users/following" + data = {"userId": user_id} + return self._pagination(endpoint, data) + + def users_notes(self, user_id): + endpoint = "/users/notes" + data = {"userId": user_id} + return self._pagination(endpoint, data) + + def notes_show(self, note_id): + endpoint = "/notes/show" + data = {"noteId": note_id} + return self._call(endpoint, data) + + def _call(self, endpoint, data): + url = self.root + "/api" + endpoint + return self.extractor.request( + url, method="POST", headers=self.headers, json=data).json() + + def _pagination(self, endpoint, data): + data["limit"] = 100 + while True: + notes = self._call(endpoint, data) + if not notes: + return + yield from notes + data["untilId"] = notes[-1]["id"] diff --git a/gallery_dl/extractor/nana.py b/gallery_dl/extractor/nana.py index 1db83b0..0f79d7f 100644 --- a/gallery_dl/extractor/nana.py +++ b/gallery_dl/extractor/nana.py @@ -7,8 +7,7 @@ """Extractors for 
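
MisskeyAPI._pagination above implements keyset pagination: instead of an offset, each request passes the id of the last note seen as untilId, and an empty page terminates the loop. The same logic against a stub transport (the paginate/call names are illustrative, not part of gallery-dl):

    def paginate(call, endpoint, data):
        data["limit"] = 100
        while True:
            notes = call(endpoint, data)
            if not notes:
                return
            yield from notes
            data["untilId"] = notes[-1]["id"]   # continue below the last id

    pages = {None: [{"id": "b2"}, {"id": "b1"}], "b1": []}
    call = lambda endpoint, data: pages[data.get("untilId")]
    assert [n["id"] for n in paginate(call, "/users/notes", {})] == ["b2", "b1"]
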
https://nana.my.id/""" from .common import GalleryExtractor, Extractor, Message -from .. import text, exception -import json +from .. import text, util, exception class NanaGalleryExtractor(GalleryExtractor): @@ -59,7 +58,7 @@ class NanaGalleryExtractor(GalleryExtractor): } def images(self, page): - data = json.loads(text.extr(page, "Reader.pages = ", ".pages")) + data = util.json_loads(text.extr(page, "Reader.pages = ", ".pages")) return [ ("https://nana.my.id" + image, None) for image in data["pages"] diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 1f96879..2b759ec 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2022 Mike Fährmann +# Copyright 2018-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,10 +9,9 @@ """Extractors for https://www.newgrounds.com/""" from .common import Extractor, Message -from .. import text, exception +from .. import text, util, exception from ..cache import cache import itertools -import json class NewgroundsExtractor(Extractor): @@ -20,7 +19,7 @@ class NewgroundsExtractor(Extractor): category = "newgrounds" directory_fmt = ("{category}", "{artist[:10]:J, }") filename_fmt = "{category}_{_index}_{title}.{extension}" - archive_fmt = "{_index}" + archive_fmt = "{_type}{_index}" root = "https://www.newgrounds.com" cookiedomain = ".newgrounds.com" cookienames = ("NG_GG_username", "vmk1du5I8m") @@ -151,11 +150,13 @@ class NewgroundsExtractor(Extractor): @staticmethod def _extract_image_data(extr, url): - full = text.extract_from(json.loads(extr('"full_image_text":', '});'))) + full = text.extract_from(util.json_loads(extr( + '"full_image_text":', '});'))) data = { "title" : text.unescape(extr('"og:title" content="', '"')), "description": text.unescape(extr(':description" content="', '"')), "type" : extr('og:type" content="', '"'), + "_type" : "i", "date" : text.parse_datetime(extr( 'itemprop="datePublished" content="', '"')), "rating" : extr('class="rated-', '"'), @@ -175,6 +176,7 @@ class NewgroundsExtractor(Extractor): "title" : text.unescape(extr('"og:title" content="', '"')), "description": text.unescape(extr(':description" content="', '"')), "type" : extr('og:type" content="', '"'), + "_type" : "a", "date" : text.parse_datetime(extr( 'itemprop="datePublished" content="', '"')), "url" : extr('{"url":"', '"').replace("\\/", "/"), @@ -227,6 +229,7 @@ class NewgroundsExtractor(Extractor): "url" : src, "date" : date, "type" : type, + "_type" : "", "description": text.unescape(descr or extr( 'itemprop="description" content="', '"')), "rating" : extr('class="rated-', '"'), diff --git a/gallery_dl/extractor/nhentai.py b/gallery_dl/extractor/nhentai.py index 9df43e5..4270c84 100644 --- a/gallery_dl/extractor/nhentai.py +++ b/gallery_dl/extractor/nhentai.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2021 Mike Fährmann +# Copyright 2015-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -11,7 +11,6 @@ from .common import GalleryExtractor, Extractor, Message from .. 
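
The newgrounds archive_fmt change from "{_index}" to "{_type}{_index}" matters because, judging by the added "_type" values ("i", "a", ""), image and audio posts can apparently carry equal indices; the marker keeps them from colliding in the download archive:

    archive_fmt = "{_type}{_index}"

    image = {"_type": "i", "_index": 12345}
    audio = {"_type": "a", "_index": 12345}   # same index, different type
    assert archive_fmt.format_map(image) != archive_fmt.format_map(audio)
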
import text, util import collections -import json class NhentaiGalleryExtractor(GalleryExtractor): @@ -48,7 +47,7 @@ class NhentaiGalleryExtractor(GalleryExtractor): GalleryExtractor.__init__(self, match, url) def metadata(self, page): - self.data = data = json.loads(page) + self.data = data = util.json_loads(page) title_en = data["title"].get("english", "") title_ja = data["title"].get("japanese", "") diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py index f9c6abf..9b69694 100644 --- a/gallery_dl/extractor/nitter.py +++ b/gallery_dl/extractor/nitter.py @@ -59,10 +59,7 @@ class NitterExtractor(BaseExtractor): if url[0] == "/": url = self.root + url - file = { - "url": url, - "_http_retry_codes": (404,), - } + file = {"url": url, "_http_retry": _retry_on_404} file["filename"], _, file["extension"] = \ name.rpartition(".") append(file) @@ -220,10 +217,6 @@ BASE_PATTERN = NitterExtractor.update({ "root": "https://nitter.lacontrevoie.fr", "pattern": r"nitter\.lacontrevoie\.fr", }, - "nitter.pussthecat.org": { - "root": "https://nitter.pussthecat.org", - "pattern": r"nitter\.pussthecat\.org", - }, "nitter.1d4.us": { "root": "https://nitter.1d4.us", "pattern": r"nitter\.1d4\.us", @@ -283,13 +276,12 @@ class NitterTweetsExtractor(NitterExtractor): }, }, }), - ("https://nitter.pussthecat.org/i/user/2976459548", { - "url": "c740a2683db2c8ed2f350afc0494475c4444025b", - "pattern": r"https://nitter.pussthecat\.org/pic/orig" + ("https://nitter.lacontrevoie.fr/supernaturepics", { + "url": "54f4b55f2099dcc248f3fb7bfacf1349e08d8e2d", + "pattern": r"https://nitter\.lacontrevoie\.fr/pic/orig" r"/media%2FCGMNYZvW0AIVoom\.jpg", "range": "1", }), - ("https://nitter.lacontrevoie.fr/supernaturepics"), ("https://nitter.1d4.us/supernaturepics"), ("https://nitter.kavin.rocks/id:2976459548"), ("https://nitter.unixfox.eu/supernaturepics"), @@ -309,7 +301,6 @@ class NitterRepliesExtractor(NitterExtractor): "range": "1-20", }), ("https://nitter.lacontrevoie.fr/supernaturepics/with_replies"), - ("https://nitter.pussthecat.org/supernaturepics/with_replies"), ("https://nitter.1d4.us/supernaturepics/with_replies"), ("https://nitter.kavin.rocks/id:2976459548/with_replies"), ("https://nitter.unixfox.eu/i/user/2976459548/with_replies"), @@ -334,7 +325,6 @@ class NitterMediaExtractor(NitterExtractor): "range": "1-20", }), ("https://nitter.lacontrevoie.fr/supernaturepics/media"), - ("https://nitter.pussthecat.org/supernaturepics/media"), ("https://nitter.1d4.us/supernaturepics/media"), ("https://nitter.unixfox.eu/i/user/2976459548/media"), ) @@ -353,7 +343,6 @@ class NitterSearchExtractor(NitterExtractor): "range": "1-20", }), ("https://nitter.lacontrevoie.fr/supernaturepics/search"), - ("https://nitter.pussthecat.org/supernaturepics/search"), ("https://nitter.1d4.us/supernaturepics/search"), ("https://nitter.kavin.rocks/id:2976459548/search"), ("https://nitter.unixfox.eu/i/user/2976459548/search"), @@ -375,7 +364,7 @@ class NitterTweetExtractor(NitterExtractor): "url": "3f2b64e175bf284aa672c3bb53ed275e470b919a", "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab", "keyword": { - "comments": 16, + "comments": 19, "content": "Big Wedeene River, Canada", "count": 1, "date": "dt:2015-05-29 17:40:00", @@ -399,9 +388,9 @@ class NitterTweetExtractor(NitterExtractor): "url": "9c51b3a4a1114535eb9b168bba97ad95db0d59ff", }), # video - ("https://nitter.pussthecat.org/i/status/1065692031626829824", { - "pattern": r"ytdl:https://nitter.pussthecat.org/video" - r"/B875137EDC8FF/https%3A%2F%2Fvideo.twimg.com%2F" + 
("https://nitter.lacontrevoie.fr/i/status/1065692031626829824", { + "pattern": r"ytdl:https://nitter\.lacontrevoie\.fr/video" + r"/[0-9A-F]{10,}/https%3A%2F%2Fvideo.twimg.com%2F" r"ext_tw_video%2F1065691868439007232%2Fpu%2Fpl%2F" r"nv8hUQC1R0SjhzcZ.m3u8%3Ftag%3D5", "keyword": { @@ -446,7 +435,7 @@ class NitterTweetExtractor(NitterExtractor): "count": 0, }), # "Misleading" content - ("https://nitter.pussthecat.org/i/status/1486373748911575046", { + ("https://nitter.lacontrevoie.fr/i/status/1486373748911575046", { "count": 4, }), # age-restricted (#2354) @@ -468,3 +457,7 @@ class NitterTweetExtractor(NitterExtractor): quoted["user"] = tweet["user"] return (tweet, quoted) return (tweet,) + + +def _retry_on_404(response): + return response.status_code == 404 diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index 9270f33..ec46ca3 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -9,13 +9,12 @@ """Utility classes to setup OAuth and link accounts to gallery-dl""" from .common import Extractor, Message -from . import deviantart, flickr, mastodon, pixiv, reddit, smugmug, tumblr from .. import text, oauth, util, config, exception from ..output import stdout_write from ..cache import cache import urllib.parse +import binascii import hashlib -import base64 REDIRECT_URI_LOCALHOST = "http://localhost:6414/" REDIRECT_URI_HTTPS = "https://mikf.github.io/gallery-dl/oauth-redirect.html" @@ -76,7 +75,8 @@ class OAuthBase(Extractor): browser = webbrowser.get() if browser and browser.open(url): - self.log.info("Opening URL in %s:", browser.name.capitalize()) + name = getattr(browser, "name", "Browser") + self.log.info("Opening URL in %s:", name.capitalize()) else: self.log.info("Please open this URL in your browser:") @@ -242,6 +242,7 @@ class OAuthFlickr(OAuthBase): def items(self): yield Message.Version, 1 + from . import flickr self._oauth1_authorization_flow( flickr.FlickrAPI.API_KEY, @@ -258,6 +259,7 @@ class OAuthSmugmug(OAuthBase): def items(self): yield Message.Version, 1 + from . import smugmug self._oauth1_authorization_flow( smugmug.SmugmugAPI.API_KEY, @@ -274,6 +276,7 @@ class OAuthTumblr(OAuthBase): def items(self): yield Message.Version, 1 + from . import tumblr self._oauth1_authorization_flow( tumblr.TumblrAPI.API_KEY, @@ -294,6 +297,7 @@ class OAuthDeviantart(OAuthBase): def items(self): yield Message.Version, 1 + from . import deviantart self._oauth2_authorization_code_grant( self.oauth_config("client-id"), @@ -313,6 +317,7 @@ class OAuthReddit(OAuthBase): def items(self): yield Message.Version, 1 + from . import reddit self.session.headers["User-Agent"] = reddit.RedditAPI.USER_AGENT self._oauth2_authorization_code_grant( @@ -337,6 +342,7 @@ class OAuthMastodon(OAuthBase): def items(self): yield Message.Version, 1 + from . import mastodon for application in mastodon.INSTANCES.values(): if self.instance == application["root"].partition("://")[2]: @@ -389,11 +395,12 @@ class OAuthPixiv(OAuthBase): def items(self): yield Message.Version, 1 + from . 
import pixiv code_verifier = util.generate_token(32) - digest = hashlib.sha256(code_verifier.encode("ascii")).digest() - code_challenge = base64.urlsafe_b64encode( - digest).rstrip(b"=").decode("ascii") + digest = hashlib.sha256(code_verifier.encode()).digest() + code_challenge = binascii.b2a_base64( + digest)[:-2].decode().replace("+", "-").replace("/", "_") url = "https://app-api.pixiv.net/web/v1/login" params = { diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 1f520c3..e4bfa2a 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2022 Mike Fährmann +# Copyright 2019-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,11 +9,10 @@ """Extractors for https://www.patreon.com/""" from .common import Extractor, Message -from .. import text, exception +from .. import text, util, exception from ..cache import memcache import collections import itertools -import json class PatreonExtractor(Extractor): @@ -251,7 +250,7 @@ class PatreonExtractor(Extractor): return [genmap[ft] for ft in filetypes] def _extract_bootstrap(self, page): - return json.loads(text.extr( + return util.json_loads(text.extr( page, "window.patreon.bootstrap,", "\n});") + "}") diff --git a/gallery_dl/extractor/photobucket.py b/gallery_dl/extractor/photobucket.py index 375b5e3..6234e6a 100644 --- a/gallery_dl/extractor/photobucket.py +++ b/gallery_dl/extractor/photobucket.py @@ -10,7 +10,7 @@ from .common import Extractor, Message from .. import text, exception -import base64 +import binascii import json @@ -168,7 +168,7 @@ class PhotobucketImageExtractor(Extractor): image["titleOrFilename"] = image["title"] or name image["tags"] = image.pop("clarifaiTagList", []) - mtype, _, mid = base64.b64decode(image["id"]).partition(b":") + mtype, _, mid = binascii.a2b_base64(image["id"]).partition(b":") image["pictureId"] = mid.decode() if mtype == b"mediaId" else "" yield Message.Directory, image diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index 63b16ce..31ddbcc 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -12,7 +12,6 @@ from .common import Extractor, Message from .. 
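
The OAuthPixiv hunk above drops the base64 import in favor of binascii, but both variants produce the same RFC 7636 "S256" code challenge: base64 of a 32-byte SHA-256 digest ends in a single "=" pad, b2a_base64 adds a trailing newline, so [:-2] strips exactly those two bytes, and the two replace() calls re-create the URL-safe alphabet. Checking the equivalence directly:

    import base64, binascii, hashlib

    digest = hashlib.sha256(b"some-code-verifier").digest()

    old = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
    new = binascii.b2a_base64(digest)[:-2].decode().replace("+", "-").replace("/", "_")
    assert old == new   # identical S256 code challenge
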
import text, util, exception from ..cache import cache import itertools -import json BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.[\w.]+" @@ -309,7 +308,7 @@ class PinterestSearchExtractor(PinterestExtractor): def __init__(self, match): PinterestExtractor.__init__(self, match) - self.search = match.group(1) + self.search = text.unquote(match.group(1)) def metadata(self): return {"search": self.search} @@ -504,7 +503,10 @@ class PinterestAPI(): "username_or_email": username, "password" : password, } - data = {"data": json.dumps({"options": options}), "source_url": ""} + data = { + "data" : util.json_dumps({"options": options}), + "source_url": "", + } try: response = self.extractor.request( @@ -523,7 +525,10 @@ class PinterestAPI(): def _call(self, resource, options): url = "{}/resource/{}Resource/get/".format(self.root, resource) - params = {"data": json.dumps({"options": options}), "source_url": ""} + params = { + "data" : util.json_dumps({"options": options}), + "source_url": "", + } response = self.extractor.request( url, params=params, headers=self.headers, diff --git a/gallery_dl/extractor/plurk.py b/gallery_dl/extractor/plurk.py index 535fae9..4135259 100644 --- a/gallery_dl/extractor/plurk.py +++ b/gallery_dl/extractor/plurk.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2022 Mike Fährmann +# Copyright 2019-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,10 +9,8 @@ """Extractors for https://www.plurk.com/""" from .common import Extractor, Message -from .. import text, exception +from .. import text, util, exception import datetime -import time -import json import re @@ -20,6 +18,7 @@ class PlurkExtractor(Extractor): """Base class for plurk extractors""" category = "plurk" root = "https://www.plurk.com" + request_interval = 1.0 def items(self): urls = self._urls_ex if self.config("comments", False) else self._urls @@ -59,14 +58,13 @@ class PlurkExtractor(Extractor): return elif info["has_newer"] < 200: del data["count"] - time.sleep(1) data["from_response_id"] = info["responses"][-1]["id"] + 1 @staticmethod def _load(data): if not data: raise exception.NotFoundError("user") - return json.loads(re.sub(r"new Date\(([^)]+)\)", r"\1", data)) + return util.json_loads(re.sub(r"new Date\(([^)]+)\)", r"\1", data)) class PlurkTimelineExtractor(PlurkExtractor): diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py index c35ee74..49da9ce 100644 --- a/gallery_dl/extractor/poipiku.py +++ b/gallery_dl/extractor/poipiku.py @@ -79,7 +79,7 @@ class PoipikuExtractor(Extractor): page = self.request( url, method="POST", headers=headers, data=data).json()["html"] - if page.startswith("You need to"): + if page.startswith(("You need to", "Password is incorrect")): self.log.warning("'%s'", page) for thumb in text.extract_iter( diff --git a/gallery_dl/extractor/pornpics.py b/gallery_dl/extractor/pornpics.py new file mode 100644 index 0000000..783f3da --- /dev/null +++ b/gallery_dl/extractor/pornpics.py @@ -0,0 +1,173 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.pornpics.com/""" + +from .common import GalleryExtractor, Extractor, Message +from .. 
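
The photobucket change above is a pure stdlib swap: binascii.a2b_base64 decodes the same "mediaId:<id>" payloads as base64.b64decode did. Round-tripping a made-up image id:

    import binascii

    encoded = binascii.b2a_base64(b"mediaId:12345")
    mtype, _, mid = binascii.a2b_base64(encoded).partition(b":")
    assert mtype == b"mediaId" and mid == b"12345"
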
import text + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?pornpics\.com(?:/\w\w)?" + + +class PornpicsExtractor(Extractor): + """Base class for pornpics extractors""" + category = "pornpics" + root = "https://www.pornpics.com" + request_interval = (0.5, 1.5) + + def __init__(self, match): + super().__init__(match) + self.item = match.group(1) + self.session.headers["Referer"] = self.root + + def items(self): + for gallery in self.galleries(): + gallery["_extractor"] = PornpicsGalleryExtractor + yield Message.Queue, gallery["g_url"], gallery + + def _pagination(self, url, params=None): + if params is None: + # fetch first 20 galleries from HTML + # since '"offset": 0' does not return a JSON response + page = self.request(url).text + for path in text.extract_iter( + page, 'class="rel-link" href="', '"'): + yield {"g_url": self.root + path} + del page + params = {"offset": 20} + + limit = params["limit"] = 20 + + headers = { + "Accept": "application/json, text/javascript, */*; q=0.01", + "Referer": url if params["offset"] else self.root + "/", + "X-Requested-With": "XMLHttpRequest", + } + + while True: + galleries = self.request( + url, params=params, headers=headers).json() + yield from galleries + + if len(galleries) < limit: + return + params["offset"] += limit + + +class PornpicsGalleryExtractor(PornpicsExtractor, GalleryExtractor): + """Extractor for pornpics galleries""" + pattern = BASE_PATTERN + r"(/galleries/(?:[^/?#]+-)?(\d+))" + test = ( + (("https://www.pornpics.com/galleries/british-beauty-danielle-flashes-" + "hot-breasts-ass-and-snatch-in-the-forest-62610699/"), { + "pattern": r"https://cdni\.pornpics\.com/1280/7/160/62610699" + r"/62610699_\d+_[0-9a-f]{4}\.jpg", + "keyword": { + "categories": ["MILF", "Amateur", "Sexy", "Outdoor"], + "channel": "FTV MILFs", + "count": 17, + "gallery_id": 62610699, + "models": ["Danielle"], + "num": int, + "slug": "british-beauty-danielle-flashes-" + "hot-breasts-ass-and-snatch-in-the-forest", + "tags": ["Amateur MILF", "Sexy MILF"], + "title": "British beauty Danielle flashes " + "hot breasts, ass and snatch in the forest", + "views": int, + }, + }), + ("https://pornpics.com/es/galleries/62610699", { + "keyword": { + "slug": "british-beauty-danielle-flashes-" + "hot-breasts-ass-and-snatch-in-the-forest", + }, + }), + ) + + def __init__(self, match): + PornpicsExtractor.__init__(self, match) + self.gallery_id = match.group(2) + + items = GalleryExtractor.items + + def metadata(self, page): + extr = text.extract_from(page) + + return { + "gallery_id": text.parse_int(self.gallery_id), + "slug" : extr("/galleries/", "/").rpartition("-")[0], + "title" : text.unescape(extr("<h1>", "<")), + "channel" : extr('>Channel:', '</a>').rpartition(">")[2], + "models" : text.split_html(extr( + ">Models:", '<span class="suggest')), + "categories": text.split_html(extr( + ">Categories:", '<span class="suggest')), + "tags" : text.split_html(extr( + ">Tags List:", ' </div>')), + "views" : text.parse_int(extr(">Views:", "<").replace(",", "")), + } + + def images(self, page): + return [ + (url, None) + for url in text.extract_iter(page, "class='rel-link' href='", "'") + ] + + +class PornpicsTagExtractor(PornpicsExtractor): + """Extractor for galleries from pornpics tag searches""" + subcategory = "tag" + pattern = BASE_PATTERN + r"/tags/([^/?#]+)" + test = ( + ("https://www.pornpics.com/tags/summer-dress/", { + "pattern": PornpicsGalleryExtractor.pattern, + "range": "1-50", + "count": 50, + }), + ("https://pornpics.com/fr/tags/summer-dress"), + ) + + def 
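
Plurk's _load above deals with responses that are almost JSON except for embedded JavaScript Date constructors; the regex rewrites new Date("...") down to its string argument before parsing. On a fabricated sample:

    import json
    import re

    data = '{"id": 1, "posted": new Date("2023-03-12T18:00:00")}'
    clean = re.sub(r"new Date\(([^)]+)\)", r"\1", data)
    assert json.loads(clean)["posted"] == "2023-03-12T18:00:00"
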
galleries(self): + url = "{}/tags/{}/".format(self.root, self.item) + return self._pagination(url) + + +class PornpicsSearchExtractor(PornpicsExtractor): + """Extractor for galleries from pornpics search results""" + subcategory = "search" + pattern = BASE_PATTERN + r"/(?:\?q=|pornstars/|channels/)([^/&#]+)" + test = ( + ("https://www.pornpics.com/?q=nature", { + "pattern": PornpicsGalleryExtractor.pattern, + "range": "1-50", + "count": 50, + }), + ("https://www.pornpics.com/channels/femjoy/", { + "pattern": PornpicsGalleryExtractor.pattern, + "range": "1-50", + "count": 50, + }), + ("https://www.pornpics.com/pornstars/emma-brown/", { + "pattern": PornpicsGalleryExtractor.pattern, + "range": "1-50", + "count": 50, + }), + ("https://pornpics.com/jp/?q=nature"), + ("https://pornpics.com/it/channels/femjoy"), + ("https://pornpics.com/pt/pornstars/emma-brown"), + ) + + def galleries(self): + url = self.root + "/search/srch.php" + params = { + "q" : self.item.replace("-", " "), + "lang" : "en", + "offset": 0, + } + return self._pagination(url, params) diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py index 7e266cc..32567f6 100644 --- a/gallery_dl/extractor/pururin.py +++ b/gallery_dl/extractor/pururin.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2021 Mike Fährmann +# Copyright 2019-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -11,7 +11,6 @@ from .common import GalleryExtractor from .. import text, util import binascii -import json class PururinGalleryExtractor(GalleryExtractor): @@ -73,7 +72,7 @@ class PururinGalleryExtractor(GalleryExtractor): url = "{}/read/{}/01/x".format(self.root, self.gallery_id) page = self.request(url).text - info = json.loads(binascii.a2b_base64(text.extr( + info = util.json_loads(binascii.a2b_base64(text.extr( page, '<gallery-read encoded="', '"')).decode()) self._ext = info["image_extension"] self._cnt = info["total_pages"] diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py index 8b5b6b6..1800b68 100644 --- a/gallery_dl/extractor/reactor.py +++ b/gallery_dl/extractor/reactor.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2022 Mike Fährmann +# Copyright 2019-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,9 +9,8 @@ """Generic extractors for *reactor sites""" from .common import BaseExtractor, Message -from .. import text +from .. 
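
PornpicsExtractor._pagination above starts from HTML because, per its comment, an offset of 0 does not return a JSON response; only from offset 20 onward does it hit the JSON endpoint, stopping at the first short page. The control flow reduced to fake fetchers (paginate/fetch_* names are illustrative):

    def paginate(fetch_html, fetch_json, limit=20):
        yield from fetch_html()              # first 20 scraped from HTML
        offset = limit
        while True:
            batch = fetch_json(offset, limit)
            yield from batch
            if len(batch) < limit:           # short page: done
                return
            offset += limit

    data = list(range(45))
    html = lambda: data[:20]
    api = lambda off, lim: data[off:off + lim]
    assert list(paginate(html, api)) == data
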
import text, util import urllib.parse -import json class ReactorExtractor(BaseExtractor): @@ -84,13 +83,13 @@ class ReactorExtractor(BaseExtractor): script = script[:script.index("</")].strip() try: - data = json.loads(script) + data = util.json_loads(script) except ValueError: try: # remove control characters and escape backslashes mapping = dict.fromkeys(range(32)) script = script.translate(mapping).replace("\\", "\\\\") - data = json.loads(script) + data = util.json_loads(script) except ValueError as exc: self.log.warning("Unable to parse JSON data: %s", exc) return diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 204562e..305de2a 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2022 Mike Fährmann +# Copyright 2017-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -29,7 +29,14 @@ class RedditExtractor(Extractor): parentdir = self.config("parent-directory") max_depth = self.config("recursion", 0) + videos = self.config("videos", True) + if videos: + if videos == "ytdl": + self._extract_video = self._extract_video_ytdl + elif videos == "dash": + self._extract_video = self._extract_video_dash + videos = True submissions = self.submissions() visited = set() @@ -62,19 +69,8 @@ class RedditExtractor(Extractor): elif submission["is_video"]: if videos: text.nameext_from_url(url, submission) - if videos == "ytdl": - url = "https://www.reddit.com" + \ - submission["permalink"] - else: - submission["_ytdl_extra"] = { - "title": submission["title"], - } - try: - url = (submission["secure_media"] - ["reddit_video"]["dash_url"]) - except (KeyError, TypeError): - pass - yield Message.Url, "ytdl:" + url, submission + url = "ytdl:" + self._extract_video(submission) + yield Message.Url, url, submission elif not submission["is_self"]: urls.append((url, submission)) @@ -145,6 +141,21 @@ class RedditExtractor(Extractor): submission["id"], item["media_id"]) self.log.debug(src) + def _extract_video_ytdl(self, submission): + return "https://www.reddit.com" + submission["permalink"] + + def _extract_video_dash(self, submission): + submission["_ytdl_extra"] = {"title": submission["title"]} + try: + return (submission["secure_media"]["reddit_video"]["dash_url"] + + "#__youtubedl_smuggle=%7B%22to_generic%22%3A+1%7D") + except Exception: + return submission["url"] + + def _extract_video(self, submission): + submission["_ytdl_extra"] = {"title": submission["title"]} + return submission["url"] + class RedditSubredditExtractor(RedditExtractor): """Extractor for URLs from subreddits on reddit.com""" @@ -233,6 +244,25 @@ class RedditSubmissionExtractor(RedditExtractor): "content": "1e7dde4ee7d5f4c4b45749abfd15b2dbfa27df3f", "count": 3, }), + # video + ("https://www.reddit.com/r/aww/comments/90bu6w/", { + "pattern": r"ytdl:https://v.redd.it/gyh95hiqc0b11", + "count": 1, + }), + # video (ytdl) + ("https://www.reddit.com/r/aww/comments/90bu6w/", { + "options": (("videos", "ytdl"),), + "pattern": r"ytdl:https://www.reddit.com/r/aww/comments/90bu6w" + r"/heat_index_was_110_degrees_so_we_offered_him_a/", + "count": 1, + }), + # video (dash) + ("https://www.reddit.com/r/aww/comments/90bu6w/", { + "options": (("videos", "dash"),), + "pattern": r"ytdl:https://v.redd.it/gyh95hiqc0b11" + r"/DASHPlaylist.mpd\?a=", + "count": 1, + }), # deleted gallery (#953) ("https://www.reddit.com/gallery/icfgzv", 
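
The reddit rework above picks the video-URL strategy once in __init__, rebinding self._extract_video for "ytdl" or "dash", instead of re-testing the "videos" option for every submission. The dispatch pattern in isolation, with the URL-building bodies shortened:

    class Extractor:
        def __init__(self, videos=True):
            # the instance attribute shadows the class-level default
            if videos == "ytdl":
                self._extract_video = self._extract_video_ytdl
            elif videos == "dash":
                self._extract_video = self._extract_video_dash

        def _extract_video(self, submission):         # default strategy
            return submission["url"]

        def _extract_video_ytdl(self, submission):
            return "https://www.reddit.com" + submission["permalink"]

        def _extract_video_dash(self, submission):
            return submission["secure_media"]["reddit_video"]["dash_url"]

    e = Extractor("ytdl")
    assert e._extract_video({"permalink": "/r/x/1"}).endswith("/r/x/1")
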
{ "count": 0, diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py index ad4282c..eaaef7d 100644 --- a/gallery_dl/extractor/redgifs.py +++ b/gallery_dl/extractor/redgifs.py @@ -23,6 +23,7 @@ class RedgifsExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.key = match.group(1) + self.api = RedgifsAPI(self) formats = self.config("format") if formats is None: @@ -69,30 +70,89 @@ class RedgifsUserExtractor(RedgifsExtractor): """Extractor for redgifs user profiles""" subcategory = "user" directory_fmt = ("{category}", "{userName}") - pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/users/([^/?#]+)" - test = ("https://www.redgifs.com/users/Natalifiction", { - "pattern": r"https://\w+\.redgifs\.com/[A-Za-z]+\.mp4", - "count": ">= 100", - }) + pattern = r"(?:https?://)?(?:\w+\.)?redgifs\.com/users/([^/?#]+)/?$" + test = ( + ("https://www.redgifs.com/users/Natalifiction", { + "pattern": r"https://\w+\.redgifs\.com/[\w-]+\.mp4", + "count": ">= 100", + }), + ("https://v3.redgifs.com/users/lamsinka89", { + "pattern": r"https://\w+\.redgifs\.com/[\w-]+\.(mp4|jpg)", + "count": ">= 100", + }), + ) def metadata(self): return {"userName": self.key} def gifs(self): - return RedgifsAPI(self).user(self.key) + return self.api.user(self.key) + + +class RedgifsCollectionExtractor(RedgifsExtractor): + """Extractor for an individual user collection""" + subcategory = "collection" + directory_fmt = ("{category}", "{userName}", "{folderName}") + archive_fmt = "{folderId}_{id}" + pattern = (r"(?:https?://)?(?:www\.)?redgifs\.com/users" + r"/([^/?#]+)/collections/([^/?#]+)") + test = ( + ("https://www.redgifs.com/users/boombah123/collections/2631326bbd", { + "pattern": r"https://\w+\.redgifs\.com/[\w-]+\.mp4", + "range": "1-20", + "count": 20, + }), + ("https://www.redgifs.com/users/boombah123/collections/9e6f7dd41f", { + "pattern": r"https://\w+\.redgifs\.com/[\w-]+\.mp4", + "range": "1-20", + "count": 20, + }), + ) + + def __init__(self, match): + RedgifsExtractor.__init__(self, match) + self.collection_id = match.group(2) + + def metadata(self): + data = {"userName": self.key} + data.update(self.api.collection_info(self.key, self.collection_id)) + return data + + def gifs(self): + return self.api.collection(self.key, self.collection_id) + + +class RedgifsCollectionsExtractor(RedgifsExtractor): + """Extractor for redgifs user collections""" + subcategory = "collections" + pattern = (r"(?:https?://)?(?:www\.)?redgifs\.com/users" + r"/([^/?#]+)/collections/?$") + test = ("https://www.redgifs.com/users/boombah123/collections", { + "pattern": (r"https://www\.redgifs\.com/users" + r"/boombah123/collections/\w+"), + "count": ">= 3", + }) + + def items(self): + for collection in self.api.collections(self.key): + url = "{}/users/{}/collections/{}".format( + self.root, self.key, collection["folderId"]) + collection["_extractor"] = RedgifsCollectionExtractor + yield Message.Queue, url, collection class RedgifsSearchExtractor(RedgifsExtractor): """Extractor for redgifs search results""" subcategory = "search" directory_fmt = ("{category}", "Search", "{search}") - pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/browse/?\?([^#]+)" + pattern = r"(?:https?://)?(?:\w+\.)?redgifs\.com/browse/?\?([^#]+)" test = ( ("https://www.redgifs.com/browse?tags=JAV", { "pattern": r"https://\w+\.redgifs\.com/[A-Za-z-]+\.(mp4|jpg)", "range": "1-10", "count": 10, }), + ("https://v3.redgifs.com/browse?tags=JAV"), ("https://www.redgifs.com/browse?type=i&verified=y&order=top7"), ) @@ 
diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py
index f2bf3cb..278ad14 100644
--- a/gallery_dl/extractor/shopify.py
+++ b/gallery_dl/extractor/shopify.py
@@ -63,6 +63,10 @@ BASE_PATTERN = ShopifyExtractor.update({
         "root": "https://modcloth.com",
         "pattern": r"modcloth\.com",
     },
+    "ohpolly": {
+        "root": "https://www.ohpolly.com",
+        "pattern": r"(?:www\.)?ohpolly\.com",
+    },
     "omgmiamiswimwear": {
         "root": "https://www.omgmiamiswimwear.com",
         "pattern": r"(?:www\.)?omgmiamiswimwear\.com",
@@ -102,6 +106,7 @@ class ShopifyCollectionExtractor(ShopifyExtractor):
         ("https://loungeunderwear.com/collections/apparel"),
         ("https://michaels.com.au/collections/microphones"),
         ("https://modcloth.com/collections/shoes"),
+        ("https://www.ohpolly.com/collections/dresses-mini-dresses"),
         ("https://www.omgmiamiswimwear.com/collections/fajas"),
         ("https://pinupgirlclothing.com/collections/evening"),
         ("https://www.raidlondon.com/collections/flats"),
@@ -141,6 +146,8 @@ class ShopifyProductExtractor(ShopifyExtractor):
         ("https://michaels.com.au/collections/audio/products"
          "/boya-by-wm4-pro-k5-2-4ghz-mic-android-1-1-101281"),
         ("https://modcloth.com/collections/shoes/products/heidii-brn"),
+        (("https://www.ohpolly.com/products/edonia-ruched-triangle-cup"
+          "-a-line-mini-dress-brown")),
         ("https://www.omgmiamiswimwear.com/products/la-medusa-maxi-dress", {
            "pattern": r"https://cdn\.shopify\.com/s/files/1/1819/6171/",
             "count": 5,
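The slideshare, subscribestar, vsco, weibo, wikifeet, xhamster, and xvideos hunks below all repeat the substitution seen earlier: the module-level "import json" goes away in favor of util.json_loads. The diff does not show util.py itself, so the following is only a hypothetical stand-in for what such a wrapper might look like — a shared decoder instance, giving one central place to tune JSON handling instead of scattered json.loads() calls:

    import json

    # hypothetical equivalent of gallery_dl.util.json_loads: one shared
    # JSONDecoder instead of json.loads() re-resolving options per call
    json_loads = json.JSONDecoder().decode

    data = json_loads('{"galleries": {"g1": {}}}')
    print(data["galleries"])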
r"https://cdn\.shopify\.com/s/files/1/1819/6171/", "count": 5, diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py index 506db26..bea457f 100644 --- a/gallery_dl/extractor/slideshare.py +++ b/gallery_dl/extractor/slideshare.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2022 Mike Fährmann, Leonardo Taccari +# Copyright 2016-2023 Mike Fährmann, Leonardo Taccari # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,8 +9,7 @@ """Extractors for https://www.slideshare.net/""" from .common import GalleryExtractor -from .. import text -import json +from .. import text, util class SlidesharePresentationExtractor(GalleryExtractor): @@ -97,7 +96,7 @@ class SlidesharePresentationExtractor(GalleryExtractor): @staticmethod def images(page): - data = json.loads(text.extract( + data = util.json_loads(text.extract( page, "xtend(true, slideshare_object.slideshow_config, ", ");")[0]) # useing 'stripped_title' here is technically wrong, but it works all diff --git a/gallery_dl/extractor/soundgasm.py b/gallery_dl/extractor/soundgasm.py index 1afb92c..236f94f 100644 --- a/gallery_dl/extractor/soundgasm.py +++ b/gallery_dl/extractor/soundgasm.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2022 Mike Fährmann +# Copyright 2022-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -11,17 +11,46 @@ from .common import Extractor, Message from .. import text +BASE_PATTERN = r"(?:https?://)?(?:www\.)?soundgasm\.net/u(?:ser)?" -class SoundgasmAudioExtractor(Extractor): - """Extractor for audio clips from soundgasm.net""" + +class SoundgasmExtractor(Extractor): + """Base class for soundgasm extractors""" category = "soundgasm" - subcategory = "audio" root = "https://soundgasm.net" + request_interval = (0.5, 1.5) directory_fmt = ("{category}", "{user}") filename_fmt = "{title}.{extension}" archive_fmt = "{user}_{slug}" - pattern = (r"(?:https?://)?(?:www\.)?soundgasm\.net" - r"/u(?:ser)?/([^/?#]+)/([^/?#]+)") + + def items(self): + for sound in map(self._extract_sound, self.sounds()): + url = sound["url"] + yield Message.Directory, sound + yield Message.Url, url, text.nameext_from_url(url, sound) + + def _extract_sound(self, url): + extr = text.extract_from(self.request(url).text) + + _, user, slug = url.rstrip("/").rsplit("/", 2) + data = { + "user" : user, + "slug" : slug, + "title": text.unescape(extr('aria-label="title">', "<")), + "description": text.unescape(text.remove_html(extr( + 'class="jp-description">', '</div>'))), + } + + formats = extr('"setMedia", {', '}') + data["url"] = text.extr(formats, ': "', '"') + + return data + + +class SoundgasmAudioExtractor(SoundgasmExtractor): + """Extractor for audio clips from soundgasm.net""" + subcategory = "audio" + pattern = BASE_PATTERN + r"/([^/?#]+)/([^/?#]+)" test = ( (("https://soundgasm.net/u/ClassWarAndPuppies2" "/687-Otto-von-Toontown-12822"), { @@ -47,47 +76,39 @@ class SoundgasmAudioExtractor(Extractor): ) def __init__(self, match): - Extractor.__init__(self, match) + SoundgasmExtractor.__init__(self, match) self.user, self.slug = match.groups() - def items(self): - url = "{}/u/{}/{}".format(self.root, self.user, self.slug) - extr = text.extract_from(self.request(url).text) + def sounds(self): + return ("{}/u/{}/{}".format(self.root, self.user, self.slug),) - data = { - "user" : self.user, - "slug" : 
diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py
index ea39c5e..4de7e9b 100644
--- a/gallery_dl/extractor/subscribestar.py
+++ b/gallery_dl/extractor/subscribestar.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2020-2022 Mike Fährmann
+# Copyright 2020-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -9,9 +9,8 @@
 """Extractors for https://www.subscribestar.com/"""
 
 from .common import Extractor, Message
-from .. import text, exception
+from .. import text, util, exception
 from ..cache import cache
-import json
 
 BASE_PATTERN = r"(?:https?://)?(?:www\.)?subscribestar\.(com|adult)"
 
@@ -92,7 +91,7 @@ class SubscribestarExtractor(Extractor):
         gallery = text.extr(html, 'data-gallery="', '"')
         if gallery:
             media.extend(
-                item for item in json.loads(text.unescape(gallery))
+                item for item in util.json_loads(text.unescape(gallery))
                 if "/previews/" not in item["url"]
             )
 
diff --git a/gallery_dl/extractor/szurubooru.py b/gallery_dl/extractor/szurubooru.py
new file mode 100644
index 0000000..4b15b14
--- /dev/null
+++ b/gallery_dl/extractor/szurubooru.py
@@ -0,0 +1,144 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2023 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for szurubooru instances"""
+
+from . import booru
+from .. import text
+
+import collections
+import binascii
+
+
+class SzurubooruExtractor(booru.BooruExtractor):
+    basecategory = "szurubooru"
+    filename_fmt = "{id}_{version}_{checksumMD5}.{extension}"
+    per_page = 100
+
+    def __init__(self, match):
+        booru.BooruExtractor.__init__(self, match)
+        self.headers = {
+            "Accept": "application/json",
+            "Content-Type": "application/json",
+        }
+
+        username = self.config("username")
+        if username:
+            token = self.config("token")
+            if token:
+                value = username + ":" + token
+                self.headers["Authorization"] = "Token " + \
+                    binascii.b2a_base64(value.encode())[:-1].decode()
+
+    def _api_request(self, endpoint, params=None):
+        url = self.root + "/api" + endpoint
+        return self.request(url, headers=self.headers, params=params).json()
+
+    def _pagination(self, endpoint, params):
+        params["offset"] = 0
+        params["limit"] = self.per_page
+
+        while True:
+            data = self._api_request(endpoint, params)
+            results = data["results"]
+
+            yield from results
+
+            if len(results) < self.per_page:
+                return
+            params["offset"] += len(results)
+
+    def _file_url(self, post):
+        url = post["contentUrl"]
+        if not url.startswith("http"):
+            url = self.root + "/" + url
+        return url
+
+    @staticmethod
+    def _prepare(post):
+        post["date"] = text.parse_datetime(
+            post["creationTime"], "%Y-%m-%dT%H:%M:%S.%fZ")
+
+        tags = []
+        append = tags.append
+        tags_categories = collections.defaultdict(list)
+
+        for tag in post["tags"]:
+            tag_type = tag["category"].rpartition("_")[2]
+            tag_name = tag["names"][0]
+            tags_categories[tag_type].append(tag_name)
+            append(tag_name)
+
+        post["tags"] = tags
+        for category, tags in tags_categories.items():
+            post["tags_" + category] = tags
+
+
+BASE_PATTERN = SzurubooruExtractor.update({
+    "foalcon": {
+        "root": "https://booru.foalcon.com",
+        "pattern": r"booru\.foalcon\.com",
+    },
+    "bcbnsfw": {
+        "root": "https://booru.bcbnsfw.space",
+        "pattern": r"booru\.bcbnsfw\.space",
+    },
+})
+
+
+class SzurubooruTagExtractor(SzurubooruExtractor):
+    subcategory = "tag"
+    directory_fmt = ("{category}", "{search_tags}")
+    archive_fmt = "t_{search_tags}_{id}_{version}"
+    pattern = BASE_PATTERN + r"/posts/query=([^/?#]+)"
+    test = (
+        ("https://booru.foalcon.com/posts/query=simple_background", {
+            "pattern": r"https://booru\.foalcon\.com/data/posts"
+                       r"/\d+_[0-9a-f]{16}\.\w+",
+            "range": "1-150",
+            "count": 150,
+        }),
+        ("https://booru.bcbnsfw.space/posts/query=simple_background"),
+    )
+
+    def __init__(self, match):
+        SzurubooruExtractor.__init__(self, match)
+        query = match.group(match.lastindex)
+        self.query = text.unquote(query.replace("+", " "))
+
+    def metadata(self):
+        return {"search_tags": self.query}
+
+    def posts(self):
+        return self._pagination("/posts/", {"query": self.query})
+
+
+class SzurubooruPostExtractor(SzurubooruExtractor):
+    subcategory = "post"
+    archive_fmt = "{id}_{version}"
+    pattern = BASE_PATTERN + r"/post/(\d+)"
+    test = (
+        ("https://booru.foalcon.com/post/30092", {
+            "pattern": r"https://booru\.foalcon\.com/data/posts"
+                       r"/30092_b7d56e941888b624\.png",
+            "url": "dad4d4c67d87cd9a4ac429b3414747c27a95d5cb",
+            "content": "86d1514c0ca8197950cc4b74e7a59b2dc76ebf9c",
+        }),
+        ("https://booru.bcbnsfw.space/post/1599", {
+            "pattern": r"https://booru\.bcbnsfw\.space/data/posts"
+                       r"/1599_53784518e92086bd\.png",
+            "content": "0c38fc612ba1f03950fad31c4f80a1fccdab1096",
+        }),
+    )
+
+    def __init__(self, match):
+        SzurubooruExtractor.__init__(self, match)
+        self.post_id = match.group(match.lastindex)
+
+    def posts(self):
+        return (self._api_request("/post/" + self.post_id),)
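The new szurubooru extractor authenticates with a user token sent as "Authorization: Token base64(username + ':' + token)", which is what the binascii.b2a_base64(...)[:-1] expression above builds (b2a_base64 appends a trailing newline that has to be sliced off), and it pages through results with the offset/limit scheme of _pagination(). A hedged standalone equivalent using the stdlib base64 module and requests — the instance URL and helper names are illustrative only:

    import base64
    import requests

    def auth_header(username, token):
        # same value as binascii.b2a_base64(...)[:-1], minus the newline
        # slice, since base64.b64encode never appends one
        value = base64.b64encode("{}:{}".format(username, token).encode())
        return {"Accept": "application/json",
                "Authorization": "Token " + value.decode()}

    def posts(root, query, headers, limit=100):
        # offset/limit pagination as in SzurubooruExtractor._pagination:
        # a short page signals the end of the result set
        offset = 0
        while True:
            resp = requests.get(
                root + "/api/posts/",
                params={"query": query, "offset": offset, "limit": limit},
                headers=headers).json()
            results = resp["results"]
            yield from results
            if len(results) < limit:
                return
            offset += len(results)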
diff --git a/gallery_dl/extractor/telegraph.py b/gallery_dl/extractor/telegraph.py
index 5996268..116f3af 100644
--- a/gallery_dl/extractor/telegraph.py
+++ b/gallery_dl/extractor/telegraph.py
@@ -68,6 +68,21 @@ class TelegraphGalleryExtractor(GalleryExtractor):
                 "title": "Всё о друзьях моей сестрицы",
             },
         }),
+        ("https://telegra.ph/Disharmonica---Saber-Nero-02-21", {
+            "pattern": r"https://telegra\.ph/file/[0-9a-f]+\.(jpg|png)",
+            "keyword": {
+                "author": "cosmos",
+                "caption": "",
+                "count": 89,
+                "date": "dt:2022-02-21 05:57:39",
+                "description": "",
+                "num_formatted": r"re:^\d{2}$",
+                "post_url": "https://telegra.ph"
+                            "/Disharmonica---Saber-Nero-02-21",
+                "slug": "Disharmonica---Saber-Nero-02-21",
+                "title": "Disharmonica - Saber Nero",
+            },
+        }),
     )
 
     def metadata(self, page):
@@ -89,7 +104,8 @@ class TelegraphGalleryExtractor(GalleryExtractor):
         return data
 
     def images(self, page):
-        figures = tuple(text.extract_iter(page, "<figure>", "</figure>"))
+        figures = (tuple(text.extract_iter(page, "<figure>", "</figure>")) or
+                   tuple(text.extract_iter(page, "<img", ">")))
         num_zeroes = len(str(len(figures)))
         num = 0
 
@@ -105,7 +121,7 @@ class TelegraphGalleryExtractor(GalleryExtractor):
 
             result.append((url, {
                 "url"          : url,
-                "caption"      : text.unescape(caption),
+                "caption"      : text.unescape(caption) if caption else "",
                 "num"          : num,
                 "num_formatted": str(num).zfill(num_zeroes),
             }))
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index c75952a..155db1e 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -286,7 +286,11 @@ class TumblrUserExtractor(TumblrExtractor):
             "count": 3,
             "options": (("posts", "all"), ("external", True))
         }),
-        ("https://mikf123-hidden.tumblr.com/", {  # dashbord-only
+        ("https://mikf123-hidden.tumblr.com/", {  # dashboard-only
+            "options": (("access-token", None),),
+            "exception": exception.AuthorizationError,
+        }),
+        ("https://mikf123-hidden.tumblr.com/", {  # dashboard-only
             "count": 2,
             "keyword": {"tags": ["test", "hidden"]},
         }),
@@ -498,12 +502,24 @@ class TumblrAPI(oauth.OAuth1API):
             if 200 <= status < 400:
                 return data["response"]
 
+            self.log.debug(data)
             if status == 403:
                 raise exception.AuthorizationError()
+
             elif status == 404:
+                try:
+                    error = data["errors"][0]["detail"]
+                    board = ("only viewable within the Tumblr dashboard" in error)
+                except Exception:
+                    board = False
+
+                if board:
+                    self.log.info("Run 'gallery-dl oauth:tumblr' "
+                                  "to access dashboard-only blogs")
+                    raise exception.AuthorizationError(error)
                 raise exception.NotFoundError("user or post")
-            elif status == 429:
 
+            elif status == 429:  # daily rate limit
                 if response.headers.get("x-ratelimit-perday-remaining") == "0":
                     self.log.info("Daily API rate limit exceeded")
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 17a2202..29b4ac3 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -41,6 +41,10 @@ class TwitterExtractor(Extractor):
         self.cards = self.config("cards", False)
         self.cards_blacklist = self.config("cards-blacklist")
         self.syndication = self.config("syndication")
+
+        if not self.config("transform", True):
+            self._transform_user = util.identity
+            self._transform_tweet = util.identity
         self._user = self._user_obj = None
         self._user_cache = {}
         self._init_sizes()
@@ -212,7 +216,7 @@ class TwitterExtractor(Extractor):
                 files.append(value)
             return
         elif name == "unified_card":
-            data = json.loads(bvals["unified_card"]["string_value"])
+            data = util.json_loads(bvals["unified_card"]["string_value"])
             self._extract_media(
                 tweet, data["media_entities"].values(), files)
             return
@@ -1436,6 +1440,8 @@ class TwitterAPI():
 
         if "retweeted_status_result" in legacy:
             retweet = legacy["retweeted_status_result"]["result"]
+            if "tweet" in retweet:
+                retweet = retweet["tweet"]
             if original_retweets:
                 try:
                     retweet["legacy"]["retweeted_status_id_str"] = \
diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py
index 00389fa..053a799 100644
--- a/gallery_dl/extractor/vsco.py
+++ b/gallery_dl/extractor/vsco.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2019-2022 Mike Fährmann
+# Copyright 2019-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -9,8 +9,7 @@
 """Extractors for https://vsco.co/"""
 
 from .common import Extractor, Message
-from .. import text
-import json
+from .. import text, util
 
 BASE_PATTERN = r"(?:https?://)?(?:www\.)?vsco\.co/([^/]+)"
 
@@ -69,7 +68,7 @@ class VscoExtractor(Extractor):
 
     def _extract_preload_state(self, url):
         page = self.request(url, notfound=self.subcategory).text
-        return json.loads(text.extr(page, "__PRELOADED_STATE__ = ", "<"))
+        return util.json_loads(text.extr(page, "__PRELOADED_STATE__ = ", "<"))
 
     def _pagination(self, url, params, token, key, extra=None):
         headers = {
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index ab05c48..68bd136 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2019-2022 Mike Fährmann
+# Copyright 2019-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -9,10 +9,9 @@
 """Extractors for https://www.weibo.com/"""
 
 from .common import Extractor, Message
-from .. import text, exception
+from .. import text, util, exception
 from ..cache import cache
 import random
-import json
 
 BASE_PATTERN = r"(?:https?://)?(?:www\.|m\.)?weibo\.c(?:om|n)"
 USER_PATTERN = BASE_PATTERN + r"/(?:(u|n|p(?:rofile)?)/)?([^/?#]+)(?:/home)?"
@@ -179,7 +178,7 @@ class WeiboExtractor(Extractor):
             page = Extractor.request(
                 self, passport_url, method="POST",
                 headers=headers, data=data).text
-            data = json.loads(text.extr(page, "(", ");"))["data"]
+            data = util.json_loads(text.extr(page, "(", ");"))["data"]
 
             passport_url = "https://passport.weibo.com/visitor/visitor"
             params = {
diff --git a/gallery_dl/extractor/wikifeet.py b/gallery_dl/extractor/wikifeet.py
index 70e9646..662e08b 100644
--- a/gallery_dl/extractor/wikifeet.py
+++ b/gallery_dl/extractor/wikifeet.py
@@ -7,8 +7,7 @@
 """Extractors for https://www.wikifeet.com/"""
 
 from .common import GalleryExtractor
-from .. import text
-import json
+from .. import text, util
 
 
 class WikifeetGalleryExtractor(GalleryExtractor):
@@ -114,5 +113,5 @@ class WikifeetGalleryExtractor(GalleryExtractor):
                 "height": data["ph"],
                 "tags"  : [tagmap[tag] for tag in data["tags"]],
             })
-            for data in json.loads(text.extr(page, "['gdata'] = ", ";"))
+            for data in util.json_loads(text.extr(page, "['gdata'] = ", ";"))
         ]
diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py
index 0125739..b308e74 100644
--- a/gallery_dl/extractor/xhamster.py
+++ b/gallery_dl/extractor/xhamster.py
@@ -9,9 +9,7 @@
 """Extractors for https://xhamster.com/"""
 
 from .common import Extractor, Message
-from .. import text
-import json
-
+from .. import text, util
 
 BASE_PATTERN = (r"(?:https?://)?((?:[\w-]+\.)?xhamster"
                 r"(?:\d?\.(?:com|one|desi)|\.porncache\.net))")
@@ -144,7 +142,7 @@ class XhamsterGalleryExtractor(XhamsterExtractor):
 
     def _data(self, url):
         page = self.request(url).text
-        return json.loads(text.extr(
+        return util.json_loads(text.extr(
             page, "window.initials=", "</script>").rstrip("\n\r;"))
 
 
diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py
index 10de439..46ea074 100644
--- a/gallery_dl/extractor/xvideos.py
+++ b/gallery_dl/extractor/xvideos.py
@@ -9,8 +9,7 @@
 """Extractors for https://www.xvideos.com/"""
 
 from .common import GalleryExtractor, Extractor, Message
-from .. import text
-import json
+from .. import text, util
 
 
 class XvideosBase():
@@ -113,7 +112,7 @@ class XvideosUserExtractor(XvideosBase, Extractor):
     def items(self):
         url = "{}/profiles/{}".format(self.root, self.user)
         page = self.request(url, notfound=self.subcategory).text
-        data = json.loads(text.extr(
+        data = util.json_loads(text.extr(
             page, "xv.conf=", ";</script>"))["data"]
 
         if not isinstance(data["galleries"], dict):
