| author | 2025-10-07 02:11:52 -0400 |
|---|---|
| committer | 2025-10-07 02:11:52 -0400 |
| commit | 83e1e051b8c0e622ef5f61c1955c47b4bde95b57 |
| tree | 544a434cb398d2adb8b8a2d553dc1c9a44b4ee1d /gallery_dl/extractor |
| parent | f1612851ae9fe68c7444fb31e786503868aeaa7c |
| parent | bbe7fac03d881662a458e7fbf870c9d71f5257f4 |
Update upstream source from tag 'upstream/1.30.9'
Update to upstream version '1.30.9'
with Debian dir 46cc56e13f05f4465cc64f67b4d7b775a95bd87a
Diffstat (limited to 'gallery_dl/extractor')
| -rw-r--r-- | gallery_dl/extractor/__init__.py | 3 |
| -rw-r--r-- | gallery_dl/extractor/chevereto.py | 20 |
| -rw-r--r-- | gallery_dl/extractor/imagehosts.py | 14 |
| -rw-r--r-- | gallery_dl/extractor/instagram.py | 75 |
| -rw-r--r-- | gallery_dl/extractor/mangadex.py | 119 |
| -rw-r--r-- | gallery_dl/extractor/mangafire.py | 168 |
| -rw-r--r-- | gallery_dl/extractor/mangareader.py | 173 |
| -rw-r--r-- | gallery_dl/extractor/misskey.py | 6 |
| -rw-r--r-- | gallery_dl/extractor/nozomi.py | 2 |
| -rw-r--r-- | gallery_dl/extractor/paheal.py | 16 |
| -rw-r--r-- | gallery_dl/extractor/patreon.py | 37 |
| -rw-r--r-- | gallery_dl/extractor/pixiv.py | 44 |
| -rw-r--r-- | gallery_dl/extractor/s3ndpics.py | 101 |
| -rw-r--r-- | gallery_dl/extractor/schalenetwork.py | 57 |
| -rw-r--r-- | gallery_dl/extractor/simpcity.py | 7 |
| -rw-r--r-- | gallery_dl/extractor/thehentaiworld.py | 26 |
| -rw-r--r-- | gallery_dl/extractor/twitter.py | 3 |
| -rw-r--r-- | gallery_dl/extractor/weibo.py | 36 |
| -rw-r--r-- | gallery_dl/extractor/wikimedia.py | 34 |
| -rw-r--r-- | gallery_dl/extractor/zerochan.py | 35 |
20 files changed, 801 insertions, 175 deletions
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index abdb6cc..a3df634 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -115,11 +115,13 @@ modules = [
     "lynxchan",
     "madokami",
     "mangadex",
+    "mangafire",
     "mangafox",
     "mangahere",
     "manganelo",
     "mangapark",
     "mangaread",
+    "mangareader",
     "mangataro",
     "mangoxo",
     "misskey",
@@ -166,6 +168,7 @@ modules = [
     "rule34us",
     "rule34vault",
     "rule34xyz",
+    "s3ndpics",
     "saint",
     "sankaku",
     "sankakucomplex",
diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py
index 67fdb39..1552899 100644
--- a/gallery_dl/extractor/chevereto.py
+++ b/gallery_dl/extractor/chevereto.py
@@ -40,19 +40,15 @@ class CheveretoExtractor(BaseExtractor):
 BASE_PATTERN = CheveretoExtractor.update({
     "jpgfish": {
         "root": "https://jpg6.su",
-        "pattern": r"jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)",
-    },
-    "imgkiwi": {
-        "root": "https://img.kiwi",
-        "pattern": r"img\.kiwi",
+        "pattern": r"(?:www\.)?jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)",
     },
     "imagepond": {
         "root": "https://imagepond.net",
-        "pattern": r"imagepond\.net",
+        "pattern": r"(?:www\.)?imagepond\.net",
     },
     "imglike": {
         "root": "https://imglike.com",
-        "pattern": r"imglike\.com",
+        "pattern": r"(?:www\.)?imglike\.com",
     },
 })

@@ -79,7 +75,7 @@ class CheveretoImageExtractor(CheveretoExtractor):
             fromhex=True)

         file = {
-            "id"   : self.path.rpartition(".")[2],
+            "id"   : self.path.rpartition("/")[2].rpartition(".")[2],
             "url"  : url,
             "album": text.remove_html(extr(
                 "Added to <a", "</a>").rpartition(">")[2]),
@@ -144,7 +140,8 @@ class CheveretoAlbumExtractor(CheveretoExtractor):

     def items(self):
         url = self.root + self.path
-        data = {"_extractor": CheveretoImageExtractor}
+        data_image = {"_extractor": CheveretoImageExtractor}
+        data_video = {"_extractor": CheveretoVideoExtractor}

         if self.path.endswith("/sub"):
             albums = self._pagination(url)
@@ -152,8 +149,9 @@ class CheveretoAlbumExtractor(CheveretoExtractor):
             albums = (url,)

         for album in albums:
-            for image in self._pagination(album):
-                yield Message.Queue, image, data
+            for item_url in self._pagination(album):
+                data = data_video if "/video/" in item_url else data_image
+                yield Message.Queue, item_url, data


 class CheveretoCategoryExtractor(CheveretoExtractor):
diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py
index fccc466..817d2c4 100644
--- a/gallery_dl/extractor/imagehosts.py
+++ b/gallery_dl/extractor/imagehosts.py
@@ -125,8 +125,18 @@ class ImxtoGalleryExtractor(ImagehostImageExtractor):
             "title": text.unescape(title.partition(">")[2]).strip(),
         }

-        for url in text.extract_iter(page, "<a href=", " ", pos):
-            yield Message.Queue, url.strip("\"'"), data
+        params = {"page": 1}
+        while True:
+            for url in text.extract_iter(page, "<a href=", " ", pos):
+                if "/i/" in url:
+                    yield Message.Queue, url.strip("\"'"), data
+
+            if 'class="pagination' not in page or \
+                    'class="disabled">Last' in page:
+                return
+
+            params["page"] += 1
+            page = self.request(self.page_url, params=params).text


 class AcidimgImageExtractor(ImagehostImageExtractor):
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 00e06b5..0e6c480 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -39,7 +39,6 @@ class InstagramExtractor(Extractor):
         self.www_claim = "0"
         self.csrf_token = util.generate_token()
         self._find_tags = util.re(r"#\w+").findall
-        self._warn_video_ua = True
         self._logged_in = True
         self._cursor = None
         self._user = None
@@ -52,6 +51,12 @@ class InstagramExtractor(Extractor):
         else:
             self.api = InstagramRestAPI(self)

+        self._warn_video = True if self.config("warn-videos", True) else False
+        self._warn_image = (
+            9 if not (wi := self.config("warn-images", True)) else
+            1 if wi in ("all", "both") else
+            0)
+
     def items(self):
         self.login()

@@ -172,6 +177,7 @@ class InstagramExtractor(Extractor):
             "post_id": reel_id,
             "post_shortcode": shortcode_from_id(reel_id),
             "post_url": post_url,
+            "type": "story" if expires else "highlight",
         }
         if "title" in post:
             data["highlight_title"] = post["title"]
@@ -182,7 +188,6 @@ class InstagramExtractor(Extractor):
         data = {
             "post_id" : post["pk"],
             "post_shortcode": post["code"],
-            "post_url": f"{self.root}/p/{post['code']}/",
             "likes": post.get("like_count", 0),
             "liked": post.get("has_liked", False),
             "pinned": self._extract_pinned(post),
@@ -239,8 +244,8 @@ class InstagramExtractor(Extractor):
             manifest = item.get("video_dash_manifest")
             media = video

-            if self._warn_video_ua:
-                self._warn_video_ua = False
+            if self._warn_video:
+                self._warn_video = False
                 pattern = text.re(
                     r"Chrome/\d{3,}\.\d+\.\d+\.\d+(?!\d* Mobile)")
                 if not pattern.search(self.session.headers["User-Agent"]):
@@ -250,8 +255,9 @@ class InstagramExtractor(Extractor):
                 video = manifest = None
                 media = image

-        if image["width"] < item.get("original_width", 0) or \
-                image["height"] < item.get("original_height", 0):
+        if self._warn_image < (
+                (image["width"] < item.get("original_width", 0)) +
+                (image["height"] < item.get("original_height", 0))):
             self.log.warning(
                 "%s: Available image resolutions lower than the "
                 "original (%sx%s < %sx%s). "
@@ -278,7 +284,7 @@ class InstagramExtractor(Extractor):
             if manifest is not None:
                 media["_ytdl_manifest_data"] = manifest
             if "owner" in item:
-                media["owner2"] = item["owner"]
+                media["owner"] = item["owner"]
             if "reshared_story_media_author" in item:
                 media["author"] = item["reshared_story_media_author"]
             if "expiring_at" in item:
@@ -287,6 +293,14 @@ class InstagramExtractor(Extractor):
                 self._extract_tagged_users(item, media)
             files.append(media)

+        if "type" not in data:
+            if len(files) == 1 and files[0]["video_url"]:
+                data["type"] = "reel"
+                data["post_url"] = f"{self.root}/reel/{post['code']}/"
+            else:
+                data["type"] = "post"
+                data["post_url"] = f"{self.root}/p/{post['code']}/"
+
         return data

     def _parse_post_graphql(self, post):
@@ -443,6 +457,32 @@ class InstagramExtractor(Extractor):
                 user[key] = 0


+class InstagramPostExtractor(InstagramExtractor):
+    """Extractor for an Instagram post"""
+    subcategory = "post"
+    pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
+               r"/(?:share/()|[^/?#]+/)?(?:p|tv|reels?())/([^/?#]+)")
+    example = "https://www.instagram.com/p/abcdefg/"
+
+    def __init__(self, match):
+        if match[2] is not None:
+            self.subcategory = "reel"
+        InstagramExtractor.__init__(self, match)
+
+    def posts(self):
+        share, reel, shortcode = self.groups
+        if share is not None:
+            url = text.ensure_http_scheme(self.url)
+            headers = {
+                "Sec-Fetch-Dest": "empty",
+                "Sec-Fetch-Mode": "navigate",
+                "Sec-Fetch-Site": "same-origin",
+            }
+            location = self.request_location(url, headers=headers)
+            shortcode = location.split("/")[-2]
+        return self.api.media(shortcode)
+
+
 class InstagramUserExtractor(Dispatch, InstagramExtractor):
     """Extractor for an Instagram user profile"""
     pattern = USER_PATTERN + r"/?(?:$|[?#])"
@@ -740,27 +780,6 @@ class InstagramAvatarExtractor(InstagramExtractor):
         },)


-class InstagramPostExtractor(InstagramExtractor):
-    """Extractor for an Instagram post"""
subcategory = "post" - pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" - r"/(?:share/()|[^/?#]+/)?(?:p|tv|reel)/([^/?#]+)") - example = "https://www.instagram.com/p/abcdefg/" - - def posts(self): - share, shortcode = self.groups - if share is not None: - url = text.ensure_http_scheme(self.url) - headers = { - "Sec-Fetch-Dest": "empty", - "Sec-Fetch-Mode": "navigate", - "Sec-Fetch-Site": "same-origin", - } - location = self.request_location(url, headers=headers) - shortcode = location.split("/")[-2] - return self.api.media(shortcode) - - class InstagramRestAPI(): def __init__(self, extractor): diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index fbed328..30d6848 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -39,7 +39,7 @@ class MangadexExtractor(Extractor): data = self._transform(chapter) data["_extractor"] = MangadexChapterExtractor self._cache[uuid] = data - yield Message.Queue, self.root + "/chapter/" + uuid, data + yield Message.Queue, f"{self.root}/chapter/{uuid}", data def _items_manga(self): data = {"_extractor": MangadexMangaExtractor} @@ -51,13 +51,8 @@ class MangadexExtractor(Extractor): relationships = defaultdict(list) for item in chapter["relationships"]: relationships[item["type"]].append(item) - manga = self.api.manga(relationships["manga"][0]["id"]) - for item in manga["relationships"]: - relationships[item["type"]].append(item) cattributes = chapter["attributes"] - mattributes = manga["attributes"] - if lang := cattributes.get("translatedLanguage"): lang = lang.partition("-")[0] @@ -66,35 +61,21 @@ class MangadexExtractor(Extractor): else: chnum, sep, minor = 0, "", "" - data = { - "manga" : (mattributes["title"].get("en") or - next(iter(mattributes["title"].values()))), - "manga_id": manga["id"], + return { + **_manga_info(self, relationships["manga"][0]["id"]), "title" : cattributes["title"], "volume" : text.parse_int(cattributes["volume"]), "chapter" : text.parse_int(chnum), - "chapter_minor": sep + minor, + "chapter_minor": f"{sep}{minor}", "chapter_id": chapter["id"], "date" : text.parse_datetime(cattributes["publishAt"]), + "group" : [group["attributes"]["name"] + for group in relationships["scanlation_group"]], "lang" : lang, - "language": util.code_to_language(lang), "count" : cattributes["pages"], "_external_url": cattributes.get("externalUrl"), } - data["artist"] = [artist["attributes"]["name"] - for artist in relationships["artist"]] - data["author"] = [author["attributes"]["name"] - for author in relationships["author"]] - data["group"] = [group["attributes"]["name"] - for group in relationships["scanlation_group"]] - - data["status"] = mattributes["status"] - data["tags"] = [tag["attributes"]["name"]["en"] - for tag in mattributes["tags"]] - - return data - class MangadexCoversExtractor(MangadexExtractor): """Extractor for mangadex manga covers""" @@ -103,7 +84,7 @@ class MangadexCoversExtractor(MangadexExtractor): filename_fmt = "{volume:>02}_{lang}.{extension}" archive_fmt = "c_{cover_id}" pattern = (rf"{BASE_PATTERN}/(?:title|manga)/(?!follows|feed$)([0-9a-f-]+)" - r"(?:/[^/?#]+)?\?tab=art") + rf"(?:/[^/?#]+)?\?tab=art") example = ("https://mangadex.org/title" "/01234567-89ab-cdef-0123-456789abcdef?tab=art") @@ -121,24 +102,10 @@ class MangadexCoversExtractor(MangadexExtractor): relationships = defaultdict(list) for item in cover["relationships"]: relationships[item["type"]].append(item) - manga = self.api.manga(relationships["manga"][0]["id"]) - for item in 
manga["relationships"]: - relationships[item["type"]].append(item) - cattributes = cover["attributes"] - mattributes = manga["attributes"] return { - "manga" : (mattributes["title"].get("en") or - next(iter(mattributes["title"].values()))), - "manga_id": manga["id"], - "status" : mattributes["status"], - "author" : [author["attributes"]["name"] - for author in relationships["author"]], - "artist" : [artist["attributes"]["name"] - for artist in relationships["artist"]], - "tags" : [tag["attributes"]["name"]["en"] - for tag in mattributes["tags"]], + **_manga_info(self, relationships["manga"][0]["id"]), "cover" : cattributes["fileName"], "lang" : cattributes.get("locale"), "volume" : text.parse_int(cattributes["volume"]), @@ -150,7 +117,7 @@ class MangadexCoversExtractor(MangadexExtractor): class MangadexChapterExtractor(MangadexExtractor): """Extractor for manga-chapters from mangadex.org""" subcategory = "chapter" - pattern = BASE_PATTERN + r"/chapter/([0-9a-f-]+)" + pattern = rf"{BASE_PATTERN}/chapter/([0-9a-f-]+)" example = ("https://mangadex.org/chapter" "/01234567-89ab-cdef-0123-456789abcdef") @@ -177,13 +144,13 @@ class MangadexChapterExtractor(MangadexExtractor): "page-reverse") else enumerate for data["page"], page in enum(chapter["data"], 1): text.nameext_from_url(page, data) - yield Message.Url, base + page, data + yield Message.Url, f"{base}{page}", data class MangadexMangaExtractor(MangadexExtractor): """Extractor for manga from mangadex.org""" subcategory = "manga" - pattern = BASE_PATTERN + r"/(?:title|manga)/(?!follows|feed$)([0-9a-f-]+)" + pattern = rf"{BASE_PATTERN}/(?:title|manga)/(?!follows|feed$)([0-9a-f-]+)" example = ("https://mangadex.org/title" "/01234567-89ab-cdef-0123-456789abcdef") @@ -194,7 +161,7 @@ class MangadexMangaExtractor(MangadexExtractor): class MangadexFeedExtractor(MangadexExtractor): """Extractor for chapters from your Updates Feed""" subcategory = "feed" - pattern = BASE_PATTERN + r"/titles?/feed$()" + pattern = rf"{BASE_PATTERN}/titles?/feed$()" example = "https://mangadex.org/title/feed" def chapters(self): @@ -204,7 +171,7 @@ class MangadexFeedExtractor(MangadexExtractor): class MangadexFollowingExtractor(MangadexExtractor): """Extractor for followed manga from your Library""" subcategory = "following" - pattern = BASE_PATTERN + r"/titles?/follows(?:\?([^#]+))?$" + pattern = rf"{BASE_PATTERN}/titles?/follows(?:\?([^#]+))?$" example = "https://mangadex.org/title/follows" items = MangadexExtractor._items_manga @@ -216,8 +183,8 @@ class MangadexFollowingExtractor(MangadexExtractor): class MangadexListExtractor(MangadexExtractor): """Extractor for mangadex MDLists""" subcategory = "list" - pattern = (BASE_PATTERN + - r"/list/([0-9a-f-]+)(?:/[^/?#]*)?(?:\?tab=(\w+))?") + pattern = (rf"{BASE_PATTERN}" + rf"/list/([0-9a-f-]+)(?:/[^/?#]*)?(?:\?tab=(\w+))?") example = ("https://mangadex.org/list" "/01234567-89ab-cdef-0123-456789abcdef/NAME") @@ -242,7 +209,7 @@ class MangadexListExtractor(MangadexExtractor): class MangadexAuthorExtractor(MangadexExtractor): """Extractor for mangadex authors""" subcategory = "author" - pattern = BASE_PATTERN + r"/author/([0-9a-f-]+)" + pattern = rf"{BASE_PATTERN}/author/([0-9a-f-]+)" example = ("https://mangadex.org/author" "/01234567-89ab-cdef-0123-456789abcdef/NAME") @@ -280,30 +247,30 @@ class MangadexAPI(): else text.ensure_http_scheme(server).rstrip("/")) def athome_server(self, uuid): - return self._call("/at-home/server/" + uuid) + return self._call(f"/at-home/server/{uuid}") def author(self, uuid, manga=False): 
params = {"includes[]": ("manga",)} if manga else None - return self._call("/author/" + uuid, params)["data"] + return self._call(f"/author/{uuid}", params)["data"] def chapter(self, uuid): params = {"includes[]": ("scanlation_group",)} - return self._call("/chapter/" + uuid, params)["data"] + return self._call(f"/chapter/{uuid}", params)["data"] def covers_manga(self, uuid): params = {"manga[]": uuid} return self._pagination_covers("/cover", params) def list(self, uuid): - return self._call("/list/" + uuid, None, True)["data"] + return self._call(f"/list/{uuid}", None, True)["data"] def list_feed(self, uuid): - return self._pagination_chapters("/list/" + uuid + "/feed", None, True) + return self._pagination_chapters(f"/list/{uuid}/feed", None, True) @memcache(keyarg=1) def manga(self, uuid): params = {"includes[]": ("artist", "author")} - return self._call("/manga/" + uuid, params)["data"] + return self._call(f"/manga/{uuid}", params)["data"] def manga_author(self, uuid_author): params = {"authorOrArtist": uuid_author} @@ -315,7 +282,7 @@ class MangadexAPI(): "order[volume]" : order, "order[chapter]": order, } - return self._pagination_chapters("/manga/" + uuid + "/feed", params) + return self._pagination_chapters(f"/manga/{uuid}/feed", params) def user_follows_manga(self): params = {"contentRating": None} @@ -366,17 +333,17 @@ class MangadexAPI(): _refresh_token_cache.update( (username, "personal"), data["refresh_token"]) - return "Bearer " + access_token + return f"Bearer {access_token}" @cache(maxage=900, keyarg=1) def _authenticate_impl_legacy(self, username, password): if refresh_token := _refresh_token_cache(username): self.extractor.log.info("Refreshing access token") - url = self.root + "/auth/refresh" + url = f"{self.root}/auth/refresh" json = {"token": refresh_token} else: self.extractor.log.info("Logging in as %s", username) - url = self.root + "/auth/login" + url = f"{self.root}/auth/login" json = {"username": username, "password": password} self.extractor.log.debug("Using legacy login method") @@ -387,10 +354,10 @@ class MangadexAPI(): if refresh_token != data["token"]["refresh"]: _refresh_token_cache.update(username, data["token"]["refresh"]) - return "Bearer " + data["token"]["session"] + return f"Bearer {data['token']['session']}" def _call(self, endpoint, params=None, auth=False): - url = self.root + endpoint + url = f"{self.root}{endpoint}" headers = self.headers_auth if auth else self.headers while True: @@ -470,3 +437,33 @@ class MangadexAPI(): @cache(maxage=90*86400, keyarg=0) def _refresh_token_cache(username): return None + + +@memcache(keyarg=1) +def _manga_info(self, uuid): + manga = self.api.manga(uuid) + + rel = defaultdict(list) + for item in manga["relationships"]: + rel[item["type"]].append(item) + mattr = manga["attributes"] + + return { + "manga" : (mattr["title"].get("en") or + next(iter(mattr["title"].values()))), + "manga_id": manga["id"], + "manga_titles": [t.popitem()[1] + for t in mattr.get("altTitles") or ()], + "manga_date" : text.parse_datetime(mattr.get("createdAt")), + "description" : (mattr["description"].get("en") or + next(iter(mattr["description"].values()))), + "demographic": mattr.get("publicationDemographic"), + "origin": mattr.get("originalLanguage"), + "status": mattr.get("status"), + "year" : mattr.get("year"), + "rating": mattr.get("contentRating"), + "links" : mattr.get("links"), + "tags" : [tag["attributes"]["name"]["en"] for tag in mattr["tags"]], + "artist": [artist["attributes"]["name"] for artist in rel["artist"]], + "author": 
[author["attributes"]["name"] for author in rel["author"]], + } diff --git a/gallery_dl/extractor/mangafire.py b/gallery_dl/extractor/mangafire.py new file mode 100644 index 0000000..5ccb732 --- /dev/null +++ b/gallery_dl/extractor/mangafire.py @@ -0,0 +1,168 @@ +# -*- coding: utf-8 -*- + +# Copyright 2025 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://mangafire.to/""" + +from .common import ChapterExtractor, MangaExtractor +from .. import text, exception +from ..cache import memcache + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?mangafire\.to" + + +class MangafireBase(): + """Base class for mangafire extractors""" + category = "mangafire" + root = "https://mangafire.to" + + +class MangafireChapterExtractor(MangafireBase, ChapterExtractor): + """Extractor for mangafire manga chapters""" + directory_fmt = ( + "{category}", "{manga}", + "{volume:?v/ />02}{chapter:?c//>03}{chapter_minor:?//}{title:?: //}") + filename_fmt = ( + "{manga}{volume:?_v//>02}{chapter:?_c//>03}{chapter_minor:?//}_" + "{page:>03}.{extension}") + archive_fmt = ( + "{manga_id}_{chapter_id}_{page}") + pattern = (rf"{BASE_PATTERN}/read/([\w-]+\.(\w+))/([\w-]+)" + rf"/((chapter|volume)-\d+(?:\D.*)?)") + example = "https://mangafire.to/read/MANGA.ID/LANG/chapter-123" + + def metadata(self, _): + manga_path, manga_id, lang, chapter_info, self.type = self.groups + + try: + chapters = _manga_chapters(self, (manga_id, self.type, lang)) + anchor = chapters[chapter_info] + except KeyError: + raise exception.NotFoundError("chapter") + self.chapter_id = text.extr(anchor, 'data-id="', '"') + + return { + **_manga_info(self, manga_path), + **_chapter_info(anchor), + } + + def images(self, page): + url = f"{self.root}/ajax/read/{self.type}/{self.chapter_id}" + headers = {"x-requested-with": "XMLHttpRequest"} + data = self.request_json(url, headers=headers) + + return [ + (image[0], None) + for image in data["result"]["images"] + ] + + +class MangafireMangaExtractor(MangafireBase, MangaExtractor): + """Extractor for mangafire manga""" + chapterclass = MangafireChapterExtractor + pattern = rf"{BASE_PATTERN}/manga/([\w-]+)\.(\w+)" + example = "https://mangafire.to/manga/MANGA.ID" + + def chapters(self, page): + manga_slug, manga_id = self.groups + lang = self.config("lang") or "en" + + manga = _manga_info(self, f"{manga_slug}.{manga_id}") + chapters = _manga_chapters(self, (manga_id, "chapter", lang)) + + return [ + (f"""{self.root}{text.extr(anchor, 'href="', '"')}""", { + **manga, + **_chapter_info(anchor), + }) + for anchor in chapters.values() + ] + + +@memcache(keyarg=1) +def _manga_info(self, manga_path, page=None): + if page is None: + url = f"{self.root}/manga/{manga_path}" + page = self.request(url).text + slug, _, mid = manga_path.rpartition(".") + + extr = text.extract_from(page) + manga = { + "cover": text.extr(extr( + 'class="poster">', '</div>'), 'src="', '"'), + "status": extr("<p>", "<").replace("_", " ").title(), + "manga" : text.unescape(extr( + 'itemprop="name">', "<")), + "manga_id": mid, + "manga_slug": slug, + "manga_titles": text.unescape(extr( + "<h6>", "<")).split("; "), + "type": text.remove_html(extr( + 'class="min-info">', "</a>")), + "author": text.unescape(text.remove_html(extr( + "<span>Author:</span>", "</div>"))).split(" , "), + "published": text.remove_html(extr( + "<span>Published:</span>", "</div>")), + "tags": 
+            "<span>Genres:</span>", "</div>"))[::2],
+        "publisher": text.unescape(text.remove_html(extr(
+            "<span>Mangazines:</span>", "</div>"))).split(" , "),
+        "score": text.parse_float(text.remove_html(extr(
+            'class="score">', " / "))),
+        "description": text.remove_html(extr(
+            'id="synopsis">', "<script>")),
+    }
+
+    if len(lst := manga["author"]) == 1 and not lst[0]:
+        manga["author"] = ()
+    if len(lst := manga["publisher"]) == 1 and not lst[0]:
+        manga["publisher"] = ()
+
+    return manga
+
+
+@memcache(keyarg=1)
+def _manga_chapters(self, manga_info):
+    manga_id, type, lang = manga_info
+    url = f"{self.root}/ajax/read/{manga_id}/{type}/{lang}"
+    headers = {"x-requested-with": "XMLHttpRequest"}
+    data = self.request_json(url, headers=headers)
+
+    needle = f"{manga_id}/{lang}/"
+    return {
+        text.extr(anchor, needle, '"'): anchor
+        for anchor in text.extract_iter(data["result"]["html"], "<a ", ">")
+    }
+
+
+@memcache(keyarg=0)
+def _chapter_info(info):
+    _, lang, chapter_info = text.extr(info, 'href="', '"').rsplit("/", 2)
+
+    if chapter_info.startswith("vol"):
+        volume = text.extr(info, 'data-number="', '"')
+        volume_id = text.parse_int(text.extr(info, 'data-id="', '"'))
+        return {
+            "volume"        : text.parse_int(volume),
+            "volume_id"     : volume_id,
+            "chapter"       : 0,
+            "chapter_minor" : "",
+            "chapter_string": chapter_info,
+            "chapter_id"    : volume_id,
+            "title"         : text.unescape(text.extr(info, 'title="', '"')),
+            "lang"          : lang,
+        }
+
+    chapter, sep, minor = text.extr(info, 'data-number="', '"').partition(".")
+    return {
+        "chapter"       : text.parse_int(chapter),
+        "chapter_minor" : f"{sep}{minor}",
+        "chapter_string": chapter_info,
+        "chapter_id"    : text.parse_int(text.extr(info, 'data-id="', '"')),
+        "title"         : text.unescape(text.extr(info, 'title="', '"')),
+        "lang"          : lang,
+    }
diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py
new file mode 100644
index 0000000..eb53998
--- /dev/null
+++ b/gallery_dl/extractor/mangareader.py
@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://mangareader.to/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text, util
+from ..cache import memcache
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?mangareader\.to"
+
+
+class MangareaderBase():
+    """Base class for mangareader extractors"""
+    category = "mangareader"
+    root = "https://mangareader.to"
+
+
+class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor):
+    """Extractor for mangareader manga chapters"""
+    directory_fmt = (
+        "{category}", "{manga}",
+        "{volume:?v/ />02}{chapter:?c//>03}{chapter_minor:?//}{title:?: //}")
+    filename_fmt = (
+        "{manga}{volume:?_v//>02}{chapter:?_c//>03}{chapter_minor:?//}_"
+        "{page:>03}.{extension}")
+    archive_fmt = (
+        "{manga_id}_{chapter_id}_{page}")
+    pattern = (rf"{BASE_PATTERN}/read/([\w-]+-\d+)/([^/?#]+)"
+               rf"/(chapter|volume)-(\d+[^/?#]*)")
+    example = "https://mangareader.to/read/MANGA-123/LANG/chapter-123"
+
+    def metadata(self, _):
+        path, lang, type, chstr = self.groups
+
+        settings = util.json_dumps({
+            "readingMode"     : "vertical",
+            "readingDirection": "rtl",
+            "quality"         : "high",
+        })
+        self.cookies.set("mr_settings", settings, domain="mangareader.to")
+
+        url = f"{self.root}/read/{path}/{lang}/{type}-{chstr}"
+        page = self.request(url).text
+        self.cid = cid = text.extr(page, 'data-reading-id="', '"')
+
+        manga = _manga_info(self, path)
+        return {
+            **manga,
+            **manga[f"_{type}s"][lang][chstr],
+            "chapter_id": text.parse_int(cid),
+        }
+
+    def images(self, page):
+        key = "chap" if self.groups[2] == "chapter" else "vol"
+        url = f"{self.root}/ajax/image/list/{key}/{self.cid}"
+        params = {
+            "mode"       : "vertical,",
+            "quality"    : "high,",
+            "hozPageSize": "1,",
+        }
+        headers = {
+            "X-Requested-With": "XMLHttpRequest",
+            "Sec-Fetch-Dest"  : "empty",
+            "Sec-Fetch-Mode"  : "cors",
+            "Sec-Fetch-Site"  : "same-origin",
+        }
+        html = self.request_json(url, params=params, headers=headers)["html"]
+
+        return [
+            (url, None)
+            for url in text.extract_iter(html, 'data-url="', '"')
+        ]
+
+
+class MangareaderMangaExtractor(MangareaderBase, MangaExtractor):
+    """Extractor for mangareader manga"""
+    chapterclass = MangareaderChapterExtractor
+    pattern = rf"{BASE_PATTERN}/([\w-]+-\d+)"
+    example = "https://mangareader.to/MANGA-123"
+
+    def chapters(self, page):
+        manga = _manga_info(self, self.groups[0])
+        lang = self.config("lang") or "en"
+
+        return [
+            (info["chapter_url"], {**manga, **info})
+            for info in manga["_chapters"][lang].values()
+        ]
+
+
+@memcache(keyarg=1)
+def _manga_info(self, manga_path):
+    url = f"{self.root}/{manga_path}"
+    html = self.request(url).text
+
+    slug, _, mid = manga_path.rpartition("-")
+    extr = text.extract_from(html)
+    url = extr('property="og:url" content="', '"')
+    manga = {
+        "manga_url": url,
+        "manga_slug": url.rpartition("/")[2].rpartition("-")[0],
+        "manga_id": text.parse_int(mid),
+        "manga": text.unescape(extr('class="manga-name">', "<")),
+        "manga_alt": text.unescape(extr('class="manga-name-or">', "<")),
+        "tags": text.split_html(extr('class="genres">', "</div>")),
+        "type": text.remove_html(extr('>Type:', "</div>")),
+        "status": text.remove_html(extr('>Status:', "</div>")),
+        "author": text.split_html(extr('>Authors:', "</div>"))[0::2],
+        "published": text.remove_html(extr('>Published:', "</div>")),
+        "score": text.parse_float(text.remove_html(extr(
+            '>Score:', "</div>"))),
+        "views": text.parse_int(text.remove_html(extr(
+            '>Views:', "</div>")).replace(",", "")),
+    }
+
+    base = self.root
+
+    # extract all chapters
+    html = extr('class="chapters-list-ul">', " </div>")
+    manga["_chapters"] = chapters = {}
+    for group in text.extract_iter(html, "<ul", "</ul>"):
+        lang = text.extr(group, ' id="', '-chapters"')
+
+        chapters[lang] = current = {}
+        lang = lang.partition("-")[0]
+        for ch in text.extract_iter(group, "<li ", "</li>"):
+            path = text.extr(ch, 'href="', '"')
+            chap = text.extr(ch, 'data-number="', '"')
+            name = text.unescape(text.extr(ch, 'class="name">', "<"))
+
+            chapter, sep, minor = chap.partition(".")
+            current[chap] = {
+                "title"         : name.partition(":")[2].strip(),
+                "chapter"       : text.parse_int(chapter),
+                "chapter_minor" : f"{sep}{minor}",
+                "chapter_string": chap,
+                "chapter_url"   : f"{base}{path}",
+                "lang"          : lang,
+            }
+
+    # extract all volumes
+    html = extr('class="volume-list-ul">', "</section>")
+    manga["_volumes"] = volumes = {}
+    for group in html.split('<div class="manga_list-wrap')[1:]:
+        lang = text.extr(group, ' id="', '-volumes"')
+
+        volumes[lang] = current = {}
+        lang = lang.partition("-")[0]
+        for vol in text.extract_iter(group, 'class="item">', "</div>"):
+            path = text.extr(vol, 'href="', '"')
+            voln = text.extr(vol, 'tick-vol">', '<').rpartition(" ")[2]
+
+            current[voln] = {
+                "volume"        : text.parse_int(voln),
+                "volume_cover"  : text.extr(vol, ' src="', '"'),
+                "chapter"       : 0,
+                "chapter_minor" : "",
+                "chapter_string": voln,
+                "chapter_url"   : f"{base}{path}",
+                "lang"          : lang,
+            }
+
+    # extract remaining metadata
+    manga["description"] = text.unescape(extr(
+        'class="description-modal">', "</div>")).strip()
+
+    return manga
diff --git a/gallery_dl/extractor/misskey.py b/gallery_dl/extractor/misskey.py
index 5ff601a..42eaeef 100644
--- a/gallery_dl/extractor/misskey.py
+++ b/gallery_dl/extractor/misskey.py
@@ -25,8 +25,8 @@ class MisskeyExtractor(BaseExtractor):
     def _init(self):
         self.api = MisskeyAPI(self)
         self.instance = self.root.rpartition("://")[2]
-        self.renotes = self.config("renotes", False)
-        self.replies = self.config("replies", True)
+        self.renotes = True if self.config("renotes", False) else False
+        self.replies = True if self.config("replies", True) else False

     def items(self):
         for note in self.notes():
@@ -254,6 +254,8 @@ class MisskeyAPI():

     def _pagination(self, endpoint, data):
         data["limit"] = 100
+        data["withRenotes"] = self.extractor.renotes
+
         while True:
             notes = self._call(endpoint, data)
             if not notes:
diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py
index 21c361c..528aff2 100644
--- a/gallery_dl/extractor/nozomi.py
+++ b/gallery_dl/extractor/nozomi.py
@@ -173,7 +173,7 @@ class NozomiSearchExtractor(NozomiExtractor):

         for tag in self.tags:
             (negative if tag[0] == "-" else positive).append(
-                tag.replace("/", ""))
+                text.quote(tag.replace("/", "")))

         for tag in positive:
             ids = nozomi("nozomi/" + tag)
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index 5245f31..490243a 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -9,7 +9,7 @@
 """Extractors for https://rule34.paheal.net/"""

 from .common import Extractor, Message
-from .. import text
+from .. import text, exception


 class PahealExtractor(Extractor):
@@ -97,7 +97,12 @@ class PahealTagExtractor(PahealExtractor):
         base = f"{self.root}/post/list/{self.groups[0]}/"

         while True:
-            page = self.request(base + str(pnum)).text
+            try:
+                page = self.request(f"{base}{pnum}").text
+            except exception.HttpError as exc:
+                if exc.status == 404:
+                    return
+                raise

             pos = page.find("id='image-list'")
             for post in text.extract_iter(
@@ -146,4 +151,9 @@ class PahealPostExtractor(PahealExtractor):
     example = "https://rule34.paheal.net/post/view/12345"

     def get_posts(self):
-        return (self._extract_post(self.groups[0]),)
+        try:
+            return (self._extract_post(self.groups[0]),)
+        except exception.HttpError as exc:
+            if exc.status == 404:
+                return ()
+            raise
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index fb2f32c..cf1a6d6 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -230,6 +230,16 @@ class PatreonExtractor(Extractor):
                 attr["created"], "%Y-%m-%dT%H:%M:%S.%f%z")
         return attr

+    def _collection(self, collection_id):
+        url = f"{self.root}/api/collection/{collection_id}"
+        data = self.request_json(url)
+        coll = data["data"]
+        attr = coll["attributes"]
+        attr["id"] = coll["id"]
+        attr["date"] = text.parse_datetime(
+            attr["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+        return attr
+
     def _filename(self, url):
         """Fetch filename from an URL's Content-Disposition header"""
         response = self.request(url, method="HEAD", fatal=False)
@@ -333,6 +343,33 @@ class PatreonExtractor(Extractor):
         raise exception.AbortExtraction("Unable to extract bootstrap data")


+class PatreonCollectionExtractor(PatreonExtractor):
+    """Extractor for a patreon collection"""
+    subcategory = "collection"
+    directory_fmt = ("{category}", "{creator[full_name]}",
+                     "Collections", "{collection[title]} ({collection[id]})")
+    pattern = r"(?:https?://)?(?:www\.)?patreon\.com/collection/(\d+)"
+    example = "https://www.patreon.com/collection/12345"
+
+    def posts(self):
+        collection_id = self.groups[0]
+        self.kwdict["collection"] = collection = \
+            self._collection(collection_id)
+        campaign_id = text.extr(
+            collection["thumbnail"]["url"], "/campaign/", "/")
+
+        url = self._build_url("posts", (
+            # patreon returns '400 Bad Request' without campaign_id filter
+            f"&filter[campaign_id]={campaign_id}"
+            "&filter[contains_exclusive_posts]=true"
+            "&filter[is_draft]=false"
+            f"&filter[collection_id]={collection_id}"
+            "&filter[include_drops]=true"
+            "&sort=collection_order"
+        ))
+        return self._pagination(url)
+
+
 class PatreonCreatorExtractor(PatreonExtractor):
     """Extractor for a creator's works"""
     subcategory = "creator"
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index a72042c..6276a2a 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -1232,7 +1232,7 @@ class PixivAppAPI():
         params = {"word": word, "search_target": target,
                   "sort": sort, "duration": duration,
                   "start_date": date_start, "end_date": date_end}
-        return self._pagination("/v1/search/illust", params)
+        return self._pagination_search("/v1/search/illust", params)

     def user_bookmarks_illust(self, user_id, tag=None, restrict="public"):
         """Return illusts bookmarked by a user"""
@@ -1322,6 +1322,48 @@ class PixivAppAPI():
             params = text.parse_query(query)
             data = self._call(endpoint, params)

+    def _pagination_search(self, endpoint, params):
+        sort = params["sort"]
+        if sort == "date_desc":
+            date_key = "end_date"
+            date_off = timedelta(days=1)
+            date_cmp = lambda lhs, rhs: lhs >= rhs  # noqa E731
+        elif sort == "date_asc":
+            date_key = "start_date"
+            date_off = timedelta(days=-1)
+            date_cmp = lambda lhs, rhs: lhs <= rhs  # noqa E731
+        else:
+            date_key = None
+        date_last = None
+
+        while True:
+            data = self._call(endpoint, params)
+
+            if date_last is None:
+                yield from data["illusts"]
+            else:
+                works = data["illusts"]
+                if date_cmp(date_last, works[-1]["create_date"]):
+                    for work in works:
+                        if date_last is None:
+                            yield work
+                        elif date_cmp(date_last, work["create_date"]):
+                            date_last = None
+
+            if not (next_url := data.get("next_url")):
+                return
+            query = next_url.rpartition("?")[2]
+            params = text.parse_query(query)
+
+            if date_key and text.parse_int(params.get("offset")) >= 5000:
+                date_last = data["illusts"][-1]["create_date"]
+                date_val = (text.parse_datetime(
+                    date_last) + date_off).strftime("%Y-%m-%d")
+                self.log.info("Reached 'offset' >= 5000; "
+                              "Updating '%s' to '%s'", date_key, date_val)
+                params[date_key] = date_val
+                params.pop("offset", None)
+

 @cache(maxage=36500*86400, keyarg=0)
 def _refresh_token_cache(username):
diff --git a/gallery_dl/extractor/s3ndpics.py b/gallery_dl/extractor/s3ndpics.py
new file mode 100644
index 0000000..215f160
--- /dev/null
+++ b/gallery_dl/extractor/s3ndpics.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://s3nd.pics/"""
+
+from .common import Extractor, Message
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?s3nd\.pics"
+
+
+class S3ndpicsExtractor(Extractor):
+    """Base class for s3ndpics extractors"""
+    category = "s3ndpics"
+    root = "https://s3nd.pics"
+    root_api = f"{root}/api"
+    directory_fmt = ("{category}", "{user[username]}",
+                     "{date} {title:?/ /}({id})")
+    filename_fmt = "{num:>02}.{extension}"
+    archive_fmt = "{id}_{num}"
+
+    def items(self):
+        base = "https://s3.s3nd.pics/s3nd-pics/"
+
+        for post in self.posts():
+            post["id"] = post.pop("_id", None)
+            post["user"] = post.pop("userId", None)
+            post["date"] = text.parse_datetime(
+                post["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
+            post["date_updated"] = text.parse_datetime(
+                post["updatedAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
+
+            files = post.pop("files", ())
+            post["count"] = len(files)
+
+            yield Message.Directory, post
+            for post["num"], file in enumerate(files, 1):
+                post["type"] = file["type"]
+                path = file["url"]
+                text.nameext_from_url(path, post)
+                yield Message.Url, f"{base}{path}", post
+
+    def _pagination(self, url, params):
+        params["page"] = 1
+
+        while True:
+            data = self.request_json(url, params=params)
+
+            self.kwdict["total"] = data["pagination"]["total"]
+            yield from data["posts"]
+
+            if params["page"] >= data["pagination"]["pages"]:
+                return
+            params["page"] += 1
+
+
+class S3ndpicsPostExtractor(S3ndpicsExtractor):
+    subcategory = "post"
+    pattern = rf"{BASE_PATTERN}/post/([0-9a-f]+)"
+    example = "https://s3nd.pics/post/0123456789abcdef01234567"
+
+    def posts(self):
+        url = f"{self.root_api}/posts/{self.groups[0]}"
+        return (self.request_json(url)["post"],)
+
+
+class S3ndpicsUserExtractor(S3ndpicsExtractor):
+    subcategory = "user"
+    pattern = rf"{BASE_PATTERN}/user/(\w+)"
+    example = "https://s3nd.pics/user/USER"
+
+    def posts(self):
+        url = f"{self.root_api}/users/username/{self.groups[0]}"
+        self.kwdict["user"] = user = self.request_json(url)["user"]
+
+        url = f"{self.root_api}/posts"
+        params = {
"userId": user["_id"], + "limit" : "12", + "sortBy": "newest", + } + return self._pagination(url, params) + + +class S3ndpicsSearchExtractor(S3ndpicsExtractor): + subcategory = "search" + pattern = rf"{BASE_PATTERN}/search/?\?([^#]+)" + example = "https://s3nd.pics/search?QUERY" + + def posts(self): + url = f"{self.root_api}/posts" + params = text.parse_query(self.groups[0]) + params.setdefault("limit", "20") + self.kwdict["search_tags"] = \ + params.get("tag") or params.get("tags") or params.get("q") + return self._pagination(url, params) diff --git a/gallery_dl/extractor/schalenetwork.py b/gallery_dl/extractor/schalenetwork.py index dc42417..a4ef3b0 100644 --- a/gallery_dl/extractor/schalenetwork.py +++ b/gallery_dl/extractor/schalenetwork.py @@ -62,10 +62,11 @@ class SchalenetworkExtractor(Extractor): pass params["page"] += 1 - def _token(self): + def _token(self, required=True): if token := self.config("token"): return f"Bearer {token.rpartition(' ')[2]}" - raise exception.AuthRequired("'token'", "your favorites") + if required: + raise exception.AuthRequired("'token'", "your favorites") def _crt(self): crt = self.config("crt") @@ -88,7 +89,7 @@ class SchalenetworkExtractor(Extractor): else: msg = f"{exc.status} {exc.response.reason}" raise exception.AuthRequired( - "'crt' query parameter & matching '--user-agent'", None, msg) + "'crt' query parameter & matching 'user-agent'", None, msg) class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor): @@ -114,19 +115,26 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor): 10: "mixed", 11: "language", 12: "other", + 13: "reclass", } def metadata(self, _): _, gid, gkey = self.groups + url = f"{self.root_api}/books/detail/{gid}/{gkey}" - data = self.request_json(url, headers=self.headers) - data["date"] = text.parse_timestamp(data["created_at"] // 1000) + headers = self.headers + data = self.request_json(url, headers=headers) + + try: + data["date"] = text.parse_timestamp(data["created_at"] // 1000) + data["count"] = len(data["thumbnails"]["entries"]) + del data["thumbnails"] + except Exception: + pass tags = [] types = self.TAG_TYPES - tags_data = data["tags"] - - for tag in tags_data: + for tag in data["tags"]: name = tag["name"] namespace = tag.get("namespace", 0) tags.append(types[namespace] + ":" + name) @@ -134,33 +142,34 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor): if self.config("tags", False): tags = collections.defaultdict(list) - for tag in tags_data : + for tag in data["tags"]: tags[tag.get("namespace", 0)].append(tag["name"]) for type, values in tags.items(): data["tags_" + types[type]] = values + url = f"{self.root_api}/books/detail/{gid}/{gkey}?crt={self._crt()}" + if token := self._token(False): + headers = headers.copy() + headers["Authorization"] = token try: - data["count"] = len(data["thumbnails"]["entries"]) - del data["thumbnails"] - except Exception: - pass + data_fmt = self.request_json( + url, method="POST", headers=headers) + except exception.HttpError as exc: + self._require_auth(exc) + + self.fmt = self._select_format(data_fmt["data"]) + data["source"] = data_fmt.get("source") return data def images(self, _): - crt = self._crt() _, gid, gkey = self.groups - url = f"{self.root_api}/books/detail/{gid}/{gkey}?crt={crt}" - try: - data = self.request_json(url, method="POST", headers=self.headers) - except exception.HttpError as exc: - self._require_auth(exc) - - fmt = self._select_format(data["data"]) + fmt = self.fmt url = 
(f"{self.root_api}/books/data/{gid}/{gkey}" - f"/{fmt['id']}/{fmt['key']}/{fmt['w']}?crt={crt}") - data = self.request_json(url, headers=self.headers) + f"/{fmt['id']}/{fmt['key']}/{fmt['w']}?crt={self._crt()}") + headers = self.headers + data = self.request_json(url, headers=headers) base = data["base"] results = [] @@ -169,7 +178,7 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor): info = { "width" : dimensions[0], "height": dimensions[1], - "_http_headers": self.headers, + "_http_headers": headers, } results.append((base + entry["path"], info)) return results diff --git a/gallery_dl/extractor/simpcity.py b/gallery_dl/extractor/simpcity.py index 3354289..d8227fa 100644 --- a/gallery_dl/extractor/simpcity.py +++ b/gallery_dl/extractor/simpcity.py @@ -92,7 +92,7 @@ class SimpcityExtractor(Extractor): author = schema["author"] stats = schema["interactionStatistic"] url_t = schema["url"] - url_a = author["url"] + url_a = author.get("url") or "" thread = { "id" : url_t[url_t.rfind(".")+1:-1], @@ -104,8 +104,9 @@ class SimpcityExtractor(Extractor): "tags" : (schema["keywords"].split(", ") if "keywords" in schema else ()), "section" : schema["articleSection"], - "author" : author["name"], - "author_id" : url_a[url_a.rfind(".")+1:-1], + "author" : author.get("name") or "", + "author_id" : (url_a[url_a.rfind(".")+1:-1] if url_a else + (author.get("name") or "")[15:]), "author_url": url_a, } diff --git a/gallery_dl/extractor/thehentaiworld.py b/gallery_dl/extractor/thehentaiworld.py index 055d7d8..9a30654 100644 --- a/gallery_dl/extractor/thehentaiworld.py +++ b/gallery_dl/extractor/thehentaiworld.py @@ -60,14 +60,16 @@ class ThehentaiworldExtractor(Extractor): "<li>Posted: ", "<"), "%Y-%m-%d"), } - if "/videos/" in url: + if (c := url[27]) == "v": post["type"] = "video" post["width"] = post["height"] = 0 post["votes"] = text.parse_int(extr("(<strong>", "</strong>")) post["score"] = text.parse_float(extr("<strong>", "<")) post["file_url"] = extr('<source src="', '"') else: - post["type"] = "image" + post["type"] = ("animated" if c == "g" else + "3d cgi" if c == "3" else + "image") post["width"] = text.parse_int(extr("<li>Size: ", " ")) post["height"] = text.parse_int(extr("x ", "<")) post["file_url"] = extr('a href="', '"') @@ -109,16 +111,6 @@ class ThehentaiworldExtractor(Extractor): pnum += 1 -class ThehentaiworldPostExtractor(ThehentaiworldExtractor): - subcategory = "post" - pattern = (rf"{BASE_PATTERN}" - rf"(/(?:(?:3d-cgi-)?hentai-image|video)s/([^/?#]+))") - example = "https://thehentaiworld.com/hentai-images/SLUG/" - - def posts(self): - return (f"{self.root}{self.groups[0]}/",) - - class ThehentaiworldTagExtractor(ThehentaiworldExtractor): subcategory = "tag" per_page = 24 @@ -137,3 +129,13 @@ class ThehentaiworldTagExtractor(ThehentaiworldExtractor): self.page_start += pages self.post_start += posts return num + + +class ThehentaiworldPostExtractor(ThehentaiworldExtractor): + subcategory = "post" + pattern = (rf"{BASE_PATTERN}(" + rf"/(?:video|(?:[\w-]+-)?hentai-image)s/([^/?#]+))") + example = "https://thehentaiworld.com/hentai-images/SLUG/" + + def posts(self): + return (f"{self.root}{self.groups[0]}/",) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index e6c84d1..e7df4a3 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -1026,11 +1026,12 @@ class TwitterTweetExtractor(TwitterExtractor): return while True: + parent_id = tweet["rest_id"] tweet_id = 
tweet["legacy"].get("quoted_status_id_str") if not tweet_id: break tweet = self.api.tweet_result_by_rest_id(tweet_id) - tweet["legacy"]["quoted_by_id_str"] = tweet_id + tweet["legacy"]["quoted_by_id_str"] = parent_id yield tweet def _tweets_detail(self, tweet_id): diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 823e8e0..07bed79 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -86,16 +86,25 @@ class WeiboExtractor(Extractor): status["count"] = len(files) yield Message.Directory, status - for num, file in enumerate(files, 1): - if file["url"].startswith("http:"): - file["url"] = "https:" + file["url"][5:] + num = 0 + for file in files: + url = file["url"] + if not url: + continue + if url.startswith("http:"): + url = f"https:{url[5:]}" if "filename" not in file: - text.nameext_from_url(file["url"], file) + text.nameext_from_url(url, file) if file["extension"] == "json": file["extension"] = "mp4" + if file["extension"] == "m3u8": + url = f"ytdl:{url}" + file["_ytdl_manifest"] = "hls" + file["extension"] = "mp4" + num += 1 file["status"] = status file["num"] = num - yield Message.Url, file["url"], file + yield Message.Url, url, file def _extract_status(self, status, files): if "mix_media_info" in status: @@ -143,10 +152,21 @@ class WeiboExtractor(Extractor): media = max(info["playback_list"], key=lambda m: m["meta"]["quality_index"]) except Exception: - return {"url": (info.get("stream_url_hd") or - info.get("stream_url") or "")} + video = {"url": (info.get("replay_hd") or + info.get("stream_url_hd") or + info.get("stream_url") or "")} else: - return media["play_info"].copy() + video = media["play_info"].copy() + + if "//wblive-out." in video["url"] and \ + not text.ext_from_url(video["url"]): + try: + video["url"] = self.request_location(video["url"]) + except exception.HttpError as exc: + self.log.warning("%s: %s", exc.__class__.__name__, exc) + video["url"] = "" + + return video def _status_by_id(self, status_id): url = f"{self.root}/ajax/statuses/show?id={status_id}" diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index 00266bd..5ba47d2 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -46,6 +46,12 @@ class WikimediaExtractor(BaseExtractor): else: self.api_url = None + # note: image revisions are different from page revisions + # ref: + # https://www.mediawiki.org/wiki/API:Revisions + # https://www.mediawiki.org/wiki/API:Imageinfo + self.image_revisions = self.config("image-revisions", 1) + @cache(maxage=36500*86400, keyarg=1) def _search_api_path(self, root): self.log.debug("Probing possible API endpoints") @@ -56,7 +62,10 @@ class WikimediaExtractor(BaseExtractor): return url raise exception.AbortExtraction("Unable to find API endpoint") - def prepare(self, image): + def prepare_info(self, info): + """Adjust the content of an image info object""" + + def prepare_image(self, image): """Adjust the content of an image object""" image["metadata"] = { m["name"]: m["value"] @@ -74,14 +83,19 @@ class WikimediaExtractor(BaseExtractor): def items(self): for info in self._pagination(self.params): try: - image = info["imageinfo"][0] - except LookupError: + images = info.pop("imageinfo") + except KeyError: self.log.debug("Missing 'imageinfo' for %s", info) - continue + images = () + + info["count"] = len(images) + self.prepare_info(info) + yield Message.Directory, info - self.prepare(image) - yield Message.Directory, image - yield Message.Url, image["url"], 
+            for info["num"], image in enumerate(images, 1):
+                self.prepare_image(image)
+                image.update(info)
+                yield Message.Url, image["url"], image

         if self.subcategories:
             base = self.root + "/wiki/"
@@ -108,6 +122,7 @@ class WikimediaExtractor(BaseExtractor):
             "timestamp|user|userid|comment|canonicaltitle|url|size|"
             "sha1|mime|metadata|commonmetadata|extmetadata|bitdepth"
         )
+        params["iilimit"] = self.image_revisions

         while True:
             data = self.request_json(url, params=params)
@@ -237,9 +252,8 @@ class WikimediaArticleExtractor(WikimediaExtractor):
             "titles" : path,
         }

-    def prepare(self, image):
-        WikimediaExtractor.prepare(self, image)
-        image["page"] = self.title
+    def prepare_info(self, info):
+        info["page"] = self.title


 class WikimediaWikiExtractor(WikimediaExtractor):
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index e1b4897..98c9331 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -26,6 +26,7 @@ class ZerochanExtractor(BooruExtractor):
     per_page = 250
     cookies_domain = ".zerochan.net"
     cookies_names = ("z_id", "z_hash")
+    useragent = util.USERAGENT
     request_interval = (0.5, 1.5)

     def login(self):
@@ -192,7 +193,13 @@ class ZerochanTagExtractor(ZerochanExtractor):
         metadata = self.config("metadata")

         while True:
-            page = self.request(url, params=params, expected=(500,)).text
+            try:
+                page = self.request(
+                    url, params=params, expected=(500,)).text
+            except exception.HttpError as exc:
+                if exc.status == 404:
+                    return
+                raise

             thumbs = text.extr(page, '<ul id="thumbs', '</ul>')
             extr = text.extract_from(thumbs)
@@ -231,7 +238,13 @@ class ZerochanTagExtractor(ZerochanExtractor):
         }

         while True:
-            response = self.request(url, params=params, allow_redirects=False)
+            try:
+                response = self.request(
+                    url, params=params, allow_redirects=False)
+            except exception.HttpError as exc:
+                if exc.status == 404:
+                    return
+                raise

             if response.status_code >= 300:
                 url = text.urljoin(self.root, response.headers["location"])
@@ -275,12 +288,18 @@ class ZerochanImageExtractor(ZerochanExtractor):
     pattern = BASE_PATTERN + r"/(\d+)"
     example = "https://www.zerochan.net/12345"

-    def __init__(self, match):
-        ZerochanExtractor.__init__(self, match)
-        self.image_id = match[1]
-
     def posts(self):
-        post = self._parse_entry_html(self.image_id)
+        image_id = self.groups[0]
+
+        try:
+            post = self._parse_entry_html(image_id)
+        except exception.HttpError as exc:
+            if exc.status in (404, 410):
+                if msg := text.extr(exc.response.text, "<h2>", "<"):
+                    self.log.warning(f"'{msg}'")
+                return ()
+            raise
+
         if self.config("metadata"):
-            post.update(self._parse_entry_api(self.image_id))
+            post.update(self._parse_entry_api(image_id))
+
         return (post,)
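
A note on the caching pattern used above: mangadex.py, mangafire.py, and mangareader.py all move shared per-manga metadata into module-level _manga_info helpers decorated with @memcache(keyarg=1), so every chapter of one manga reuses a single cached lookup instead of refetching it. A minimal stand-in for that keyed memoization (the real decorator lives in gallery_dl.cache and has more features; fetch_manga below is a hypothetical fetcher, not gallery-dl code):

import functools

def memcache(keyarg):
    # memoize a function on the positional argument at index `keyarg`
    def decorator(func):
        results = {}
        @functools.wraps(func)
        def wrapper(*args):
            key = args[keyarg]
            if key not in results:
                results[key] = func(*args)
            return results[key]
        return wrapper
    return decorator

@memcache(keyarg=1)
def fetch_manga(session, uuid):
    # runs once per uuid; later calls with the same uuid hit the cache
    print("fetching", uuid)
    return {"manga_id": uuid}

fetch_manga(None, "abc")  # performs the "request"
fetch_manga(None, "abc")  # served from cache, no second fetch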
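Similarly, the _pagination helper in the new s3ndpics.py walks a page-numbered JSON API whose responses pair a posts list with a pagination object. A self-contained sketch of that loop, assuming only the response shape visible in the diff ({"posts": [...], "pagination": {"total": N, "pages": M}}) and using plain requests instead of gallery-dl's request_json:

import requests

def paginate(url, params):
    # yield every post across all pages of the API response
    params = dict(params, page=1)
    while True:
        data = requests.get(url, params=params, timeout=30).json()
        yield from data["posts"]
        if params["page"] >= data["pagination"]["pages"]:
            return
        params["page"] += 1

for post in paginate("https://s3nd.pics/api/posts",
                     {"limit": "12", "sortBy": "newest"}):
    print(post.get("_id"))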
