|           |                                                      |
|-----------|------------------------------------------------------|
| author    | 2025-09-23 07:44:37 -0400                            |
| committer | 2025-09-23 07:44:37 -0400                            |
| commit    | 42b62671fabfdcf983a9575221420d85f7fbcac1 (patch)     |
| tree      | fa6b2af249a7216aae5c70a926c6d08be1ac55a6 /gallery_dl |
| parent    | 3b7f8716690b7aa1994a9cb387bbc7215e01a4ed (diff)      |
New upstream version 1.30.8 (tag: upstream/1.30.8)
Diffstat (limited to 'gallery_dl')
30 files changed, 933 insertions, 236 deletions
diff --git a/gallery_dl/extractor/2ch.py b/gallery_dl/extractor/2ch.py
index f5bb7b7..912a251 100644
--- a/gallery_dl/extractor/2ch.py
+++ b/gallery_dl/extractor/2ch.py
@@ -4,37 +4,41 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-"""Extractors for https://2ch.hk/"""
+"""Extractors for https://2ch.su/"""

 from .common import Extractor, Message
 from .. import text, util

+BASE_PATTERN = r"(?:https?://)?2ch\.(su|life|hk)"
+

 class _2chThreadExtractor(Extractor):
     """Extractor for 2ch threads"""
     category = "2ch"
     subcategory = "thread"
-    root = "https://2ch.hk"
+    root = "https://2ch.su"
     directory_fmt = ("{category}", "{board}", "{thread} {title}")
     filename_fmt = "{tim}{filename:? //}.{extension}"
     archive_fmt = "{board}_{thread}_{tim}"
-    pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/res/(\d+)"
-    example = "https://2ch.hk/a/res/12345.html"
+    pattern = rf"{BASE_PATTERN}/([^/?#]+)/res/(\d+)"
+    example = "https://2ch.su/a/res/12345.html"

     def __init__(self, match):
+        tld = match[1]
+        self.root = f"https://2ch.{'su' if tld == 'hk' else tld}"
         Extractor.__init__(self, match)
-        self.board, self.thread = match.groups()

     def items(self):
-        url = f"{self.root}/{self.board}/res/{self.thread}.json"
+        _, board, thread = self.groups
+        url = f"{self.root}/{board}/res/{thread}.json"
         posts = self.request_json(url)["threads"][0]["posts"]

         op = posts[0]
         title = op.get("subject") or text.remove_html(op["comment"])

         thread = {
-            "board" : self.board,
-            "thread": self.thread,
+            "board" : board,
+            "thread": thread,
             "title" : text.unescape(title)[:50],
         }

@@ -61,16 +65,17 @@ class _2chBoardExtractor(Extractor):
     """Extractor for 2ch boards"""
     category = "2ch"
     subcategory = "board"
-    root = "https://2ch.hk"
-    pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/?$"
-    example = "https://2ch.hk/a/"
+    root = "https://2ch.su"
+    pattern = rf"{BASE_PATTERN}/([^/?#]+)/?$"
+    example = "https://2ch.su/a/"

     def __init__(self, match):
+        tld = match[1]
+        self.root = f"https://2ch.{'su' if tld == 'hk' else tld}"
         Extractor.__init__(self, match)
-        self.board = match[1]

     def items(self):
-        base = f"{self.root}/{self.board}"
+        base = f"{self.root}/{self.groups[1]}"

         # index page
         url = f"{base}/index.json"
diff --git a/gallery_dl/extractor/4archive.py b/gallery_dl/extractor/4archive.py
index c9be2a4..4c43464 100644
--- a/gallery_dl/extractor/4archive.py
+++ b/gallery_dl/extractor/4archive.py
@@ -62,7 +62,8 @@ class _4archiveThreadExtractor(Extractor):
         data = {
             "name": extr('class="name">', "</span>"),
             "date": text.parse_datetime(
-                extr('class="dateTime postNum" >', "<").strip(),
+                (extr('class="dateTime">', "<") or
+                 extr('class="dateTime postNum" >', "<")).strip(),
                 "%Y-%m-%d %H:%M:%S"),
             "no"  : text.parse_int(extr(">Post No.", "<")),
         }
@@ -70,8 +71,7 @@
         extr('class="fileText"', ">File: <a")
         data.update({
             "url"     : extr('href="', '"'),
-            "filename": extr(
-                'rel="noreferrer noopener"', "</a>").strip()[1:],
+            "filename": extr('alt="Image: ', '"'),
             "size"    : text.parse_bytes(extr(" (", ", ")[:-1]),
             "width"   : text.parse_int(extr("", "x")),
             "height"  : text.parse_int(extr("", "px")),
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index b32fcd1..abdb6cc 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -73,6 +73,7 @@ modules = [
     "girlswithmuscle",
     "gofile",
     "hatenablog",
+    "hdoujin",
     "hentai2read",
     "hentaicosplays",
     "hentaifoundry",
@@ -88,6 +89,7 @@ modules = [
     "imagefap",
     "imgbb",
     "imgbox",
+    "imgpile",
     "imgth",
     "imgur",
     "imhentai",
@@ -118,6 +120,7 @@ modules = [
     "manganelo",
     "mangapark",
     "mangaread",
+    "mangataro",
     "mangoxo",
     "misskey",
     "motherless",
@@ -188,6 +191,7 @@ modules = [
     "tcbscans",
     "telegraph",
     "tenor",
+    "thehentaiworld",
     "tiktok",
     "tmohentai",
     "toyhouse",
diff --git a/gallery_dl/extractor/bellazon.py b/gallery_dl/extractor/bellazon.py
index 5c9b9cd..5dcb6a5 100644
--- a/gallery_dl/extractor/bellazon.py
+++ b/gallery_dl/extractor/bellazon.py
@@ -20,32 +20,61 @@ class BellazonExtractor(Extractor):
     root = "https://www.bellazon.com/main"
     directory_fmt = ("{category}", "{thread[section]}",
                      "{thread[title]} ({thread[id]})")
-    filename_fmt = "{post[id]}_{num:>02}_{id}.{extension}"
-    archive_fmt = "{post[id]}/{filename}"
+    filename_fmt = "{post[id]}_{num:>02}_{id}_{filename}.{extension}"
+    archive_fmt = "{post[id]}/{id}_{filename}"

     def items(self):
-        extract_urls = text.re(r'<a ([^>]*?href="([^"]+)".*?)</a>').findall
-        native = f"{self.root}/"
+        native = (f"{self.root}/", f"{self.root[6:]}/")
+        extract_urls = text.re(
+            r'(?s)<('
+            r'(?:video .*?<source src|a [^>]*?href)="([^"]+).*?</a>'
+            r'|img [^>]*?src="([^"]+)"[^>]*>'
+            r')'
+        ).findall
+
+        if self.config("quoted", False):
+            strip_quoted = None
+        else:
+            strip_quoted = text.re(r"(?s)<blockquote .*?</blockquote>").sub

         for post in self.posts():
-            urls = extract_urls(post["content"])
+            if strip_quoted is None:
+                urls = extract_urls(post["content"])
+            else:
+                urls = extract_urls(strip_quoted("", post["content"]))
+
             data = {"post": post}
             post["count"] = data["count"] = len(urls)
             yield Message.Directory, data
-            for data["num"], (info, url) in enumerate(urls, 1):
-                url = text.unescape(url)
+            data["num"] = 0
+            for info, url, url_img in urls:
+                url = text.unescape(url or url_img)
+
                 if url.startswith(native):
+                    if "/uploads/emoticons/" in url or "/profile/" in url:
+                        continue
+                    data["num"] += 1
+
                     if not (alt := text.extr(info, ' alt="', '"')) or (
                             alt.startswith("post-") and "_thumb." in alt):
                         name = url
                     else:
                         name = text.unescape(alt)
+
                     dc = text.nameext_from_url(name, data.copy())
                     dc["id"] = text.extr(info, 'data-fileid="', '"')
                     if ext := text.extr(info, 'data-fileext="', '"'):
                         dc["extension"] = ext
+                    elif "/core/interface/file/attachment.php" in url:
+                        if not dc["id"]:
+                            dc["id"] = url.rpartition("?id=")[2]
+                        if name := text.extr(info, ">", "<").strip():
+                            text.nameext_from_url(name, dc)
+
+                    if url[0] == "/":
+                        url = f"https:{url}"
                     yield Message.Url, url, dc
+
                 else:
                     yield Message.Queue, url, data
@@ -70,6 +99,28 @@ class BellazonExtractor(Extractor):
             pnum += 1
             url = f"{base}/page/{pnum}/"

+    def _pagination_reverse(self, base, pnum=None):
+        base = f"{self.root}{base}"
+
+        url = f"{base}/page/9999/"  # force redirect to highest page number
+        with self.request(url) as response:
+            parts = response.url.rsplit("/", 3)
+            pnum = text.parse_int(parts[2]) if parts[1] == "page" else 1
+            page = response.text
+
+        while True:
+            yield page
+
+            pnum -= 1
+            if pnum > 1:
+                url = f"{base}/page/{pnum}/"
+            elif pnum == 1:
+                url = f"{base}/"
+            else:
+                return
+
+            page = self.request(url).text
+
     def _parse_thread(self, page):
         schema = self._extract_jsonld(page)
         author = schema["author"]
@@ -88,7 +139,7 @@ class BellazonExtractor(Extractor):
             "posts": stats[1]["userInteractionCount"],
             "date" : text.parse_datetime(schema["datePublished"]),
             "date_updated": text.parse_datetime(schema["dateModified"]),
-            "description" : text.unescape(schema["text"]),
+            "description" : text.unescape(schema["text"]).strip(),
             "section" : path[-2],
             "author"  : author["name"],
             "author_url" : url_a,
@@ -123,7 +174,7 @@ class BellazonExtractor(Extractor):
 class BellazonPostExtractor(BellazonExtractor):
     subcategory = "post"
     pattern = (rf"{BASE_PATTERN}(/topic/\d+-[\w-]+(?:/page/\d+)?)"
-               rf"/?#findComment-(\d+)")
+               rf"/?#(?:findC|c)omment-(\d+)")
     example = "https://www.bellazon.com/main/topic/123-SLUG/#findComment-12345"

     def posts(self):
@@ -145,10 +196,22 @@ class BellazonThreadExtractor(BellazonExtractor):
     example = "https://www.bellazon.com/main/topic/123-SLUG/"

     def posts(self):
-        for page in self._pagination(*self.groups):
+        if (order := self.config("order-posts")) and \
+                order[0] not in ("d", "r"):
+            pages = self._pagination(*self.groups)
+            reverse = False
+        else:
+            pages = self._pagination_reverse(*self.groups)
+            reverse = True
+
+        for page in pages:
             if "thread" not in self.kwdict:
                 self.kwdict["thread"] = self._parse_thread(page)
-            for html in text.extract_iter(page, "<article ", "</article>"):
+            posts = text.extract_iter(page, "<article ", "</article>")
+            if reverse:
+                posts = list(posts)
+                posts.reverse()
+            for html in posts:
                 yield self._parse_post(html)
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index cf5bce1..14ebc48 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -162,7 +162,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
                 file["name"] = util.json_loads(text.extr(
                     item, 'original:', ',\n').replace("\\'", "'"))
                 file["slug"] = util.json_loads(text.extr(
-                    item, 'slug: ', ',\n'))
+                    item, 'slug: ', ',\n').replace("\\'", "'"))
                 file["uuid"] = text.extr(
                     item, 'name: "', ".")
                 file["size"] = text.parse_int(text.extr(
diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py
index 6ba4d08..67fdb39 100644
--- a/gallery_dl/extractor/chevereto.py
+++ b/gallery_dl/extractor/chevereto.py
@@ -50,6 +50,10 @@ BASE_PATTERN = CheveretoExtractor.update({
         "root": "https://imagepond.net",
         "pattern": r"imagepond\.net",
     },
+    "imglike": {
+        "root": "https://imglike.com",
+        "pattern": r"imglike\.com",
+    },
 })


@@ -152,6 +156,18 @@ class CheveretoAlbumExtractor(CheveretoExtractor):
             yield Message.Queue, image, data


+class CheveretoCategoryExtractor(CheveretoExtractor):
+    """Extractor for chevereto galleries"""
+    subcategory = "category"
+    pattern = BASE_PATTERN + r"(/category/[^/?#]+)"
+    example = "https://imglike.com/category/TITLE"
+
+    def items(self):
+        data = {"_extractor": CheveretoImageExtractor}
+        for image in self._pagination(self.root + self.path):
+            yield Message.Queue, image, data
+
+
 class CheveretoUserExtractor(CheveretoExtractor):
     """Extractor for chevereto users"""
     subcategory = "user"
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index f8ad07a..29c7763 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -278,6 +278,23 @@ class DanbooruTagExtractor(DanbooruExtractor):
         return self._pagination("/posts.json", {"tags": self.tags}, prefix)


+class DanbooruRandomExtractor(DanbooruTagExtractor):
+    """Extractor for a random danbooru post"""
+    subcategory = "random"
+    pattern = BASE_PATTERN + r"/posts/random(?:\?(?:[^&#]*&)*tags=([^&#]*))?"
+    example = "https://danbooru.donmai.us/posts/random?tags=TAG"
+
+    def metadata(self):
+        tags = self.groups[-1] or ""
+        self.tags = text.unquote(tags.replace("+", " "))
+        return {"search_tags": self.tags}
+
+    def posts(self):
+        posts = self.request_json(self.root + "/posts/random.json",
+                                  params={"tags": self.tags or None})
+        return (posts,) if isinstance(posts, dict) else posts
+
+
 class DanbooruPoolExtractor(DanbooruExtractor):
     """Extractor for Danbooru pools"""
     subcategory = "pool"
diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py
index bf24941..6061737 100644
--- a/gallery_dl/extractor/facebook.py
+++ b/gallery_dl/extractor/facebook.py
@@ -369,6 +369,16 @@ class FacebookExtractor(Extractor):
                 for edge in (user["profile_tabs"]["profile_user"]
                              ["timeline_nav_app_sections"]["edges"])
             ]
+
+            if bio := text.extr(page, '"best_description":{"text":"', '"'):
+                user["biography"] = self.decode_all(bio)
+            elif (pos := page.find(
+                    '"__module_operation_ProfileCometTileView_profileT')) >= 0:
+                user["biography"] = self.decode_all(text.rextr(
+                    page, '"text":"', '"', pos))
+            else:
+                user["biography"] = text.unescape(text.remove_html(text.extr(
+                    page, "</span></span></h2>", "<ul>")))
         except Exception:
             if user is None:
                 self.log.debug("Failed to extract user data: %s", data)
diff --git a/gallery_dl/extractor/hdoujin.py b/gallery_dl/extractor/hdoujin.py
new file mode 100644
index 0000000..080b899
--- /dev/null
+++ b/gallery_dl/extractor/hdoujin.py
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hdoujin.org/"""
+
+from . import schalenetwork
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?(hdoujin\.(?:org|net))"
+
+
+class HdoujinBase():
+    """Base class for hdoujin extractors"""
+    category = "hdoujin"
+    root = "https://hdoujin.org"
+    root_api = "https://api.hdoujin.org"
+    root_auth = "https://auth.hdoujin.org"
+
+
+class HdoujinGalleryExtractor(
+        HdoujinBase, schalenetwork.SchalenetworkGalleryExtractor):
+    pattern = rf"{BASE_PATTERN}/(?:g|reader)/(\d+)/(\w+)"
+    example = "https://hdoujin.org/g/12345/67890abcdef/"
+
+
+class HdoujinSearchExtractor(
+        HdoujinBase, schalenetwork.SchalenetworkSearchExtractor):
+    pattern = rf"{BASE_PATTERN}/(?:tag/([^/?#]+)|browse)?(?:/?\?([^#]*))?$"
+    example = "https://hdoujin.org/browse?s=QUERY"
+
+
+class HdoujinFavoriteExtractor(
+        HdoujinBase, schalenetwork.SchalenetworkFavoriteExtractor):
+    pattern = rf"{BASE_PATTERN}/favorites(?:\?([^#]*))?"
+    example = "https://hdoujin.org/favorites"
+
+
+HdoujinBase.extr_class = HdoujinGalleryExtractor
diff --git a/gallery_dl/extractor/imgpile.py b/gallery_dl/extractor/imgpile.py
new file mode 100644
index 0000000..9fc3a9c
--- /dev/null
+++ b/gallery_dl/extractor/imgpile.py
@@ -0,0 +1,119 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://imgpile.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?imgpile\.com"
+
+
+class ImgpileExtractor(Extractor):
+    """Base class for imgpile extractors"""
+    category = "imgpile"
+    root = "https://imgpile.com"
+    directory_fmt = ("{category}", "{post[author]}",
+                     "{post[title]} ({post[id_slug]})")
+    archive_fmt = "{post[id_slug]}_{id}"
+
+    def items(self):
+        pass
+
+
+class ImgpilePostExtractor(ImgpileExtractor):
+    subcategory = "post"
+    pattern = rf"{BASE_PATTERN}/p/(\w+)"
+    example = "https://imgpile.com/p/AbCdEfG"
+
+    def items(self):
+        post_id = self.groups[0]
+        url = f"{self.root}/p/{post_id}"
+        page = self.request(url).text
+        extr = text.extract_from(page)
+
+        post = {
+            "id_slug": post_id,
+            "title"  : text.unescape(extr("<title>", " - imgpile<")),
+            "id"     : text.parse_int(extr('data-post-id="', '"')),
+            "author" : extr('/u/', '"'),
+            "score"  : text.parse_int(text.remove_html(extr(
+                'class="post-score">', "</"))),
+            "views"  : text.parse_int(extr(
+                'class="meta-value">', "<").replace(",", "")),
+            "tags"   : text.split_html(extr(
+                " <!-- Tags -->", '<!-- "')),
+        }
+
+        files = self._extract_files(extr)
+        data = {"post": post}
+        data["count"] = post["count"] = len(files)
+
+        yield Message.Directory, data
+        for data["num"], file in enumerate(files, 1):
+            data.update(file)
+            url = file["url"]
+            yield Message.Url, url, text.nameext_from_url(url, data)
+
+    def _extract_files(self, extr):
+        files = []
+
+        while True:
+            media = extr('lass="post-media', '</div>')
+            if not media:
+                break
+            files.append({
+                "id_slug": text.extr(media, 'data-id="', '"'),
+                "id"     : text.parse_int(text.extr(
+                    media, 'data-media-id="', '"')),
+                "url": f"""http{text.extr(media, '<a href="http', '"')}""",
+            })
+        return files
+
+
+class ImgpileUserExtractor(ImgpileExtractor):
+    subcategory = "user"
+    pattern = rf"{BASE_PATTERN}/u/([^/?#]+)"
+    example = "https://imgpile.com/u/USER"
+
+    def items(self):
+        url = f"{self.root}/api/v1/posts"
+        params = {
+            "limit"     : "100",
+            "sort"      : "latest",
+            "period"    : "all",
+            "visibility": "public",
+            #  "moderation_status": "approved",
+            "username"  : self.groups[0],
+        }
+        headers = {
+            "Accept"        : "application/json",
+            #  "Referer"     : "https://imgpile.com/u/USER",
+            "Content-Type"  : "application/json",
+            #  "X-CSRF-TOKEN": "",
+            "Sec-Fetch-Dest": "empty",
+            "Sec-Fetch-Mode": "cors",
+            "Sec-Fetch-Site": "same-origin",
+        }
+
+        base = f"{self.root}/p/"
+        while True:
+            data = self.request_json(url, params=params, headers=headers)
+
+            if params is not None:
+                params = None
+                self.kwdict["total"] = data["meta"]["total"]
+
+            for item in data["data"]:
+                item["_extractor"] = ImgpilePostExtractor
+                url = f"{base}{item['slug']}"
+                yield Message.Queue, url, item
+
+            url = data["links"].get("next")
+            if not url:
+                return
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 9b8f8c9..00e06b5 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -95,7 +95,7 @@ class InstagramExtractor(Extractor):
             if videos:
                 file["_http_headers"] = videos_headers
             text.nameext_from_url(url, file)
-            if videos_dash:
+            if videos_dash and "_ytdl_manifest_data" in post:
                 file["_fallback"] = (url,)
                 file["_ytdl_manifest"] = "dash"
                 url = f"ytdl:{post['post_url']}{file['num']}.mp4"
@@ -505,10 +505,12 @@ class InstagramTaggedExtractor(InstagramExtractor):
     def metadata(self):
         if self.item.startswith("id:"):
             self.user_id = self.item[3:]
-            return {"tagged_owner_id": self.user_id}
-
-        self.user_id = self.api.user_id(self.item)
-        user = self.api.user_by_name(self.item)
+            if not self.config("metadata"):
+                return {"tagged_owner_id": self.user_id}
+            user = self.api.user_by_id(self.user_id)
+        else:
+            self.user_id = self.api.user_id(self.item)
+            user = self.api.user_by_name(self.item)

         return {
             "tagged_owner_id" : user["id"],
diff --git a/gallery_dl/extractor/iwara.py b/gallery_dl/extractor/iwara.py
index 179909b..8af2f42 100644
--- a/gallery_dl/extractor/iwara.py
+++ b/gallery_dl/extractor/iwara.py
@@ -45,6 +45,7 @@ class IwaraExtractor(Extractor):
                         image["id"], exc.__class__.__name__, exc)
                     continue

+            group_info["type"] = "image"
             group_info["count"] = len(files)
             yield Message.Directory, group_info
             for num, file in enumerate(files, 1):
@@ -102,34 +103,37 @@ class IwaraExtractor(Extractor):
         raise exception.AbortExtraction(f"Unsupported result type '{type}'")

     def extract_media_info(self, item, key, include_file_info=True):
-        title = t.strip() if (t := item.get("title")) else ""
+        info = {
+            "id"      : item["id"],
+            "slug"    : item.get("slug"),
+            "rating"  : item.get("rating"),
+            "likes"   : item.get("numLikes"),
+            "views"   : item.get("numViews"),
+            "comments": item.get("numComments"),
+            "tags"    : [t["id"] for t in item.get("tags") or ()],
+            "title"   : t.strip() if (t := item.get("title")) else "",
+            "description": t.strip() if (t := item.get("body")) else "",
+        }

         if include_file_info:
             file_info = item if key is None else item.get(key) or {}
             filename, _, extension = file_info.get("name", "").rpartition(".")

-            return {
-                "id"       : item["id"],
-                "file_id"  : file_info.get("id"),
-                "title"    : title,
-                "filename" : filename,
-                "extension": extension,
-                "date"     : text.parse_datetime(
-                    file_info.get("createdAt"), "%Y-%m-%dT%H:%M:%S.%fZ"),
-                "date_updated": text.parse_datetime(
-                    file_info.get("updatedAt"), "%Y-%m-%dT%H:%M:%S.%fZ"),
-                "mime"     : file_info.get("mime"),
-                "size"     : file_info.get("size"),
-                "width"    : file_info.get("width"),
-                "height"   : file_info.get("height"),
-                "duration" : file_info.get("duration"),
-                "type"     : file_info.get("type"),
-            }
-        else:
-            return {
-                "id"   : item["id"],
-                "title": title,
-            }
+            info["file_id"] = file_info.get("id")
+            info["filename"] = filename
+            info["extension"] = extension
+            info["date"] = text.parse_datetime(
+                file_info.get("createdAt"), "%Y-%m-%dT%H:%M:%S.%fZ")
+            info["date_updated"] = text.parse_datetime(
+                file_info.get("updatedAt"), "%Y-%m-%dT%H:%M:%S.%fZ")
+            info["mime"] = file_info.get("mime")
+            info["size"] = file_info.get("size")
+            info["width"] = file_info.get("width")
+            info["height"] = file_info.get("height")
+            info["duration"] = file_info.get("duration")
+            info["type"] = file_info.get("type")
+
+        return info

     def extract_user_info(self, profile):
         user = profile.get("user") or {}
diff --git a/gallery_dl/extractor/kemono.py b/gallery_dl/extractor/kemono.py
index fc5972c..1f70031 100644
--- a/gallery_dl/extractor/kemono.py
+++ b/gallery_dl/extractor/kemono.py
@@ -407,7 +407,11 @@ class KemonoDiscordExtractor(KemonoExtractor):
             r"(/[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]+)").findall
         find_hash = util.re(HASH_PATTERN).match

-        posts = self.api.discord_channel(channel_id)
+        if (order := self.config("order-posts")) and order[0] in ("r", "d"):
+            posts = self.api.discord_channel(channel_id, channel["post_count"])
+        else:
+            posts = self.api.discord_channel(channel_id)
+
         if max_posts := self.config("max-posts"):
             posts = itertools.islice(posts, max_posts)

@@ -627,9 +631,12 @@ class KemonoAPI():
         endpoint = f"/{service}/user/{creator_id}/tags"
         return self._call(endpoint)

-    def discord_channel(self, channel_id):
+    def discord_channel(self, channel_id, post_count=None):
         endpoint = f"/discord/channel/{channel_id}"
-        return self._pagination(endpoint, {}, 150)
+        if post_count is None:
+            return self._pagination(endpoint, {}, 150)
+        else:
+            return self._pagination_reverse(endpoint, {}, 150, post_count)

     def discord_channel_lookup(self, server_id):
         endpoint = f"/discord/channel/lookup/{server_id}"
@@ -670,3 +677,18 @@ class KemonoAPI():
             if len(data) < batch:
                 return
             params["o"] += batch
+
+    def _pagination_reverse(self, endpoint, params, batch, count):
+        params["o"] = count // batch * batch
+
+        while True:
+            data = self._call(endpoint, params)
+
+            if not data:
+                return
+            data.reverse()
+            yield from data
+
+            if not params["o"]:
+                return
+            params["o"] -= batch
diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py
index c700a29..b0198d5 100644
--- a/gallery_dl/extractor/lensdump.py
+++ b/gallery_dl/extractor/lensdump.py
@@ -100,7 +100,8 @@ class LensdumpImageExtractor(LensdumpBase, Extractor):
     filename_fmt = "{category}_{id}{title:?_//}.{extension}"
     directory_fmt = ("{category}",)
     archive_fmt = "{id}"
-    pattern = r"(?:https?://)?(?:(?:i\d?\.)?lensdump\.com|\w\.l3n\.co)/i/(\w+)"
+    pattern = (r"(?:https?://)?(?:(?:i\d?\.)?lensdump\.com|\w\.l3n\.co)"
+               r"/(?:i/)?(\w+)")
     example = "https://lensdump.com/i/ID"

     def items(self):
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index 225560d..fbed328 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -96,6 +96,57 @@ class MangadexExtractor(Extractor):
         return data


+class MangadexCoversExtractor(MangadexExtractor):
+    """Extractor for mangadex manga covers"""
+    subcategory = "covers"
+    directory_fmt = ("{category}", "{manga}", "Covers")
+    filename_fmt = "{volume:>02}_{lang}.{extension}"
+    archive_fmt = "c_{cover_id}"
+    pattern = (rf"{BASE_PATTERN}/(?:title|manga)/(?!follows|feed$)([0-9a-f-]+)"
+               r"(?:/[^/?#]+)?\?tab=art")
+    example = ("https://mangadex.org/title"
+               "/01234567-89ab-cdef-0123-456789abcdef?tab=art")
+
+    def items(self):
+        base = f"{self.root}/covers/{self.uuid}/"
+        for cover in self.api.covers_manga(self.uuid):
+            data = self._transform_cover(cover)
+            name = data["cover"]
+            text.nameext_from_url(name, data)
+            data["cover_id"] = data["filename"]
+            yield Message.Directory, data
+            yield Message.Url, f"{base}{name}", data
+
+    def _transform_cover(self, cover):
+        relationships = defaultdict(list)
+        for item in cover["relationships"]:
+            relationships[item["type"]].append(item)
+        manga = self.api.manga(relationships["manga"][0]["id"])
+        for item in manga["relationships"]:
+            relationships[item["type"]].append(item)
+
+        cattributes = cover["attributes"]
+        mattributes = manga["attributes"]
+
+        return {
+            "manga"   : (mattributes["title"].get("en") or
+                         next(iter(mattributes["title"].values()))),
+            "manga_id": manga["id"],
+            "status"  : mattributes["status"],
+            "author"  : [author["attributes"]["name"]
+                         for author in relationships["author"]],
+            "artist"  : [artist["attributes"]["name"]
+                         for artist in relationships["artist"]],
+            "tags"    : [tag["attributes"]["name"]["en"]
+                         for tag in mattributes["tags"]],
+            "cover"   : cattributes["fileName"],
+            "lang"    : cattributes.get("locale"),
+            "volume"  : text.parse_int(cattributes["volume"]),
+            "date"    : text.parse_datetime(cattributes["createdAt"]),
+            "date_updated": text.parse_datetime(cattributes["updatedAt"]),
+        }
+
+
 class MangadexChapterExtractor(MangadexExtractor):
     """Extractor for manga-chapters from mangadex.org"""
     subcategory = "chapter"
@@ -239,6 +290,10 @@ class MangadexAPI():
         params = {"includes[]": ("scanlation_group",)}
         return self._call("/chapter/" + uuid, params)["data"]

+    def covers_manga(self, uuid):
+        params = {"manga[]": uuid}
+        return self._pagination_covers("/cover", params)
+
     def list(self, uuid):
         return self._call("/list/" + uuid, None, True)["data"]

@@ -374,6 +429,20 @@ class MangadexAPI():

         return self._pagination(endpoint, params, auth)

+    def _pagination_covers(self, endpoint, params=None, auth=False):
+        if params is None:
+            params = {}
+
+        lang = self.extractor.config("lang")
+        if isinstance(lang, str) and "," in lang:
+            lang = lang.split(",")
+        params["locales"] = lang
+        params["contentRating"] = None
+        params["order[volume]"] = \
+            "desc" if self.extractor.config("chapter-reverse") else "asc"
+
+        return self._pagination(endpoint, params, auth)
+
     def _pagination(self, endpoint, params, auth=False):
         config = self.extractor.config
diff --git a/gallery_dl/extractor/mangataro.py b/gallery_dl/extractor/mangataro.py
new file mode 100644
index 0000000..f4cc058
--- /dev/null
+++ b/gallery_dl/extractor/mangataro.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://mangataro.org/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text
+from ..cache import memcache
+
+BASE_PATTERN = r"(?:https?://)?mangataro\.org"
+
+
+class MangataroBase():
+    """Base class for mangataro extractors"""
+    category = "mangataro"
+    root = "https://mangataro.org"
+
+
+class MangataroChapterExtractor(MangataroBase, ChapterExtractor):
+    """Extractor for mangataro manga chapters"""
+    pattern = rf"{BASE_PATTERN}(/read/([^/?#]+)/(?:[^/?#]*-)?(\d+))"
+    example = "https://mangataro.org/read/MANGA/ch123-12345"
+
+    def metadata(self, page):
+        _, slug, chapter_id = self.groups
+        comic = self._extract_jsonld(page)["@graph"][0]
+        chapter = comic["position"]
+        minor = chapter - int(chapter)
+        desc = comic["description"].split(" - ", 3)
+
+        return {
+            **_manga_info(self, slug),
+            "title"        : desc[1] if len(desc) > 3 else "",
+            "chapter"      : int(chapter),
+            "chapter_minor": str(round(minor, 5))[1:] if minor else "",
+            "chapter_id"   : text.parse_int(chapter_id),
+            "chapter_url"  : comic["url"],
+            "date"         : text.parse_datetime(
+                comic["datePublished"], "%Y-%m-%dT%H:%M:%S%z"),
+            "date_updated" : text.parse_datetime(
+                comic["dateModified"], "%Y-%m-%dT%H:%M:%S%z"),
+        }
+
+    def images(self, page):
+        pos = page.find('class="comic-image-container')
+        img, pos = text.extract(page, ' src="', '"', pos)
+
+        images = [(img, None)]
+        images.extend(
+            (url, None)
+            for url in text.extract_iter(page, 'data-src="', '"', pos)
+        )
+        return images
+
+
+class MangataroMangaExtractor(MangataroBase, MangaExtractor):
+    """Extractor for mangataro manga"""
+    chapterclass = MangataroChapterExtractor
+    pattern = rf"{BASE_PATTERN}(/manga/([^/?#]+))"
+    example = "https://mangataro.org/manga/MANGA"
+
+    def chapters(self, page):
+        slug = self.groups[1]
+        manga = _manga_info(self, slug)
+
+        results = []
+        for url in text.extract_iter(text.extr(
+                page, '<div class="chapter-list', '<div id="tab-gallery"'),
+                '<a href="', '"'):
+            chapter, _, chapter_id = url[url.rfind("/")+3:].rpartition("-")
+            chapter, sep, minor = chapter.partition("-")
+            results.append((url, {
+                **manga,
+                "chapter"      : text.parse_int(chapter),
+                "chapter_minor": f".{minor}" if sep else "",
+                "chapter_id"   : text.parse_int(chapter_id),
+            }))
+        return results
+
+
+@memcache(keyarg=1)
+def _manga_info(self, slug):
+    url = f"{self.root}/manga/{slug}"
+    page = self.request(url).text
+    manga = self._extract_jsonld(page)
+
+    return {
+        "manga"      : manga["name"].rpartition(" | ")[0].rpartition(" ")[0],
+        "manga_url"  : manga["url"],
+        "cover"      : manga["image"],
+        "author"     : manga["author"]["name"].split(", "),
+        "genre"      : manga["genre"],
+        "status"     : manga["status"],
+        "description": text.unescape(text.extr(
+            page, 'id="description-content-tab">', "</div></div>")),
+        "tags"       : text.split_html(text.extr(
+            page, ">Genres</h4>", "</div>")),
+        "publisher"  : text.remove_html(text.extr(
+            page, '>Serialization</h4>', "</div>")),
+    }
diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py
index 9c335ad..ff771fb 100644
--- a/gallery_dl/extractor/pinterest.py
+++ b/gallery_dl/extractor/pinterest.py
@@ -204,58 +204,6 @@ class PinterestExtractor(Extractor):
         return media


-class PinterestPinExtractor(PinterestExtractor):
-    """Extractor for images from a single pin from pinterest.com"""
-    subcategory = "pin"
-    pattern = BASE_PATTERN + r"/pin/([^/?#]+)(?!.*#related$)"
-    example = "https://www.pinterest.com/pin/12345/"
-
-    def __init__(self, match):
-        PinterestExtractor.__init__(self, match)
-        self.pin_id = match[1]
-        self.pin = None
-
-    def metadata(self):
-        self.pin = self.api.pin(self.pin_id)
-        return self.pin
-
-    def pins(self):
-        return (self.pin,)
-
-
-class PinterestBoardExtractor(PinterestExtractor):
-    """Extractor for images from a board from pinterest.com"""
-    subcategory = "board"
-    directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}")
-    archive_fmt = "{board[id]}_{id}"
-    pattern = (BASE_PATTERN + r"/(?!pin/)([^/?#]+)"
-               r"/(?!_saved|_created|pins/)([^/?#]+)/?(?:$|\?|#)")
-    example = "https://www.pinterest.com/USER/BOARD/"
-
-    def __init__(self, match):
-        PinterestExtractor.__init__(self, match)
-        self.user = text.unquote(match[1])
-        self.board_name = text.unquote(match[2])
-        self.board = None
-
-    def metadata(self):
-        self.board = self.api.board(self.user, self.board_name)
-        return {"board": self.board}
-
-    def pins(self):
-        board = self.board
-        pins = self.api.board_pins(board["id"])
-
-        if board["section_count"] and self.config("sections", True):
-            base = f"{self.root}{board['url']}id:"
-            data = {"_extractor": PinterestSectionExtractor}
-            sections = [(base + section["id"], data)
-                        for section in self.api.board_sections(board["id"])]
-            pins = itertools.chain(pins, sections)
-
-        return pins
-
-
 class PinterestUserExtractor(PinterestExtractor):
     """Extractor for a user's boards"""
     subcategory = "user"
@@ -357,6 +305,58 @@ class PinterestSearchExtractor(PinterestExtractor):
         return self.api.search(self.search)


+class PinterestPinExtractor(PinterestExtractor):
+    """Extractor for images from a single pin from pinterest.com"""
+    subcategory = "pin"
+    pattern = BASE_PATTERN + r"/pin/([^/?#]+)(?!.*#related$)"
+    example = "https://www.pinterest.com/pin/12345/"
+
+    def __init__(self, match):
+        PinterestExtractor.__init__(self, match)
+        self.pin_id = match[1]
+        self.pin = None
+
+    def metadata(self):
+        self.pin = self.api.pin(self.pin_id)
+        return self.pin
+
+    def pins(self):
+        return (self.pin,)
+
+
+class PinterestBoardExtractor(PinterestExtractor):
+    """Extractor for images from a board from pinterest.com"""
+    subcategory = "board"
+    directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}")
+    archive_fmt = "{board[id]}_{id}"
+    pattern = (BASE_PATTERN + r"/(?!pin/)([^/?#]+)"
+               r"/([^/?#]+)/?(?!.*#related$)")
+    example = "https://www.pinterest.com/USER/BOARD/"
+
+    def __init__(self, match):
+        PinterestExtractor.__init__(self, match)
+        self.user = text.unquote(match[1])
+        self.board_name = text.unquote(match[2])
+        self.board = None
+
+    def metadata(self):
+        self.board = self.api.board(self.user, self.board_name)
+        return {"board": self.board}
+
+    def pins(self):
+        board = self.board
+        pins = self.api.board_pins(board["id"])
+
+        if board["section_count"] and self.config("sections", True):
+            base = f"{self.root}{board['url']}id:"
+            data = {"_extractor": PinterestSectionExtractor}
+            sections = [(base + section["id"], data)
+                        for section in self.api.board_sections(board["id"])]
+            pins = itertools.chain(pins, sections)
+
+        return pins
+
+
 class PinterestRelatedPinExtractor(PinterestPinExtractor):
     """Extractor for related pins of another pin from pinterest.com"""
     subcategory = "related-pin"
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 9febda9..e20d80e 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -56,6 +56,7 @@ class RedditExtractor(Extractor):
             urls = []

             if submission:
+                submission["comment"] = None
                 submission["date"] = text.parse_timestamp(
                     submission["created_utc"])
                 yield Message.Directory, submission
@@ -99,14 +100,13 @@ class RedditExtractor(Extractor):
                 elif not submission["is_self"]:
                     urls.append((url, submission))

+                if selftext and (txt := submission["selftext_html"]):
+                    for url in text.extract_iter(txt, ' href="', '"'):
+                        urls.append((url, submission))
+
             elif parentdir:
                 yield Message.Directory, comments[0]

-            if selftext and submission:
-                for url in text.extract_iter(
-                        submission["selftext_html"] or "", ' href="', '"'):
-                    urls.append((url, submission))
-
             if self.api.comments:
                 if comments and not submission:
                     submission = comments[0]
@@ -115,24 +115,24 @@ class RedditExtractor(Extractor):
                     yield Message.Directory, submission

                 for comment in comments:
+                    media = (embeds and "media_metadata" in comment)
                     html = comment["body_html"] or ""
                     href = (' href="' in html)
-                    media = (embeds and "media_metadata" in comment)
-                    if media or href:
-                        comment["date"] = text.parse_timestamp(
-                            comment["created_utc"])
-                        if submission:
-                            data = submission.copy()
-                            data["comment"] = comment
-                        else:
-                            data = comment
+                    if not media and not href:
+                        continue
+
+                    data = submission.copy()
+                    data["comment"] = comment
+                    comment["date"] = text.parse_timestamp(
+                        comment["created_utc"])

                     if media:
-                        for embed in self._extract_embed(comment):
-                            submission["num"] += 1
-                            text.nameext_from_url(embed, submission)
-                            yield Message.Url, embed, submission
+                        for url in self._extract_embed(comment):
+                            data["num"] += 1
+                            text.nameext_from_url(url, data)
+                            yield Message.Url, url, data
+                        submission["num"] = data["num"]

                     if href:
                         for url in text.extract_iter(html, ' href="', '"'):
diff --git a/gallery_dl/extractor/schalenetwork.py b/gallery_dl/extractor/schalenetwork.py
index d517287..dc42417 100644
--- a/gallery_dl/extractor/schalenetwork.py
+++ b/gallery_dl/extractor/schalenetwork.py
@@ -10,7 +10,6 @@

 from .common import GalleryExtractor, Extractor, Message
 from .. import text, exception
-from ..cache import cache
 import collections

 BASE_PATTERN = (
@@ -27,6 +26,8 @@ class SchalenetworkExtractor(Extractor):
     category = "schalenetwork"
     root = "https://niyaniya.moe"
     root_api = "https://api.schale.network"
+    root_auth = "https://auth.schale.network"
+    extr_class = None
     request_interval = (0.5, 1.5)

     def _init(self):
@@ -38,6 +39,7 @@ class SchalenetworkExtractor(Extractor):

     def _pagination(self, endpoint, params):
         url_api = self.root_api + endpoint
+        cls = self.extr_class

         while True:
             data = self.request_json(
@@ -49,8 +51,8 @@ class SchalenetworkExtractor(Extractor):
                 return

             for entry in entries:
-                url = f"{self.root}/g/{entry['id']}/{entry['public_key']}"
-                entry["_extractor"] = SchalenetworkGalleryExtractor
+                url = f"{self.root}/g/{entry['id']}/{entry['key']}"
+                entry["_extractor"] = cls
                 yield Message.Queue, url, entry

             try:
@@ -60,6 +62,34 @@ class SchalenetworkExtractor(Extractor):
                 pass
             params["page"] += 1

+    def _token(self):
+        if token := self.config("token"):
+            return f"Bearer {token.rpartition(' ')[2]}"
+        raise exception.AuthRequired("'token'", "your favorites")
+
+    def _crt(self):
+        crt = self.config("crt")
+        if not crt:
+            self._require_auth()
+
+        if not text.re(r"^[0-9a-f-]+$").match(crt):
+            path, _, qs = crt.partition("?")
+            if not qs:
+                qs = path
+            crt = text.parse_query(qs).get("crt")
+            if not crt:
+                self._require_auth()
+
+        return crt
+
+    def _require_auth(self, exc=None):
+        if exc is None:
+            msg = None
+        else:
+            msg = f"{exc.status} {exc.response.reason}"
+        raise exception.AuthRequired(
+            "'crt' query parameter & matching '--user-agent'", None, msg)
+

 class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
     """Extractor for schale.network galleries"""
@@ -67,7 +97,7 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
     directory_fmt = ("{category}", "{id} {title}")
     archive_fmt = "{id}_{num}"
     request_interval = 0.0
-    pattern = BASE_PATTERN + r"/(?:g|reader)/(\d+)/(\w+)"
+    pattern = rf"{BASE_PATTERN}/(?:g|reader)/(\d+)/(\w+)"
     example = "https://niyaniya.moe/g/12345/67890abcde/"

     TAG_TYPES = {
@@ -86,27 +116,10 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
         12: "other",
     }

-    def __init__(self, match):
-        GalleryExtractor.__init__(self, match)
-        self.page_url = None
-
-    def _init(self):
-        self.headers = {
-            "Accept" : "*/*",
-            "Referer": self.root + "/",
-            "Origin" : self.root,
-        }
-
-        self.fmt = self.config("format")
-        self.cbz = self.config("cbz", True)
-
-        if self.cbz:
-            self.filename_fmt = "{id} {title}.{extension}"
-            self.directory_fmt = ("{category}",)
-
     def metadata(self, _):
-        url = f"{self.root_api}/books/detail/{self.groups[1]}/{self.groups[2]}"
-        self.data = data = self.request_json(url, headers=self.headers)
+        _, gid, gkey = self.groups
+        url = f"{self.root_api}/books/detail/{gid}/{gkey}"
+        data = self.request_json(url, headers=self.headers)
         data["date"] = text.parse_timestamp(data["created_at"] // 1000)

         tags = []
@@ -127,53 +140,42 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
             data["tags_" + types[type]] = values

         try:
-            if self.cbz:
-                data["count"] = len(data["thumbnails"]["entries"])
+            data["count"] = len(data["thumbnails"]["entries"])
             del data["thumbnails"]
-            del data["rels"]
         except Exception:
             pass

         return data

     def images(self, _):
-        data = self.data
-        fmt = self._select_format(data["data"])
+        crt = self._crt()
+        _, gid, gkey = self.groups
+        url = f"{self.root_api}/books/detail/{gid}/{gkey}?crt={crt}"
+        try:
+            data = self.request_json(url, method="POST", headers=self.headers)
+        except exception.HttpError as exc:
+            self._require_auth(exc)

-        url = (f"{self.root_api}/books/data/{data['id']}/"
-               f"{data['public_key']}/{fmt['id']}/{fmt['public_key']}")
-        params = {
-            "v": data["updated_at"],
-            "w": fmt["w"],
-        }
+        fmt = self._select_format(data["data"])

-        if self.cbz:
-            params["action"] = "dl"
-            base = self.request_json(
-                url, method="POST", params=params, headers=self.headers,
-            )["base"]
-            url = f"{base}?v={data['updated_at']}&w={fmt['w']}"
-            info = text.nameext_from_url(base)
-            if not info["extension"]:
-                info["extension"] = "cbz"
-            return ((url, info),)
-
-        data = self.request_json(url, params=params, headers=self.headers)
+        url = (f"{self.root_api}/books/data/{gid}/{gkey}"
+               f"/{fmt['id']}/{fmt['key']}/{fmt['w']}?crt={crt}")
+        data = self.request_json(url, headers=self.headers)
         base = data["base"]

         results = []
         for entry in data["entries"]:
             dimensions = entry["dimensions"]
             info = {
-                "w": dimensions[0],
-                "h": dimensions[1],
+                "width" : dimensions[0],
+                "height": dimensions[1],
                 "_http_headers": self.headers,
             }
             results.append((base + entry["path"], info))
         return results

     def _select_format(self, formats):
-        fmt = self.fmt
+        fmt = self.config("format")

         if not fmt or fmt == "best":
             fmtids = ("0", "1600", "1280", "980", "780")
@@ -182,7 +184,7 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
         elif isinstance(fmt, list):
             fmtids = fmt
         else:
-            fmtids = (str(self.fmt),)
+            fmtids = (str(fmt),)

         for fmtid in fmtids:
             try:
@@ -203,44 +205,39 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
 class SchalenetworkSearchExtractor(SchalenetworkExtractor):
     """Extractor for schale.network search results"""
     subcategory = "search"
-    pattern = BASE_PATTERN + r"/\?([^#]*)"
-    example = "https://niyaniya.moe/?s=QUERY"
+    pattern = rf"{BASE_PATTERN}/(?:tag/([^/?#]+)|browse)?(?:/?\?([^#]*))?$"
+    example = "https://niyaniya.moe/browse?s=QUERY"

     def items(self):
-        params = text.parse_query(self.groups[1])
+        _, tag, qs = self.groups
+
+        params = text.parse_query(qs)
         params["page"] = text.parse_int(params.get("page"), 1)
+
+        if tag is not None:
+            ns, sep, tag = text.unquote(tag).partition(":")
+            if "+" in tag:
+                tag = tag.replace("+", " ")
+                q = '"'
+            else:
+                q = ""
+            q = '"' if " " in tag else ""
+            params["s"] = f"{ns}{sep}{q}^{tag}${q}"
+
         return self._pagination("/books", params)


 class SchalenetworkFavoriteExtractor(SchalenetworkExtractor):
     """Extractor for schale.network favorites"""
     subcategory = "favorite"
-    pattern = BASE_PATTERN + r"/favorites(?:\?([^#]*))?"
+    pattern = rf"{BASE_PATTERN}/favorites(?:\?([^#]*))?"
     example = "https://niyaniya.moe/favorites"

     def items(self):
-        self.login()
-
         params = text.parse_query(self.groups[1])
         params["page"] = text.parse_int(params.get("page"), 1)
-        return self._pagination("/favorites", params)
-
-    def login(self):
-        username, password = self._get_auth_info()
-        if username:
-            self.headers["Authorization"] = \
-                "Bearer " + self._login_impl(username, password)
-            return
-
-        raise exception.AuthenticationError("Username and password required")
-
-    @cache(maxage=86400, keyarg=1)
-    def _login_impl(self, username, password):
-        self.log.info("Logging in as %s", username)
-
-        url = "https://auth.schale.network/login"
-        data = {"uname": username, "passwd": password}
-        response = self.request(
-            url, method="POST", headers=self.headers, data=data)
-        return response.json()["session"]
+        self.headers["Authorization"] = self._token()
+        return self._pagination(f"/books/favorites?crt={self._crt()}", params)
+
+
+SchalenetworkExtractor.extr_class = SchalenetworkGalleryExtractor
diff --git a/gallery_dl/extractor/simpcity.py b/gallery_dl/extractor/simpcity.py
index 8cc7e38..3354289 100644
--- a/gallery_dl/extractor/simpcity.py
+++ b/gallery_dl/extractor/simpcity.py
@@ -20,18 +20,20 @@ class SimpcityExtractor(Extractor):
     root = "https://simpcity.cr"

     def items(self):
-        extract_urls = text.re(r' href="([^"]+)').findall
+        extract_urls = text.re(
+            r'<(?:a [^>]*?href|iframe [^>]*?src)="([^"]+)').findall

         for post in self.posts():
             urls = extract_urls(post["content"])
             data = {"post": post}
             post["count"] = data["count"] = len(urls)
+
             yield Message.Directory, data
             for data["num"], url in enumerate(urls, 1):
                 yield Message.Queue, url, data

     def request_page(self, url):
         try:
-            return self.request(url).text
+            return self.request(url)
         except exception.HttpError as exc:
             if exc.status == 403 and b">Log in<" in exc.response.content:
                 msg = text.extr(exc.response.text, "blockMessage--error", "</")
@@ -44,14 +46,14 @@ class SimpcityExtractor(Extractor):
         base = f"{self.root}{base}"

         if pnum is None:
-            url = base
+            url = f"{base}/"
             pnum = 1
         else:
             url = f"{base}/page-{pnum}"
             pnum = None

         while True:
-            page = self.request_page(url)
+            page = self.request_page(url).text
             yield page

@@ -60,6 +62,31 @@ class SimpcityExtractor(Extractor):
             pnum += 1
             url = f"{base}/page-{pnum}"

+    def _pagination_reverse(self, base, pnum=None):
+        base = f"{self.root}{base}"
+
+        url = f"{base}/page-9999"  # force redirect to last page
+        with self.request_page(url) as response:
+            url = response.url
+            if url[-1] == "/":
+                pnum = 1
+            else:
+                pnum = text.parse_int(url[url.rfind("-")+1:], 1)
+            page = response.text
+
+        while True:
+            yield page
+
+            pnum -= 1
+            if pnum > 1:
+                url = f"{base}/page-{pnum}"
+            elif pnum == 1:
+                url = f"{base}/"
+            else:
+                return
+
+            page = self.request_page(url).text
+
     def _parse_thread(self, page):
         schema = self._extract_jsonld(page)["mainEntity"]
         author = schema["author"]
@@ -92,7 +119,8 @@ class SimpcityExtractor(Extractor):
             "id": extr('data-content="post-', '"'),
             "author_url": extr('itemprop="url" content="', '"'),
             "date": text.parse_datetime(extr('datetime="', '"')),
-            "content": extr('<div itemprop="text">', "\t\t</div>").strip(),
+            "content": extr('<div itemprop="text">',
+                            '<div class="js-selectToQuote').strip(),
         }

         url_a = post["author_url"]
@@ -109,7 +137,7 @@ class SimpcityPostExtractor(SimpcityExtractor):
     def posts(self):
         post_id = self.groups[0]
         url = f"{self.root}/posts/{post_id}/"
-        page = self.request_page(url)
+        page = self.request_page(url).text

         pos = page.find(f'data-content="post-{post_id}"')
         if pos < 0:
@@ -126,10 +154,22 @@ class SimpcityThreadExtractor(SimpcityExtractor):
     example = "https://simpcity.cr/threads/TITLE.12345/"

     def posts(self):
-        for page in self._pagination(*self.groups):
+        if (order := self.config("order-posts")) and \
+                order[0] not in ("d", "r"):
+            pages = self._pagination(*self.groups)
+            reverse = False
+        else:
+            pages = self._pagination_reverse(*self.groups)
+            reverse = True
+
+        for page in pages:
             if "thread" not in self.kwdict:
                 self.kwdict["thread"] = self._parse_thread(page)
-            for html in text.extract_iter(page, "<article ", "</article>"):
+            posts = text.extract_iter(page, "<article ", "</article>")
+            if reverse:
+                posts = list(posts)
+                posts.reverse()
+            for html in posts:
                 yield self._parse_post(html)
diff --git a/gallery_dl/extractor/thehentaiworld.py b/gallery_dl/extractor/thehentaiworld.py
new file mode 100644
index 0000000..055d7d8
--- /dev/null
+++ b/gallery_dl/extractor/thehentaiworld.py
@@ -0,0 +1,139 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://thehentaiworld.com/"""
+
+from .common import Extractor, Message
+from .. import text, util
+import collections
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?thehentaiworld\.com"
+
+
+class ThehentaiworldExtractor(Extractor):
+    """Base class for thehentaiworld extractors"""
+    category = "thehentaiworld"
+    root = "https://thehentaiworld.com"
+    filename_fmt = "{title} ({id}{num:?-//}).{extension}"
+    archive_fmt = "{id}_{num}"
+    request_interval = (0.5, 1.5)
+
+    def items(self):
+        for url in self.posts():
+            try:
+                post = self._extract_post(url)
+            except Exception as exc:
+                self.status |= 1
+                self.log.warning("Failed to extract post %s (%s: %s)",
+                                 url, exc.__class__.__name__, exc)
+                continue
+
+            if "file_urls" in post:
+                urls = post["file_urls"]
+                post["count"] = len(urls)
+                yield Message.Directory, post
+                for post["num"], url in enumerate(urls, 1):
+                    text.nameext_from_url(url, post)
+                    yield Message.Url, url, post
+            else:
+                yield Message.Directory, post
+                url = post["file_url"]
+                text.nameext_from_url(url, post)
+                yield Message.Url, url, post
+
+    def _extract_post(self, url):
+        extr = text.extract_from(self.request(url).text)
+
+        post = {
+            "num"  : 0,
+            "count": 1,
+            "title": text.unescape(extr("<title>", "<").strip()),
+            "id"   : text.parse_int(extr(" postid-", " ")),
+            "slug" : extr(" post-", '"'),
+            "tags" : extr('id="tagsHead">', "</ul>"),
+            "date" : text.parse_datetime(extr(
+                "<li>Posted: ", "<"), "%Y-%m-%d"),
+        }
+
+        if "/videos/" in url:
+            post["type"] = "video"
+            post["width"] = post["height"] = 0
+            post["votes"] = text.parse_int(extr("(<strong>", "</strong>"))
+            post["score"] = text.parse_float(extr("<strong>", "<"))
+            post["file_url"] = extr('<source src="', '"')
+        else:
+            post["type"] = "image"
+            post["width"] = text.parse_int(extr("<li>Size: ", " "))
+            post["height"] = text.parse_int(extr("x ", "<"))
+            post["file_url"] = extr('a href="', '"')
+            post["votes"] = text.parse_int(extr("(<strong>", "</strong>"))
+            post["score"] = text.parse_float(extr("<strong>", "<"))
+
+            if doujin := extr('<a id="prev-page"', "</div></div><"):
+                repl = text.re(r"-220x\d+\.").sub
+                post["file_urls"] = [
+                    repl(".", url)
+                    for url in text.extract_iter(
+                        doujin, 'class="border" src="', '"')
+                ]
+
+        tags = collections.defaultdict(list)
+        pattern = text.re(r'<li><a class="([^"]*)" href="[^"]*">([^<]+)')
+        for tag_type, tag_name in pattern.findall(post["tags"]):
+            tags[tag_type].append(tag_name)
+        post["tags"] = tags_list = []
+        for key, value in tags.items():
+            tags_list.extend(value)
+            post[f"tags_{key}" if key else "tags_general"] = value
+
+        return post
+
+    def _pagination(self, endpoint):
+        base = f"{self.root}{endpoint}"
+        pnum = self.page_start
+
+        while True:
+            url = base if pnum < 2 else f"{base}page/{pnum}/"
+            page = self.request(url).text
+
+            yield from text.extract_iter(text.extr(
+                page, 'id="thumbContainer"', "<script"), ' href="', '"')
+
+            if 'class="next"' not in page:
+                return
+            pnum += 1
+
+
+class ThehentaiworldPostExtractor(ThehentaiworldExtractor):
+    subcategory = "post"
+    pattern = (rf"{BASE_PATTERN}"
+               rf"(/(?:(?:3d-cgi-)?hentai-image|video)s/([^/?#]+))")
+    example = "https://thehentaiworld.com/hentai-images/SLUG/"
+
+    def posts(self):
+        return (f"{self.root}{self.groups[0]}/",)
+
+
+class ThehentaiworldTagExtractor(ThehentaiworldExtractor):
+    subcategory = "tag"
+    per_page = 24
+    page_start = 1
+    post_start = 0
+    directory_fmt = ("{category}", "{search_tags}")
+    pattern = rf"{BASE_PATTERN}/tag/([^/?#]+)"
+    example = "https://thehentaiworld.com/tag/TAG/"
+
+    def posts(self):
+        self.kwdict["search_tags"] = tag = self.groups[0]
+        return util.advance(self._pagination(f"/tag/{tag}/"), self.post_start)
+
+    def skip(self, num):
+        pages, posts = divmod(num, self.per_page)
+        self.page_start += pages
+        self.post_start += posts
+        return num
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index ed3cfae..e6c84d1 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -2070,7 +2070,7 @@ class TwitterAPI():
             quoted = tweet["quoted_status_result"]["result"]
             quoted["legacy"]["quoted_by"] = (
                 tweet["core"]["user_results"]["result"]
-                ["legacy"]["screen_name"])
+                ["core"]["screen_name"])
             quoted["legacy"]["quoted_by_id_str"] = tweet["rest_id"]
             quoted["sortIndex"] = entry.get("sortIndex")
diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py
index e53ecf4..294fc57 100644
--- a/gallery_dl/extractor/vipergirls.py
+++ b/gallery_dl/extractor/vipergirls.py
@@ -51,8 +51,16 @@ class VipergirlsExtractor(Extractor):
             like = False

         posts = root.iter("post")
-        if self.page:
-            util.advance(posts, (text.parse_int(self.page[5:]) - 1) * 15)
+        if (order := self.config("order-posts")) and \
+                order[0] not in ("d", "r"):
+            if self.page:
+                util.advance(posts, (text.parse_int(self.page[5:]) - 1) * 15)
+        else:
+            posts = list(posts)
+            if self.page:
+                offset = text.parse_int(self.page[5:]) * 15
+                posts = posts[:offset]
+            posts.reverse()

         for post in posts:
             images = list(post)
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index 9d98e68..9369e5d 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -489,9 +489,6 @@ class DownloadJob(Job):

         self.extractor.cookies_store()

-        if "finalize" in hooks:
-            for callback in hooks["finalize"]:
-                callback(pathfmt)
         if self.status:
             if "finalize-error" in hooks:
                 for callback in hooks["finalize-error"]:
@@ -500,6 +497,9 @@ class DownloadJob(Job):
             if "finalize-success" in hooks:
                 for callback in hooks["finalize-success"]:
                     callback(pathfmt)
+        if "finalize" in hooks:
+            for callback in hooks["finalize"]:
+                callback(pathfmt)

     def handle_skip(self):
         pathfmt = self.pathfmt
diff --git a/gallery_dl/postprocessor/common.py b/gallery_dl/postprocessor/common.py
index 8da8417..9992c56 100644
--- a/gallery_dl/postprocessor/common.py
+++ b/gallery_dl/postprocessor/common.py
@@ -54,7 +54,11 @@ class PostProcessor():
             else:
                 self.log.debug(
                     "Using %s archive '%s'", self.name, archive_path)
+            job.register_hooks({"finalize": self._close_archive})
             return True

         self.archive = None
         return False
+
+    def _close_archive(self, _):
+        self.archive.close()
diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py
index c74f92f..a6d2b7f 100644
--- a/gallery_dl/postprocessor/metadata.py
+++ b/gallery_dl/postprocessor/metadata.py
@@ -45,6 +45,15 @@ class MetadataPP(PostProcessor):
                 cfmt = "\n".join(cfmt) + "\n"
             self._content_fmt = formatter.parse(cfmt).format_map
             ext = "txt"
+        elif mode == "print":
+            nl = "\n"
+            if isinstance(cfmt, list):
+                cfmt = f"{nl.join(cfmt)}{nl}"
+            if cfmt[-1] != nl and (cfmt[0] != "\f" or cfmt[1] == "F"):
+                cfmt = f"{cfmt}{nl}"
+            self.write = self._write_custom
+            self._content_fmt = formatter.parse(cfmt).format_map
+            filename = "-"
         elif mode == "jsonl":
             self.write = self._write_json
             self._json_encode = self._make_encoder(options).encode
diff --git a/gallery_dl/postprocessor/python.py b/gallery_dl/postprocessor/python.py
index db71da2..66d9343 100644
--- a/gallery_dl/postprocessor/python.py
+++ b/gallery_dl/postprocessor/python.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2023 Mike Fährmann
+# Copyright 2023-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -17,13 +17,14 @@ class PythonPP(PostProcessor):

     def __init__(self, job, options):
         PostProcessor.__init__(self, job)
-        spec = options["function"]
-        module_name, _, function_name = spec.rpartition(":")
-        module = util.import_file(module_name)
-        self.function = getattr(module, function_name)
-
-        if self._init_archive(job, options):
-            self.run = self.run_archive
+        mode = options.get("mode")
+        if mode == "eval" or not mode and options.get("expression"):
+            self.function = util.compile_expression(options["expression"])
+        else:
+            spec = options["function"]
+            module_name, _, function_name = spec.rpartition(":")
+            module = util.import_file(module_name)
+            self.function = getattr(module, function_name)

         events = options.get("event")
         if events is None:
@@ -32,6 +33,9 @@ class PythonPP(PostProcessor):
             events = events.split(",")
         job.register_hooks({event: self.run for event in events}, options)

+        if self._init_archive(job, options):
+            self.run = self.run_archive
+
     def run(self, pathfmt):
         self.function(pathfmt.kwdict)
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 7b9ce99..49c1ba8 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -542,6 +542,7 @@ def language_to_code(lang, default=None):
 CODES = {
     "ar": "Arabic",
     "bg": "Bulgarian",
+    "bn": "Bengali",
     "ca": "Catalan",
     "cs": "Czech",
     "da": "Danish",
@@ -549,9 +550,11 @@ CODES = {
     "el": "Greek",
     "en": "English",
     "es": "Spanish",
+    "fa": "Persian",
     "fi": "Finnish",
     "fr": "French",
     "he": "Hebrew",
+    "hi": "Hindi",
     "hu": "Hungarian",
     "id": "Indonesian",
     "it": "Italian",
@@ -564,9 +567,13 @@ CODES = {
     "pt": "Portuguese",
     "ro": "Romanian",
     "ru": "Russian",
+    "sk": "Slovak",
+    "sl": "Slovenian",
+    "sr": "Serbian",
     "sv": "Swedish",
     "th": "Thai",
     "tr": "Turkish",
+    "uk": "Ukrainian",
     "vi": "Vietnamese",
     "zh": "Chinese",
 }
@@ -634,6 +641,12 @@ class NullResponse():
         self.url = url
         self.reason = str(reason)

+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        pass
+
     def __str__(self):
         return "900 " + self.reason
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 277d679..4861a9d 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,5 +6,5 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-__version__ = "1.30.7"
+__version__ = "1.30.8"
 __variant__ = None
diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py
index cfc6b50..0296498 100644
--- a/gallery_dl/ytdl.py
+++ b/gallery_dl/ytdl.py
@@ -80,7 +80,10 @@ def parse_command_line(module, argv):
     parser, opts, args = module.parseOpts(argv)
     ytdlp = hasattr(module, "cookies")

-    std_headers = module.std_headers
+    try:
+        std_headers = module.utils.networking.std_headers
+    except AttributeError:
+        std_headers = module.std_headers

     try:
         parse_bytes = module.parse_bytes
@@ -345,7 +348,7 @@ def parse_command_line(module, argv):
         "nopart": opts.nopart,
         "updatetime": opts.updatetime,
         "writedescription": opts.writedescription,
-        "writeannotations": opts.writeannotations,
+        "writeannotations": getattr(opts, "writeannotations", None),
         "writeinfojson": opts.writeinfojson,
         "allow_playlist_files": opts.allow_playlist_files,
         "clean_infojson": opts.clean_infojson,
@@ -378,7 +381,8 @@ def parse_command_line(module, argv):
         "max_views": opts.max_views,
         "daterange": date,
         "cachedir": opts.cachedir,
-        "youtube_print_sig_code": opts.youtube_print_sig_code,
+        "youtube_print_sig_code": getattr(
+            opts, "youtube_print_sig_code", None),
         "age_limit": opts.age_limit,
         "download_archive": download_archive_fn,
         "break_on_existing": getattr(opts, "break_on_existing", None),
@@ -394,8 +398,8 @@ def parse_command_line(module, argv):
         "socket_timeout": opts.socket_timeout,
         "bidi_workaround": opts.bidi_workaround,
         "debug_printtraffic": opts.debug_printtraffic,
-        "prefer_ffmpeg": opts.prefer_ffmpeg,
-        "include_ads": opts.include_ads,
+        "prefer_ffmpeg": getattr(opts, "prefer_ffmpeg", None),
+        "include_ads": getattr(opts, "include_ads", None),
         "default_search": opts.default_search,
         "dynamic_mpd": getattr(opts, "dynamic_mpd", None),
         "extractor_args": getattr(opts, "extractor_args", None),
@@ -420,7 +424,7 @@ def parse_command_line(module, argv):
             opts, "sleep_interval_subtitles", None),
         "external_downloader": opts.external_downloader,
         "playlist_items": opts.playlist_items,
-        "xattr_set_filesize": opts.xattr_set_filesize,
+        "xattr_set_filesize": getattr(opts, "xattr_set_filesize", None),
         "match_filter": match_filter,
         "no_color": getattr(opts, "no_color", None),
         "ffmpeg_location": opts.ffmpeg_location,
@@ -430,7 +434,7 @@ def parse_command_line(module, argv):
             opts, "hls_split_discontinuity", None),
         "external_downloader_args": opts.external_downloader_args,
         "postprocessor_args": opts.postprocessor_args,
-        "cn_verification_proxy": opts.cn_verification_proxy,
+        "cn_verification_proxy": getattr(opts, "cn_verification_proxy", None),
         "geo_verification_proxy": opts.geo_verification_proxy,
         "geo_bypass": getattr(
             opts, "geo_bypass", "default"),
