diff options
| author | 2024-12-08 20:34:33 -0500 | |
|---|---|---|
| committer | 2024-12-08 20:34:33 -0500 | |
| commit | f6877087773089220d68288d055276fca6c556d4 (patch) | |
| tree | e4847e3bcff284c3daece7b3b9cf308dfc2129ab /gallery_dl | |
| parent | 1981ccaaea6eab2cf32536ec5afe132a870914d8 (diff) | |
New upstream version 1.28.1. (tag: upstream/1.28.1)
Diffstat (limited to 'gallery_dl')
| -rw-r--r-- | gallery_dl/extractor/__init__.py | 1 | ||||
| -rw-r--r-- | gallery_dl/extractor/bluesky.py | 20 | ||||
| -rw-r--r-- | gallery_dl/extractor/common.py | 6 | ||||
| -rw-r--r-- | gallery_dl/extractor/danbooru.py | 23 | ||||
| -rw-r--r-- | gallery_dl/extractor/gelbooru_v02.py | 64 | ||||
| -rw-r--r-- | gallery_dl/extractor/gofile.py | 4 | ||||
| -rw-r--r-- | gallery_dl/extractor/hentaicosplays.py | 45 | ||||
| -rw-r--r-- | gallery_dl/extractor/inkbunny.py | 2 | ||||
| -rw-r--r-- | gallery_dl/extractor/instagram.py | 4 | ||||
| -rw-r--r-- | gallery_dl/extractor/kemonoparty.py | 3 | ||||
| -rw-r--r-- | gallery_dl/extractor/nhentai.py | 16 | ||||
| -rw-r--r-- | gallery_dl/extractor/patreon.py | 27 | ||||
| -rw-r--r-- | gallery_dl/extractor/pixiv.py | 16 | ||||
| -rw-r--r-- | gallery_dl/extractor/readcomiconline.py | 41 | ||||
| -rw-r--r-- | gallery_dl/extractor/realbooru.py | 157 | ||||
| -rw-r--r-- | gallery_dl/extractor/zerochan.py | 21 | ||||
| -rw-r--r-- | gallery_dl/version.py | 2 |
17 files changed, 319 insertions, 133 deletions
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 594ce41..8d5f3d0 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -140,6 +140,7 @@ modules = [ "postmill", "reactor", "readcomiconline", + "realbooru", "reddit", "redgifs", "rule34us", diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py index bbff17c..f60ea15 100644 --- a/gallery_dl/extractor/bluesky.py +++ b/gallery_dl/extractor/bluesky.py @@ -75,10 +75,13 @@ class BlueskyExtractor(Extractor): quote = embed["record"] if "record" in quote: quote = quote["record"] + value = quote.pop("value", None) + if value is None: + break quote["quote_id"] = self._pid(post) quote["quote_by"] = post["author"] embed = quote.get("embed") - quote.update(quote.pop("value")) + quote.update(value) post = quote def posts(self): @@ -202,6 +205,7 @@ class BlueskyUserExtractor(BlueskyExtractor): def items(self): base = "{}/profile/{}/".format(self.root, self.user) return self._dispatch_extractors(( + (BlueskyInfoExtractor , base + "info"), (BlueskyAvatarExtractor , base + "avatar"), (BlueskyBackgroundExtractor, base + "banner"), (BlueskyPostsExtractor , base + "posts"), @@ -298,6 +302,17 @@ class BlueskyPostExtractor(BlueskyExtractor): return self.api.get_post_thread(self.user, self.post_id) +class BlueskyInfoExtractor(BlueskyExtractor): + subcategory = "info" + pattern = USER_PATTERN + r"/info" + example = "https://bsky.app/profile/HANDLE/info" + + def items(self): + self._metadata_user = True + self.api._did_from_actor(self.user) + return iter(((Message.Directory, self._user),)) + + class BlueskyAvatarExtractor(BlueskyExtractor): subcategory = "avatar" filename_fmt = "avatar_{post_id}.{extension}" @@ -324,7 +339,8 @@ class BlueskySearchExtractor(BlueskyExtractor): example = "https://bsky.app/search?q=QUERY" def posts(self): - return self.api.search_posts(self.user) + query = text.unquote(self.user.replace("+", " ")) + return 
self.api.search_posts(query) class BlueskyHashtagExtractor(BlueskyExtractor): diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index f364124..5f9d355 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -348,7 +348,7 @@ class Extractor(): ssl_options = ssl_ciphers = 0 # .netrc Authorization headers are alwsays disabled - session.trust_env = True if self.config("proxy-env", False) else False + session.trust_env = True if self.config("proxy-env", True) else False browser = self.config("browser") if browser is None: @@ -387,8 +387,8 @@ class Extractor(): useragent = self.useragent elif useragent == "browser": useragent = _browser_useragent() - elif useragent is config.get(("extractor",), "user-agent") and \ - useragent == Extractor.useragent: + elif self.useragent is not Extractor.useragent and \ + useragent is config.get(("extractor",), "user-agent"): useragent = self.useragent headers["User-Agent"] = useragent headers["Accept"] = "*/*" diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index c3dfd91..37b6747 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -26,16 +26,7 @@ class DanbooruExtractor(BaseExtractor): def _init(self): self.ugoira = self.config("ugoira", False) self.external = self.config("external", False) - - includes = self.config("metadata") - if includes: - if isinstance(includes, (list, tuple)): - includes = ",".join(includes) - elif not isinstance(includes, str): - includes = "artist_commentary,children,notes,parent,uploader" - self.includes = includes + ",id" - else: - self.includes = False + self.includes = False threshold = self.config("threshold") if isinstance(threshold, int): @@ -56,6 +47,16 @@ class DanbooruExtractor(BaseExtractor): return pages * self.per_page def items(self): + # 'includes' initialization must be done here and not in '_init()' + # or it'll cause an exception with e621 when 'metadata' is enabled + 
includes = self.config("metadata") + if includes: + if isinstance(includes, (list, tuple)): + includes = ",".join(includes) + elif not isinstance(includes, str): + includes = "artist_commentary,children,notes,parent,uploader" + self.includes = includes + ",id" + data = self.metadata() for post in self.posts(): @@ -223,7 +224,7 @@ class DanbooruTagExtractor(DanbooruExtractor): else: prefix = None elif tag.startswith( - ("id:", "md5", "ordfav:", "ordfavgroup:", "ordpool:")): + ("id:", "md5:", "ordfav:", "ordfavgroup:", "ordpool:")): prefix = None break diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index aad5752..2c1174a 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -24,10 +24,6 @@ class GelbooruV02Extractor(booru.BooruExtractor): self.user_id = self.config("user-id") self.root_api = self.config_instance("root-api") or self.root - if self.category == "realbooru": - self.items = self._items_realbooru - self._tags = self._tags_realbooru - def _api_request(self, params): url = self.root_api + "/index.php?page=dapi&s=post&q=index" return ElementTree.fromstring(self.request(url, params=params).text) @@ -82,16 +78,17 @@ class GelbooruV02Extractor(booru.BooruExtractor): params["pid"] = self.page_start * self.per_page data = {} + find_ids = re.compile(r"\sid=\"p(\d+)").findall + while True: - num_ids = 0 page = self.request(url, params=params).text + pids = find_ids(page) - for data["id"] in text.extract_iter(page, '" id="p', '"'): - num_ids += 1 + for data["id"] in pids: for post in self._api_request(data): yield post.attrib - if num_ids < self.per_page: + if len(pids) < self.per_page: return params["pid"] += self.per_page @@ -136,59 +133,8 @@ class GelbooruV02Extractor(booru.BooruExtractor): "body" : text.unescape(text.remove_html(extr(">", "</div>"))), }) - def _file_url_realbooru(self, post): - url = post["file_url"] - md5 = post["md5"] - if md5 not in post["preview_url"] or 
url.count("/") == 5: - url = "{}/images/{}/{}/{}.{}".format( - self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2]) - return url - - def _items_realbooru(self): - from .common import Message - data = self.metadata() - - for post in self.posts(): - try: - html = self._html(post) - fallback = post["file_url"] - url = post["file_url"] = text.rextract( - html, 'href="', '"', html.index(">Original<"))[0] - except Exception: - self.log.debug("Unable to fetch download URL for post %s " - "(md5: %s)", post.get("id"), post.get("md5")) - continue - - text.nameext_from_url(url, post) - post.update(data) - self._prepare(post) - self._tags(post, html) - - path = url.rpartition("/")[0] - post["_fallback"] = ( - "{}/{}.{}".format(path, post["md5"], post["extension"]), - fallback, - ) - - yield Message.Directory, post - yield Message.Url, url, post - - def _tags_realbooru(self, post, page): - tag_container = text.extr(page, 'id="tagLink"', '</div>') - tags = collections.defaultdict(list) - pattern = re.compile( - r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)') - for tag_type, tag_name in pattern.findall(tag_container): - tags[tag_type].append(text.unescape(text.unquote(tag_name))) - for key, value in tags.items(): - post["tags_" + key] = " ".join(value) - BASE_PATTERN = GelbooruV02Extractor.update({ - "realbooru": { - "root": "https://realbooru.com", - "pattern": r"realbooru\.com", - }, "rule34": { "root": "https://rule34.xxx", "root-api": "https://api.rule34.xxx", diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py index 52b4ae6..ef9ea60 100644 --- a/gallery_dl/extractor/gofile.py +++ b/gallery_dl/extractor/gofile.py @@ -75,8 +75,8 @@ class GofileFolderExtractor(Extractor): @cache(maxage=86400) def _get_website_token(self): self.log.debug("Fetching website token") - page = self.request(self.root + "/dist/js/alljs.js").text - return text.extr(page, 'wt: "', '"') + page = self.request(self.root + "/dist/js/global.js").text + return 
text.extr(page, '.wt = "', '"') def _get_content(self, content_id, password=None): headers = {"Authorization": "Bearer " + self.api_token} diff --git a/gallery_dl/extractor/hentaicosplays.py b/gallery_dl/extractor/hentaicosplays.py index fbbae16..4992b7b 100644 --- a/gallery_dl/extractor/hentaicosplays.py +++ b/gallery_dl/extractor/hentaicosplays.py @@ -5,31 +5,46 @@ # published by the Free Software Foundation. """Extractors for https://hentai-cosplay-xxx.com/ -(also works for hentai-img.com and porn-images-xxx.com)""" +(also works for hentai-img-xxx.com and porn-image.com)""" -from .common import GalleryExtractor +from .common import BaseExtractor, GalleryExtractor from .. import text -class HentaicosplaysGalleryExtractor(GalleryExtractor): +class HentaicosplaysExtractor(BaseExtractor): + basecategory = "hentaicosplays" + + +BASE_PATTERN = HentaicosplaysExtractor.update({ + "hentaicosplay": { + "root": "https://hentai-cosplay-xxx.com", + "pattern": r"(?:\w\w\.)?hentai-cosplays?(?:-xxx)?\.com", + }, + "hentaiimg": { + "root": "https://hentai-img-xxx.com", + "pattern": r"(?:\w\w\.)?hentai-img(?:-xxx)?\.com", + }, + "pornimage": { + "root": "https://porn-image.com", + "pattern": r"(?:\w\w\.)?porn-images?(?:-xxx)?\.com", + }, +}) + + +class HentaicosplaysGalleryExtractor( + HentaicosplaysExtractor, GalleryExtractor): """Extractor for image galleries from - hentai-cosplay-xxx.com, hentai-img.com, and porn-images-xxx.com""" - category = "hentaicosplays" + hentai-cosplay-xxx.com, hentai-img-xxx.com, and porn-image.com""" directory_fmt = ("{site}", "{title}") filename_fmt = "{filename}.{extension}" archive_fmt = "{title}_{filename}" - pattern = r"((?:https?://)?(?:\w{2}\.)?" 
\ - r"(hentai-cosplay(?:s|-xxx)|hentai-img|porn-images-xxx)\.com)/" \ - r"(?:image|story)/([\w-]+)" + pattern = BASE_PATTERN + r"/(?:image|story)/([\w-]+)" example = "https://hentai-cosplay-xxx.com/image/TITLE/" def __init__(self, match): - root, self.site, self.slug = match.groups() - self.root = text.ensure_http_scheme(root) - if self.root == "https://hentai-cosplays.com": - self.root = "https://hentai-cosplay-xxx.com" - url = "{}/story/{}/".format(self.root, self.slug) - GalleryExtractor.__init__(self, match, url) + BaseExtractor.__init__(self, match) + self.slug = self.groups[-1] + self.gallery_url = "{}/story/{}/".format(self.root, self.slug) def _init(self): self.session.headers["Referer"] = self.gallery_url @@ -39,7 +54,7 @@ class HentaicosplaysGalleryExtractor(GalleryExtractor): return { "title": text.unescape(title.rpartition(" Story Viewer - ")[0]), "slug" : self.slug, - "site" : self.site, + "site" : self.root.partition("://")[2].rpartition(".")[0], } def images(self, page): diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py index bff3156..47e071a 100644 --- a/gallery_dl/extractor/inkbunny.py +++ b/gallery_dl/extractor/inkbunny.py @@ -338,9 +338,9 @@ class InkbunnyAPI(): def _call(self, endpoint, params): url = "https://inkbunny.net/api_" + endpoint + ".php" - params["sid"] = self.session_id while True: + params["sid"] = self.session_id data = self.extractor.request(url, params=params).json() if "error_code" not in data: diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index a866f45..e6b6b14 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -207,8 +207,8 @@ class InstagramExtractor(Extractor): for user in coauthors ] - if "carousel_media" in post: - items = post["carousel_media"] + items = post.get("carousel_media") + if items: data["sidecar_media_id"] = data["post_id"] data["sidecar_shortcode"] = data["post_shortcode"] else: diff --git 
a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 3d04f75..16c5b99 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -552,7 +552,8 @@ class KemonoAPI(): return response.json() def _pagination(self, endpoint, params, batch=50, key=False): - params["o"] = text.parse_int(params.get("o")) % 50 + offset = text.parse_int(params.get("o")) + params["o"] = offset - offset % batch while True: data = self._call(endpoint, params) diff --git a/gallery_dl/extractor/nhentai.py b/gallery_dl/extractor/nhentai.py index 90c5420..0d656d0 100644 --- a/gallery_dl/extractor/nhentai.py +++ b/gallery_dl/extractor/nhentai.py @@ -11,6 +11,7 @@ from .common import GalleryExtractor, Extractor, Message from .. import text, util import collections +import random class NhentaiGalleryExtractor(GalleryExtractor): @@ -59,15 +60,18 @@ class NhentaiGalleryExtractor(GalleryExtractor): } def images(self, _): - ufmt = ("https://i.nhentai.net/galleries/" + - self.data["media_id"] + "/{}.{}") - extdict = {"j": "jpg", "p": "png", "g": "gif", "w": "webp"} + exts = {"j": "jpg", "p": "png", "g": "gif", "w": "webp", "a": "avif"} + + data = self.data + ufmt = ("https://i{}.nhentai.net/galleries/" + + data["media_id"] + "/{}.{}").format return [ - (ufmt.format(num, extdict.get(img["t"], "jpg")), { - "width": img["w"], "height": img["h"], + (ufmt(random.randint(1, 4), num, exts.get(img["t"], "jpg")), { + "width" : img["w"], + "height": img["h"], }) - for num, img in enumerate(self.data["images"]["pages"], 1) + for num, img in enumerate(data["images"]["pages"], 1) ] diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 3eacf1a..e4a5985 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -23,18 +23,22 @@ class PatreonExtractor(Extractor): directory_fmt = ("{category}", "{creator[full_name]}") filename_fmt = "{id}_{title}_{num:>02}.{extension}" archive_fmt = "{id}_{num}" + 
useragent = "Patreon/72.2.28 (Android; Android 14; Scale/2.10)" _warning = True def _init(self): - if self.cookies_check(("session_id",)): - self.session.headers["User-Agent"] = \ - "Patreon/72.2.28 (Android; Android 14; Scale/2.10)" - else: + if not self.cookies_check(("session_id",)): if self._warning: PatreonExtractor._warning = False self.log.warning("no 'session_id' cookie set") - self.session.headers["User-Agent"] = \ - "Patreon/7.6.28 (Android; Android 11; Scale/2.10)" + if self.session.headers["User-Agent"] is self.useragent: + self.session.headers["User-Agent"] = \ + "Patreon/7.6.28 (Android; Android 11; Scale/2.10)" + + format_images = self.config("format-images") + if format_images: + self._images_fmt = format_images + self._images_url = self._images_url_fmt def items(self): generators = self._build_file_generators(self.config("files")) @@ -80,11 +84,20 @@ class PatreonExtractor(Extractor): def _images(self, post): for image in post.get("images") or (): - url = image.get("download_url") + url = self._images_url(image) if url: name = image.get("file_name") or self._filename(url) or url yield "image", url, name + def _images_url(self, image): + return image.get("download_url") + + def _images_url_fmt(self, image): + try: + return image["image_urls"][self._images_fmt] + except Exception: + return image.get("download_url") + def _image_large(self, post): image = post.get("image") if image: diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 8ad061d..6207bf7 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -380,8 +380,9 @@ class PixivArtworksExtractor(PixivExtractor): ajax_ids.extend(map(int, body["manga"])) ajax_ids.sort() except Exception as exc: - self.log.warning("Unable to collect artwork IDs using AJAX " - "API (%s: %s)", exc.__class__.__name__, exc) + self.log.warning("u%s: Failed to collect artwork IDs " + "using AJAX API (%s: %s)", + self.user_id, exc.__class__.__name__, exc) else: works = 
self._extend_sanity(works, ajax_ids) @@ -607,8 +608,12 @@ class PixivRankingExtractor(PixivExtractor): def works(self): ranking = self.ranking - for ranking["rank"], work in enumerate( - self.api.illust_ranking(self.mode, self.date), 1): + + works = self.api.illust_ranking(self.mode, self.date) + if self.type: + works = filter(lambda work, t=self.type: work["type"] == t, works) + + for ranking["rank"], work in enumerate(works, 1): yield work def metadata(self): @@ -648,10 +653,13 @@ class PixivRankingExtractor(PixivExtractor): date = (now - timedelta(days=1)).strftime("%Y-%m-%d") self.date = date + self.type = type = query.get("content") + self.ranking = ranking = { "mode": mode, "date": self.date, "rank": 0, + "type": type or "all", } return {"ranking": ranking} diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py index 271fa50..c0374eb 100644 --- a/gallery_dl/extractor/readcomiconline.py +++ b/gallery_dl/extractor/readcomiconline.py @@ -79,13 +79,22 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor): def images(self, page): results = [] + referer = {"_http_headers": {"Referer": self.gallery_url}} + root = text.extr(page, "return baeu(l, '", "'") + + replacements = re.findall( + r"l = l\.replace\(/([^/]+)/g, [\"']([^\"']*)", page) for block in page.split(" pth = '")[1:]: pth = text.extr(block, "", "'") + for needle, repl in re.findall( r"pth = pth\.replace\(/([^/]+)/g, [\"']([^\"']*)", block): pth = pth.replace(needle, repl) - results.append((beau(pth), None)) + for needle, repl in replacements: + pth = pth.replace(needle, repl) + + results.append((baeu(pth, root), referer)) return results @@ -119,20 +128,24 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor): return results -def beau(url): - """https://readcomiconline.li/Scripts/rguard.min.js?v=1.5.1""" +def baeu(url, root="", root_blogspot="https://2.bp.blogspot.com"): + 
"""https://readcomiconline.li/Scripts/rguard.min.js""" + if not root: + root = root_blogspot + url = url.replace("pw_.g28x", "b") url = url.replace("d2pr.x_27", "h") if url.startswith("https"): - return url - - url, sep, rest = url.partition("?") - containsS0 = "=s0" in url - url = url[:-3 if containsS0 else -6] - url = url[15:33] + url[50:] - url = url[0:-11] + url[-2:] - url = binascii.a2b_base64(url).decode() - url = url[0:13] + url[17:] - url = url[0:-2] + ("=s0" if containsS0 else "=s1600") - return "https://2.bp.blogspot.com/" + url + sep + rest + return url.replace(root_blogspot, root, 1) + + path, sep, query = url.partition("?") + + contains_s0 = "=s0" in path + path = path[:-3 if contains_s0 else -6] + path = path[15:33] + path[50:] # step1() + path = path[0:-11] + path[-2:] # step2() + path = binascii.a2b_base64(path).decode() # atob() + path = path[0:13] + path[17:] + path = path[0:-2] + ("=s0" if contains_s0 else "=s1600") + return root + "/" + path + sep + query diff --git a/gallery_dl/extractor/realbooru.py b/gallery_dl/extractor/realbooru.py new file mode 100644 index 0000000..ab8a9b1 --- /dev/null +++ b/gallery_dl/extractor/realbooru.py @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://realbooru.com/""" + +from . import booru +from .. 
import text, util +import collections +import re + +BASE_PATTERN = r"(?:https?://)?realbooru\.com" + + +class RealbooruExtractor(booru.BooruExtractor): + basecategory = "booru" + category = "realbooru" + root = "https://realbooru.com" + + def _parse_post(self, post_id): + url = "{}/index.php?page=post&s=view&id={}".format( + self.root, post_id) + page = self.request(url).text + extr = text.extract_from(page) + rating = extr('name="rating" content="', '"') + extr('class="container"', '>') + + post = { + "_html" : page, + "id" : post_id, + "rating" : "e" if rating == "adult" else (rating or "?")[0], + "tags" : text.unescape(extr(' alt="', '"')), + "file_url" : extr('src="', '"'), + "created_at": extr(">Posted at ", " by "), + "uploader" : extr(">", "<"), + "score" : extr('">', "<"), + "title" : extr('id="title" style="width: 100%;" value="', '"'), + "source" : extr('d="source" style="width: 100%;" value="', '"'), + } + + post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0] + return post + + def skip(self, num): + self.page_start += num + return num + + def _prepare(self, post): + post["date"] = text.parse_datetime(post["created_at"], "%b, %d %Y") + + def _pagination(self, params, begin, end): + url = self.root + "/index.php" + params["pid"] = self.page_start + + while True: + page = self.request(url, params=params).text + + cnt = 0 + for post_id in text.extract_iter(page, begin, end): + cnt += 1 + yield self._parse_post(post_id) + + if cnt < self.per_page: + return + params["pid"] += self.per_page + + def _tags(self, post, _): + page = post["_html"] + tag_container = text.extr(page, 'id="tagLink"', '</div>') + tags = collections.defaultdict(list) + pattern = re.compile( + r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)') + for tag_type, tag_name in pattern.findall(tag_container): + tags[tag_type].append(text.unescape(text.unquote(tag_name))) + for key, value in tags.items(): + post["tags_" + key] = " ".join(value) + + +class 
RealbooruTagExtractor(RealbooruExtractor): + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + archive_fmt = "t_{search_tags}_{id}" + per_page = 42 + pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]*)" + example = "https://realbooru.com/index.php?page=post&s=list&tags=TAG" + + def metadata(self): + self.tags = text.unquote(self.groups[0].replace("+", " ")) + return {"search_tags": self.tags} + + def posts(self): + return self._pagination({ + "page": "post", + "s" : "list", + "tags": self.tags, + }, '<a id="p', '"') + + +class RealbooruFavoriteExtractor(RealbooruExtractor): + subcategory = "favorite" + directory_fmt = ("{category}", "favorites", "{favorite_id}") + archive_fmt = "f_{favorite_id}_{id}" + per_page = 50 + pattern = BASE_PATTERN + r"/index\.php\?page=favorites&s=view&id=(\d+)" + example = "https://realbooru.com/index.php?page=favorites&s=view&id=12345" + + def metadata(self): + return {"favorite_id": text.parse_int(self.groups[0])} + + def posts(self): + return self._pagination({ + "page": "favorites", + "s" : "view", + "id" : self.groups[0], + }, '" id="p', '"') + + +class RealbooruPoolExtractor(RealbooruExtractor): + subcategory = "pool" + directory_fmt = ("{category}", "pool", "{pool} {pool_name}") + archive_fmt = "p_{pool}_{id}" + pattern = BASE_PATTERN + r"/index\.php\?page=pool&s=show&id=(\d+)" + example = "https://realbooru.com/index.php?page=pool&s=show&id=12345" + + def metadata(self): + pool_id = self.groups[0] + url = "{}/index.php?page=pool&s=show&id={}".format(self.root, pool_id) + page = self.request(url).text + + name, pos = text.extract(page, "<h4>Pool: ", "</h4>") + self.post_ids = text.extract_iter( + page, 'class="thumb" id="p', '"', pos) + + return { + "pool": text.parse_int(pool_id), + "pool_name": text.unescape(name), + } + + def posts(self): + return map( + self._parse_post, + util.advance(self.post_ids, self.page_start) + ) + + +class RealbooruPostExtractor(RealbooruExtractor): + 
subcategory = "post" + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)" + example = "https://realbooru.com/index.php?page=post&s=view&id=12345" + + def posts(self): + return (self._parse_post(self.groups[0]),) diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index f9b1a7f..4c4fb3a 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -145,6 +145,14 @@ class ZerochanTagExtractor(ZerochanExtractor): self.posts = self.posts_api self.session.headers["User-Agent"] = util.USERAGENT + exts = self.config("extensions") + if exts: + if isinstance(exts, str): + exts = exts.split(",") + self.exts = exts + else: + self.exts = ("jpg", "png", "webp", "gif") + def metadata(self): return {"search_tags": text.unquote( self.search_tag.replace("+", " "))} @@ -194,8 +202,6 @@ class ZerochanTagExtractor(ZerochanExtractor): "p" : self.page_start, } - static = "https://static.zerochan.net/.full." - while True: response = self.request(url, params=params, allow_redirects=False) @@ -221,15 +227,20 @@ class ZerochanTagExtractor(ZerochanExtractor): yield post else: for post in posts: - base = static + str(post["id"]) - post["file_url"] = base + ".jpg" - post["_fallback"] = (base + ".png",) + urls = self._urls(post) + post["file_url"] = next(urls) + post["_fallback"] = urls yield post if not data.get("next"): return params["p"] += 1 + def _urls(self, post, static="https://static.zerochan.net/.full."): + base = static + str(post["id"]) + "." + for ext in self.exts: + yield base + ext + class ZerochanImageExtractor(ZerochanExtractor): subcategory = "image" diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 2bf03f4..2dab0d6 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,5 +6,5 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.28.0" +__version__ = "1.28.1" __variant__ = None |
