New upstream version 1.28.1. (ref: upstream/1.28.1)

author: Unit 193 <unit193@unit193.net> 2024-12-08 20:34:33 -0500
committer: Unit 193 <unit193@unit193.net> 2024-12-08 20:34:33 -0500
commit: f6877087773089220d68288d055276fca6c556d4 (patch)
tree: e4847e3bcff284c3daece7b3b9cf308dfc2129ab /gallery_dl
parent: 1981ccaaea6eab2cf32536ec5afe132a870914d8 (diff)
17 files changed, 319 insertions, 133 deletions
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 594ce41..8d5f3d0 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -140,6 +140,7 @@ modules = [
     "postmill",
     "reactor",
     "readcomiconline",
+    "realbooru",
     "reddit",
     "redgifs",
     "rule34us",
diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py
index bbff17c..f60ea15 100644
--- a/gallery_dl/extractor/bluesky.py
+++ b/gallery_dl/extractor/bluesky.py
@@ -75,10 +75,13 @@ class BlueskyExtractor(Extractor):
                 quote = embed["record"]
                 if "record" in quote:
                     quote = quote["record"]
+                value = quote.pop("value", None)
+                if value is None:
+                    break
                 quote["quote_id"] = self._pid(post)
                 quote["quote_by"] = post["author"]
                 embed = quote.get("embed")
-                quote.update(quote.pop("value"))
+                quote.update(value)
                 post = quote
 
     def posts(self):
@@ -202,6 +205,7 @@ class BlueskyUserExtractor(BlueskyExtractor):
     def items(self):
         base = "{}/profile/{}/".format(self.root, self.user)
         return self._dispatch_extractors((
+            (BlueskyInfoExtractor      , base + "info"),
             (BlueskyAvatarExtractor    , base + "avatar"),
             (BlueskyBackgroundExtractor, base + "banner"),
             (BlueskyPostsExtractor     , base + "posts"),
@@ -298,6 +302,17 @@ class BlueskyPostExtractor(BlueskyExtractor):
         return self.api.get_post_thread(self.user, self.post_id)
 
 
+class BlueskyInfoExtractor(BlueskyExtractor):
+    subcategory = "info"
+    pattern = USER_PATTERN + r"/info"
+    example = "https://bsky.app/profile/HANDLE/info"
+
+    def items(self):
+        self._metadata_user = True
+        self.api._did_from_actor(self.user)
+        return iter(((Message.Directory, self._user),))
+
+
 class BlueskyAvatarExtractor(BlueskyExtractor):
     subcategory = "avatar"
     filename_fmt = "avatar_{post_id}.{extension}"
@@ -324,7 +339,8 @@ class BlueskySearchExtractor(BlueskyExtractor):
     example = "https://bsky.app/search?q=QUERY"
 
     def posts(self):
-        return self.api.search_posts(self.user)
+        query = text.unquote(self.user.replace("+", " "))
+        return self.api.search_posts(query)
 
 
 class BlueskyHashtagExtractor(BlueskyExtractor):
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index f364124..5f9d355 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -348,7 +348,7 @@ class Extractor():
         ssl_options = ssl_ciphers = 0
 
         # .netrc Authorization headers are alwsays disabled
-        session.trust_env = True if self.config("proxy-env", False) else False
+        session.trust_env = True if self.config("proxy-env", True) else False
 
         browser = self.config("browser")
         if browser is None:
@@ -387,8 +387,8 @@ class Extractor():
                 useragent = self.useragent
             elif useragent == "browser":
                 useragent = _browser_useragent()
-            elif useragent is config.get(("extractor",), "user-agent") and \
-                    useragent == Extractor.useragent:
+            elif self.useragent is not Extractor.useragent and \
+                    useragent is config.get(("extractor",), "user-agent"):
                 useragent = self.useragent
             headers["User-Agent"] = useragent
             headers["Accept"] = "*/*"
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index c3dfd91..37b6747 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -26,16 +26,7 @@ class DanbooruExtractor(BaseExtractor):
     def _init(self):
         self.ugoira = self.config("ugoira", False)
         self.external = self.config("external", False)
-
-        includes = self.config("metadata")
-        if includes:
-            if isinstance(includes, (list, tuple)):
-                includes = ",".join(includes)
-            elif not isinstance(includes, str):
-                includes = "artist_commentary,children,notes,parent,uploader"
-            self.includes = includes + ",id"
-        else:
-            self.includes = False
+        self.includes = False
 
         threshold = self.config("threshold")
         if isinstance(threshold, int):
@@ -56,6 +47,16 @@ class DanbooruExtractor(BaseExtractor):
         return pages * self.per_page
 
     def items(self):
+        # 'includes' initialization must be done here and not in '_init()'
+        # or it'll cause an exception with e621 when 'metadata' is enabled
+        includes = self.config("metadata")
+        if includes:
+            if isinstance(includes, (list, tuple)):
+                includes = ",".join(includes)
+            elif not isinstance(includes, str):
+                includes = "artist_commentary,children,notes,parent,uploader"
+            self.includes = includes + ",id"
+
         data = self.metadata()
         for post in self.posts():
 
@@ -223,7 +224,7 @@ class DanbooruTagExtractor(DanbooruExtractor):
                 else:
                     prefix = None
             elif tag.startswith(
-                    ("id:", "md5", "ordfav:", "ordfavgroup:", "ordpool:")):
+                    ("id:", "md5:", "ordfav:", "ordfavgroup:", "ordpool:")):
                 prefix = None
                 break
 
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index aad5752..2c1174a 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -24,10 +24,6 @@ class GelbooruV02Extractor(booru.BooruExtractor):
         self.user_id = self.config("user-id")
         self.root_api = self.config_instance("root-api") or self.root
 
-        if self.category == "realbooru":
-            self.items = self._items_realbooru
-            self._tags = self._tags_realbooru
-
     def _api_request(self, params):
         url = self.root_api + "/index.php?page=dapi&s=post&q=index"
         return ElementTree.fromstring(self.request(url, params=params).text)
@@ -82,16 +78,17 @@ class GelbooruV02Extractor(booru.BooruExtractor):
         params["pid"] = self.page_start * self.per_page
 
         data = {}
+        find_ids = re.compile(r"\sid=\"p(\d+)").findall
+
         while True:
-            num_ids = 0
             page = self.request(url, params=params).text
+            pids = find_ids(page)
 
-            for data["id"] in text.extract_iter(page, '" id="p', '"'):
-                num_ids += 1
+            for data["id"] in pids:
                 for post in self._api_request(data):
                     yield post.attrib
 
-            if num_ids < self.per_page:
+            if len(pids) < self.per_page:
                 return
             params["pid"] += self.per_page
 
@@ -136,59 +133,8 @@ class GelbooruV02Extractor(booru.BooruExtractor):
                 "body"  : text.unescape(text.remove_html(extr(">", "</div>"))),
             })
 
-    def _file_url_realbooru(self, post):
-        url = post["file_url"]
-        md5 = post["md5"]
-        if md5 not in post["preview_url"] or url.count("/") == 5:
-            url = "{}/images/{}/{}/{}.{}".format(
-                self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2])
-        return url
-
-    def _items_realbooru(self):
-        from .common import Message
-        data = self.metadata()
-
-        for post in self.posts():
-            try:
-                html = self._html(post)
-                fallback = post["file_url"]
-                url = post["file_url"] = text.rextract(
-                    html, 'href="', '"', html.index(">Original<"))[0]
-            except Exception:
-                self.log.debug("Unable to fetch download URL for post %s "
-                               "(md5: %s)", post.get("id"), post.get("md5"))
-                continue
-
-            text.nameext_from_url(url, post)
-            post.update(data)
-            self._prepare(post)
-            self._tags(post, html)
-
-            path = url.rpartition("/")[0]
-            post["_fallback"] = (
-                "{}/{}.{}".format(path, post["md5"], post["extension"]),
-                fallback,
-            )
-
-            yield Message.Directory, post
-            yield Message.Url, url, post
-
-    def _tags_realbooru(self, post, page):
-        tag_container = text.extr(page, 'id="tagLink"', '</div>')
-        tags = collections.defaultdict(list)
-        pattern = re.compile(
-            r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)')
-        for tag_type, tag_name in pattern.findall(tag_container):
-            tags[tag_type].append(text.unescape(text.unquote(tag_name)))
-        for key, value in tags.items():
-            post["tags_" + key] = " ".join(value)
-
 
 BASE_PATTERN = GelbooruV02Extractor.update({
-    "realbooru": {
-        "root": "https://realbooru.com",
-        "pattern": r"realbooru\.com",
-    },
     "rule34": {
         "root": "https://rule34.xxx",
         "root-api": "https://api.rule34.xxx",
diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py
index 52b4ae6..ef9ea60 100644
--- a/gallery_dl/extractor/gofile.py
+++ b/gallery_dl/extractor/gofile.py
@@ -75,8 +75,8 @@ class GofileFolderExtractor(Extractor):
     @cache(maxage=86400)
     def _get_website_token(self):
         self.log.debug("Fetching website token")
-        page = self.request(self.root + "/dist/js/alljs.js").text
-        return text.extr(page, 'wt: "', '"')
+        page = self.request(self.root + "/dist/js/global.js").text
+        return text.extr(page, '.wt = "', '"')
 
     def _get_content(self, content_id, password=None):
         headers = {"Authorization": "Bearer " + self.api_token}
diff --git a/gallery_dl/extractor/hentaicosplays.py b/gallery_dl/extractor/hentaicosplays.py
index fbbae16..4992b7b 100644
--- a/gallery_dl/extractor/hentaicosplays.py
+++ b/gallery_dl/extractor/hentaicosplays.py
@@ -5,31 +5,46 @@
 # published by the Free Software Foundation.
 
 """Extractors for https://hentai-cosplay-xxx.com/
-(also works for hentai-img.com and porn-images-xxx.com)"""
+(also works for hentai-img-xxx.com and porn-image.com)"""
 
-from .common import GalleryExtractor
+from .common import BaseExtractor, GalleryExtractor
 from .. import text
 
 
-class HentaicosplaysGalleryExtractor(GalleryExtractor):
+class HentaicosplaysExtractor(BaseExtractor):
+    basecategory = "hentaicosplays"
+
+
+BASE_PATTERN = HentaicosplaysExtractor.update({
+    "hentaicosplay": {
+        "root": "https://hentai-cosplay-xxx.com",
+        "pattern": r"(?:\w\w\.)?hentai-cosplays?(?:-xxx)?\.com",
+    },
+    "hentaiimg": {
+        "root": "https://hentai-img-xxx.com",
+        "pattern": r"(?:\w\w\.)?hentai-img(?:-xxx)?\.com",
+    },
+    "pornimage": {
+        "root": "https://porn-image.com",
+        "pattern": r"(?:\w\w\.)?porn-images?(?:-xxx)?\.com",
+    },
+})
+
+
+class HentaicosplaysGalleryExtractor(
+        HentaicosplaysExtractor, GalleryExtractor):
     """Extractor for image galleries from
-    hentai-cosplay-xxx.com, hentai-img.com, and porn-images-xxx.com"""
-    category = "hentaicosplays"
+    hentai-cosplay-xxx.com, hentai-img-xxx.com, and porn-image.com"""
     directory_fmt = ("{site}", "{title}")
     filename_fmt = "{filename}.{extension}"
     archive_fmt = "{title}_{filename}"
-    pattern = r"((?:https?://)?(?:\w{2}\.)?" \
-              r"(hentai-cosplay(?:s|-xxx)|hentai-img|porn-images-xxx)\.com)/" \
-              r"(?:image|story)/([\w-]+)"
+    pattern = BASE_PATTERN + r"/(?:image|story)/([\w-]+)"
     example = "https://hentai-cosplay-xxx.com/image/TITLE/"
 
     def __init__(self, match):
-        root, self.site, self.slug = match.groups()
-        self.root = text.ensure_http_scheme(root)
-        if self.root == "https://hentai-cosplays.com":
-            self.root = "https://hentai-cosplay-xxx.com"
-        url = "{}/story/{}/".format(self.root, self.slug)
-        GalleryExtractor.__init__(self, match, url)
+        BaseExtractor.__init__(self, match)
+        self.slug = self.groups[-1]
+        self.gallery_url = "{}/story/{}/".format(self.root, self.slug)
 
     def _init(self):
         self.session.headers["Referer"] = self.gallery_url
@@ -39,7 +54,7 @@ class HentaicosplaysGalleryExtractor(GalleryExtractor):
         return {
             "title": text.unescape(title.rpartition(" Story Viewer - ")[0]),
             "slug" : self.slug,
-            "site" : self.site,
+            "site" : self.root.partition("://")[2].rpartition(".")[0],
         }
 
     def images(self, page):
diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py
index bff3156..47e071a 100644
--- a/gallery_dl/extractor/inkbunny.py
+++ b/gallery_dl/extractor/inkbunny.py
@@ -338,9 +338,9 @@ class InkbunnyAPI():
 
     def _call(self, endpoint, params):
         url = "https://inkbunny.net/api_" + endpoint + ".php"
-        params["sid"] = self.session_id
 
         while True:
+            params["sid"] = self.session_id
             data = self.extractor.request(url, params=params).json()
 
             if "error_code" not in data:
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index a866f45..e6b6b14 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -207,8 +207,8 @@ class InstagramExtractor(Extractor):
                     for user in coauthors
                 ]
 
-            if "carousel_media" in post:
-                items = post["carousel_media"]
+            items = post.get("carousel_media")
+            if items:
                 data["sidecar_media_id"] = data["post_id"]
                 data["sidecar_shortcode"] = data["post_shortcode"]
             else:
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 3d04f75..16c5b99 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -552,7 +552,8 @@ class KemonoAPI():
         return response.json()
 
     def _pagination(self, endpoint, params, batch=50, key=False):
-        params["o"] = text.parse_int(params.get("o")) % 50
+        offset = text.parse_int(params.get("o"))
+        params["o"] = offset - offset % batch
 
         while True:
             data = self._call(endpoint, params)
diff --git a/gallery_dl/extractor/nhentai.py b/gallery_dl/extractor/nhentai.py
index 90c5420..0d656d0 100644
--- a/gallery_dl/extractor/nhentai.py
+++ b/gallery_dl/extractor/nhentai.py
@@ -11,6 +11,7 @@
 from .common import GalleryExtractor, Extractor, Message
 from .. import text, util
 import collections
+import random
 
 
 class NhentaiGalleryExtractor(GalleryExtractor):
@@ -59,15 +60,18 @@ class NhentaiGalleryExtractor(GalleryExtractor):
         }
 
     def images(self, _):
-        ufmt = ("https://i.nhentai.net/galleries/" +
-                self.data["media_id"] + "/{}.{}")
-        extdict = {"j": "jpg", "p": "png", "g": "gif", "w": "webp"}
+        exts = {"j": "jpg", "p": "png", "g": "gif", "w": "webp", "a": "avif"}
+
+        data = self.data
+        ufmt = ("https://i{}.nhentai.net/galleries/" +
+                data["media_id"] + "/{}.{}").format
 
         return [
-            (ufmt.format(num, extdict.get(img["t"], "jpg")), {
-                "width": img["w"], "height": img["h"],
+            (ufmt(random.randint(1, 4), num, exts.get(img["t"], "jpg")), {
+                "width" : img["w"],
+                "height": img["h"],
             })
-            for num, img in enumerate(self.data["images"]["pages"], 1)
+            for num, img in enumerate(data["images"]["pages"], 1)
         ]
 
 
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 3eacf1a..e4a5985 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -23,18 +23,22 @@ class PatreonExtractor(Extractor):
     directory_fmt = ("{category}", "{creator[full_name]}")
     filename_fmt = "{id}_{title}_{num:>02}.{extension}"
     archive_fmt = "{id}_{num}"
+    useragent = "Patreon/72.2.28 (Android; Android 14; Scale/2.10)"
     _warning = True
 
     def _init(self):
-        if self.cookies_check(("session_id",)):
-            self.session.headers["User-Agent"] = \
-                "Patreon/72.2.28 (Android; Android 14; Scale/2.10)"
-        else:
+        if not self.cookies_check(("session_id",)):
             if self._warning:
                 PatreonExtractor._warning = False
                 self.log.warning("no 'session_id' cookie set")
-            self.session.headers["User-Agent"] = \
-                "Patreon/7.6.28 (Android; Android 11; Scale/2.10)"
+            if self.session.headers["User-Agent"] is self.useragent:
+                self.session.headers["User-Agent"] = \
+                    "Patreon/7.6.28 (Android; Android 11; Scale/2.10)"
+
+        format_images = self.config("format-images")
+        if format_images:
+            self._images_fmt = format_images
+            self._images_url = self._images_url_fmt
 
     def items(self):
         generators = self._build_file_generators(self.config("files"))
@@ -80,11 +84,20 @@ class PatreonExtractor(Extractor):
 
     def _images(self, post):
         for image in post.get("images") or ():
-            url = image.get("download_url")
+            url = self._images_url(image)
             if url:
                 name = image.get("file_name") or self._filename(url) or url
                 yield "image", url, name
 
+    def _images_url(self, image):
+        return image.get("download_url")
+
+    def _images_url_fmt(self, image):
+        try:
+            return image["image_urls"][self._images_fmt]
+        except Exception:
+            return image.get("download_url")
+
     def _image_large(self, post):
         image = post.get("image")
         if image:
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index 8ad061d..6207bf7 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -380,8 +380,9 @@ class PixivArtworksExtractor(PixivExtractor):
                 ajax_ids.extend(map(int, body["manga"]))
                 ajax_ids.sort()
             except Exception as exc:
-                self.log.warning("Unable to collect artwork IDs using AJAX "
-                                 "API (%s: %s)", exc.__class__.__name__, exc)
+                self.log.warning("u%s: Failed to collect artwork IDs "
+                                 "using AJAX API (%s: %s)",
+                                 self.user_id, exc.__class__.__name__, exc)
             else:
                 works = self._extend_sanity(works, ajax_ids)
 
@@ -607,8 +608,12 @@ class PixivRankingExtractor(PixivExtractor):
 
     def works(self):
         ranking = self.ranking
-        for ranking["rank"], work in enumerate(
-                self.api.illust_ranking(self.mode, self.date), 1):
+
+        works = self.api.illust_ranking(self.mode, self.date)
+        if self.type:
+            works = filter(lambda work, t=self.type: work["type"] == t, works)
+
+        for ranking["rank"], work in enumerate(works, 1):
             yield work
 
     def metadata(self):
@@ -648,10 +653,13 @@ class PixivRankingExtractor(PixivExtractor):
             date = (now - timedelta(days=1)).strftime("%Y-%m-%d")
         self.date = date
 
+        self.type = type = query.get("content")
+
         self.ranking = ranking = {
             "mode": mode,
             "date": self.date,
             "rank": 0,
+            "type": type or "all",
         }
         return {"ranking": ranking}
 
diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py
index 271fa50..c0374eb 100644
--- a/gallery_dl/extractor/readcomiconline.py
+++ b/gallery_dl/extractor/readcomiconline.py
@@ -79,13 +79,22 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
 
     def images(self, page):
         results = []
+        referer = {"_http_headers": {"Referer": self.gallery_url}}
+        root = text.extr(page, "return baeu(l, '", "'")
+
+        replacements = re.findall(
+            r"l = l\.replace\(/([^/]+)/g, [\"']([^\"']*)", page)
 
         for block in page.split("    pth = '")[1:]:
             pth = text.extr(block, "", "'")
+
             for needle, repl in re.findall(
                     r"pth = pth\.replace\(/([^/]+)/g, [\"']([^\"']*)", block):
                 pth = pth.replace(needle, repl)
-            results.append((beau(pth), None))
+            for needle, repl in replacements:
+                pth = pth.replace(needle, repl)
+
+            results.append((baeu(pth, root), referer))
 
         return results
 
@@ -119,20 +128,24 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):
         return results
 
 
-def beau(url):
-    """https://readcomiconline.li/Scripts/rguard.min.js?v=1.5.1"""
+def baeu(url, root="", root_blogspot="https://2.bp.blogspot.com"):
+    """https://readcomiconline.li/Scripts/rguard.min.js"""
+    if not root:
+        root = root_blogspot
+
     url = url.replace("pw_.g28x", "b")
     url = url.replace("d2pr.x_27", "h")
 
     if url.startswith("https"):
-        return url
-
-    url, sep, rest = url.partition("?")
-    containsS0 = "=s0" in url
-    url = url[:-3 if containsS0 else -6]
-    url = url[15:33] + url[50:]
-    url = url[0:-11] + url[-2:]
-    url = binascii.a2b_base64(url).decode()
-    url = url[0:13] + url[17:]
-    url = url[0:-2] + ("=s0" if containsS0 else "=s1600")
-    return "https://2.bp.blogspot.com/" + url + sep + rest
+        return url.replace(root_blogspot, root, 1)
+
+    path, sep, query = url.partition("?")
+
+    contains_s0 = "=s0" in path
+    path = path[:-3 if contains_s0 else -6]
+    path = path[15:33] + path[50:]  # step1()
+    path = path[0:-11] + path[-2:]  # step2()
+    path = binascii.a2b_base64(path).decode()  # atob()
+    path = path[0:13] + path[17:]
+    path = path[0:-2] + ("=s0" if contains_s0 else "=s1600")
+    return root + "/" + path + sep + query
diff --git a/gallery_dl/extractor/realbooru.py b/gallery_dl/extractor/realbooru.py
new file mode 100644
index 0000000..ab8a9b1
--- /dev/null
+++ b/gallery_dl/extractor/realbooru.py
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://realbooru.com/"""
+
+from . import booru
+from .. import text, util
+import collections
+import re
+
+BASE_PATTERN = r"(?:https?://)?realbooru\.com"
+
+
+class RealbooruExtractor(booru.BooruExtractor):
+    basecategory = "booru"
+    category = "realbooru"
+    root = "https://realbooru.com"
+
+    def _parse_post(self, post_id):
+        url = "{}/index.php?page=post&s=view&id={}".format(
+            self.root, post_id)
+        page = self.request(url).text
+        extr = text.extract_from(page)
+        rating = extr('name="rating" content="', '"')
+        extr('class="container"', '>')
+
+        post = {
+            "_html"     : page,
+            "id"        : post_id,
+            "rating"    : "e" if rating == "adult" else (rating or "?")[0],
+            "tags"      : text.unescape(extr(' alt="', '"')),
+            "file_url"  : extr('src="', '"'),
+            "created_at": extr(">Posted at ", " by "),
+            "uploader"  : extr(">", "<"),
+            "score"     : extr('">', "<"),
+            "title"     : extr('id="title" style="width: 100%;" value="', '"'),
+            "source"    : extr('d="source" style="width: 100%;" value="', '"'),
+        }
+
+        post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0]
+        return post
+
+    def skip(self, num):
+        self.page_start += num
+        return num
+
+    def _prepare(self, post):
+        post["date"] = text.parse_datetime(post["created_at"], "%b, %d %Y")
+
+    def _pagination(self, params, begin, end):
+        url = self.root + "/index.php"
+        params["pid"] = self.page_start
+
+        while True:
+            page = self.request(url, params=params).text
+
+            cnt = 0
+            for post_id in text.extract_iter(page, begin, end):
+                cnt += 1
+                yield self._parse_post(post_id)
+
+            if cnt < self.per_page:
+                return
+            params["pid"] += self.per_page
+
+    def _tags(self, post, _):
+        page = post["_html"]
+        tag_container = text.extr(page, 'id="tagLink"', '</div>')
+        tags = collections.defaultdict(list)
+        pattern = re.compile(
+            r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)')
+        for tag_type, tag_name in pattern.findall(tag_container):
+            tags[tag_type].append(text.unescape(text.unquote(tag_name)))
+        for key, value in tags.items():
+            post["tags_" + key] = " ".join(value)
+
+
+class RealbooruTagExtractor(RealbooruExtractor):
+    subcategory = "tag"
+    directory_fmt = ("{category}", "{search_tags}")
+    archive_fmt = "t_{search_tags}_{id}"
+    per_page = 42
+    pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]*)"
+    example = "https://realbooru.com/index.php?page=post&s=list&tags=TAG"
+
+    def metadata(self):
+        self.tags = text.unquote(self.groups[0].replace("+", " "))
+        return {"search_tags": self.tags}
+
+    def posts(self):
+        return self._pagination({
+            "page": "post",
+            "s"   : "list",
+            "tags": self.tags,
+        }, '<a id="p', '"')
+
+
+class RealbooruFavoriteExtractor(RealbooruExtractor):
+    subcategory = "favorite"
+    directory_fmt = ("{category}", "favorites", "{favorite_id}")
+    archive_fmt = "f_{favorite_id}_{id}"
+    per_page = 50
+    pattern = BASE_PATTERN + r"/index\.php\?page=favorites&s=view&id=(\d+)"
+    example = "https://realbooru.com/index.php?page=favorites&s=view&id=12345"
+
+    def metadata(self):
+        return {"favorite_id": text.parse_int(self.groups[0])}
+
+    def posts(self):
+        return self._pagination({
+            "page": "favorites",
+            "s"   : "view",
+            "id"  : self.groups[0],
+        }, '" id="p', '"')
+
+
+class RealbooruPoolExtractor(RealbooruExtractor):
+    subcategory = "pool"
+    directory_fmt = ("{category}", "pool", "{pool} {pool_name}")
+    archive_fmt = "p_{pool}_{id}"
+    pattern = BASE_PATTERN + r"/index\.php\?page=pool&s=show&id=(\d+)"
+    example = "https://realbooru.com/index.php?page=pool&s=show&id=12345"
+
+    def metadata(self):
+        pool_id = self.groups[0]
+        url = "{}/index.php?page=pool&s=show&id={}".format(self.root, pool_id)
+        page = self.request(url).text
+
+        name, pos = text.extract(page, "<h4>Pool: ", "</h4>")
+        self.post_ids = text.extract_iter(
+            page, 'class="thumb" id="p', '"', pos)
+
+        return {
+            "pool": text.parse_int(pool_id),
+            "pool_name": text.unescape(name),
+        }
+
+    def posts(self):
+        return map(
+            self._parse_post,
+            util.advance(self.post_ids, self.page_start)
+        )
+
+
+class RealbooruPostExtractor(RealbooruExtractor):
+    subcategory = "post"
+    archive_fmt = "{id}"
+    pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)"
+    example = "https://realbooru.com/index.php?page=post&s=view&id=12345"
+
+    def posts(self):
+        return (self._parse_post(self.groups[0]),)
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index f9b1a7f..4c4fb3a 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -145,6 +145,14 @@ class ZerochanTagExtractor(ZerochanExtractor):
             self.posts = self.posts_api
             self.session.headers["User-Agent"] = util.USERAGENT
 
+        exts = self.config("extensions")
+        if exts:
+            if isinstance(exts, str):
+                exts = exts.split(",")
+            self.exts = exts
+        else:
+            self.exts = ("jpg", "png", "webp", "gif")
+
     def metadata(self):
         return {"search_tags": text.unquote(
             self.search_tag.replace("+", " "))}
@@ -194,8 +202,6 @@ class ZerochanTagExtractor(ZerochanExtractor):
             "p"   : self.page_start,
         }
 
-        static = "https://static.zerochan.net/.full."
-
         while True:
             response = self.request(url, params=params, allow_redirects=False)
 
@@ -221,15 +227,20 @@ class ZerochanTagExtractor(ZerochanExtractor):
                     yield post
             else:
                 for post in posts:
-                    base = static + str(post["id"])
-                    post["file_url"] = base + ".jpg"
-                    post["_fallback"] = (base + ".png",)
+                    urls = self._urls(post)
+                    post["file_url"] = next(urls)
+                    post["_fallback"] = urls
                     yield post
 
             if not data.get("next"):
                 return
             params["p"] += 1
 
+    def _urls(self, post, static="https://static.zerochan.net/.full."):
+        base = static + str(post["id"]) + "."
+        for ext in self.exts:
+            yield base + ext
+
 
 class ZerochanImageExtractor(ZerochanExtractor):
     subcategory = "image"
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 2bf03f4..2dab0d6 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,5 +6,5 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.28.0"
+__version__ = "1.28.1"
 __variant__ = None
author	Unit 193 <unit193@unit193.net>	2024-12-08 20:34:33 -0500
committer	Unit 193 <unit193@unit193.net>	2024-12-08 20:34:33 -0500
commit	f6877087773089220d68288d055276fca6c556d4 (patch)
tree	e4847e3bcff284c3daece7b3b9cf308dfc2129ab /gallery_dl
parent	1981ccaaea6eab2cf32536ec5afe132a870914d8 (diff)