| | |
|---|---|
| author | 2024-09-07 18:33:19 -0400 |
| committer | 2024-09-07 18:33:19 -0400 |
| commit | 1f3ffe32342852fd9ea9e7704022488f3a1222bd (patch) |
| tree | cb255a091b73e96840de0f6f44b36dff1acab4b9 /gallery_dl/extractor |
| parent | b5e56c51e491b41f9eb6a895459c185788a377e5 (diff) |
New upstream version 1.27.4 (tag: upstream/1.27.4)
Diffstat (limited to 'gallery_dl/extractor')
24 files changed, 376 insertions, 171 deletions
diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py
index 2adb142..786acd9 100644
--- a/gallery_dl/extractor/batoto.py
+++ b/gallery_dl/extractor/batoto.py
@@ -51,28 +51,29 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
         if not manga:
             manga = extr('link-hover">', "<")
             info = text.remove_html(extr('link-hover">', "</"))
+        info = text.unescape(info)
 
         match = re.match(
-            r"(?:Volume\s+(\d+) )?"
-            r"\w+\s+(\d+)(.*)", info)
+            r"(?i)(?:(?:Volume|S(?:eason)?)\s*(\d+)\s+)?"
+            r"(?:Chapter|Episode)\s*(\d+)([\w.]*)", info)
         if match:
             volume, chapter, minor = match.groups()
-            title = text.remove_html(extr(
-                "selected>", "</option")).partition(" : ")[2]
         else:
             volume = chapter = 0
             minor = ""
-            title = info
 
         return {
-            "manga"        : text.unescape(manga),
-            "manga_id"     : text.parse_int(manga_id),
-            "title"        : text.unescape(title),
-            "volume"       : text.parse_int(volume),
-            "chapter"      : text.parse_int(chapter),
-            "chapter_minor": minor,
-            "chapter_id"   : text.parse_int(self.chapter_id),
-            "date"         : text.parse_timestamp(extr(' time="', '"')[:-3]),
+            "manga"         : text.unescape(manga),
+            "manga_id"      : text.parse_int(manga_id),
+            "chapter_url"   : extr(self.chapter_id + "-ch_", '"'),
+            "title"         : text.unescape(text.remove_html(extr(
+                "selected>", "</option")).partition(" : ")[2]),
+            "volume"        : text.parse_int(volume),
+            "chapter"       : text.parse_int(chapter),
+            "chapter_minor" : minor,
+            "chapter_string": info,
+            "chapter_id"    : text.parse_int(self.chapter_id),
+            "date"          : text.parse_timestamp(extr(' time="', '"')[:-3]),
         }
 
     def images(self, page):
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 240bbd3..780bdf1 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -6,15 +6,24 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extractors for https://bunkr.sk/"""
+"""Extractors for https://bunkr.si/"""
 
 from .lolisafe import LolisafeAlbumExtractor
-from .. import text
-
-BASE_PATTERN = (
-    r"(?:https?://)?(?:app\.)?(bunkr+"
-    r"\.(?:s[kiu]|[cf]i|ru|la|is|to|ac|black|cat|media|red|site|ws|org))"
-)
+from .. import text, config
+
+
+if config.get(("extractor", "bunkr"), "tlds"):
+    BASE_PATTERN = (
+        r"(?:bunkr:(?:https?://)?([^/?#]+)|"
+        r"(?:https?://)?(?:app\.)?(bunkr+\.\w+))"
+    )
+else:
+    BASE_PATTERN = (
+        r"(?:bunkr:(?:https?://)?([^/?#]+)|"
+        r"(?:https?://)?(?:app\.)?(bunkr+"
+        r"\.(?:s[kiu]|[cf]i|ru|la|is|to|a[cx]"
+        r"|black|cat|media|red|site|ws|org)))"
+    )
 
 LEGACY_DOMAINS = {
     "bunkr.ru",
@@ -28,15 +37,15 @@ LEGACY_DOMAINS = {
 
 
 class BunkrAlbumExtractor(LolisafeAlbumExtractor):
-    """Extractor for bunkr.sk albums"""
+    """Extractor for bunkr.si albums"""
     category = "bunkr"
-    root = "https://bunkr.sk"
+    root = "https://bunkr.si"
     pattern = BASE_PATTERN + r"/a/([^/?#]+)"
-    example = "https://bunkr.sk/a/ID"
+    example = "https://bunkr.si/a/ID"
 
     def __init__(self, match):
         LolisafeAlbumExtractor.__init__(self, match)
-        domain = match.group(match.lastindex-1)
+        domain = self.groups[0] or self.groups[1]
         if domain not in LEGACY_DOMAINS:
             self.root = "https://" + domain
@@ -69,11 +78,16 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
 
     def _extract_file(self, url):
         page = self.request(url).text
-        return (
-            text.extr(page, '<source src="', '"') or
-            text.extr(page, '<img src="', '"') or
-            text.rextract(page, ' href="', '"', page.rindex("Download"))[0]
-        )
+        url = (text.extr(page, '<source src="', '"') or
+               text.extr(page, '<img src="', '"'))
+
+        if not url:
+            url_download = text.rextract(
+                page, ' href="', '"', page.rindex("Download"))[0]
+            page = self.request(text.unescape(url_download)).text
+            url = text.unescape(text.rextract(page, ' href="', '"')[0])
+
+        return url
 
     def _validate(self, response):
         if response.history and response.url.endswith("/maintenance-vid.mp4"):
@@ -83,11 +97,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
 
 
 class BunkrMediaExtractor(BunkrAlbumExtractor):
-    """Extractor for bunkr.sk media links"""
+    """Extractor for bunkr.si media links"""
     subcategory = "media"
     directory_fmt = ("{category}",)
     pattern = BASE_PATTERN + r"(/[vid]/[^/?#]+)"
-    example = "https://bunkr.sk/v/FILENAME"
+    example = "https://bunkr.si/v/FILENAME"
 
     def fetch_album(self, album_id):
         try:
diff --git a/gallery_dl/extractor/cyberdrop.py b/gallery_dl/extractor/cyberdrop.py
index d864960..a514696 100644
--- a/gallery_dl/extractor/cyberdrop.py
+++ b/gallery_dl/extractor/cyberdrop.py
@@ -14,6 +14,7 @@ from .. import text
 
 class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
     category = "cyberdrop"
     root = "https://cyberdrop.me"
+    root_api = "https://api.cyberdrop.me"
     pattern = r"(?:https?://)?(?:www\.)?cyberdrop\.(?:me|to)/a/([^/?#]+)"
     example = "https://cyberdrop.me/a/ID"
 
@@ -55,5 +56,14 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
 
     def _extract_files(self, file_ids):
         for file_id in file_ids:
-            url = "{}/api/f/{}".format(self.root, file_id)
-            yield self.request(url).json()
+            try:
+                url = "{}/api/file/info/{}".format(self.root_api, file_id)
+                file = self.request(url).json()
+                auth = self.request(file["auth_url"]).json()
+                file["url"] = auth["url"]
+            except Exception as exc:
+                self.log.warning("%s (%s: %s)",
+                                 file_id, exc.__class__.__name__, exc)
+                continue
+
+            yield file
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index f3ea4e7..ea70b58 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -69,11 +69,12 @@ class DeviantartExtractor(Extractor):
             self.quality = ",q_{}".format(self.quality)
             self.quality_sub = re.compile(r",q_\d+").sub
 
-        if self.original != "image":
-            self._update_content = self._update_content_default
-        else:
-            self._update_content = self._update_content_image
+        if isinstance(self.original, str) and \
+                self.original.lower().startswith("image"):
             self.original = True
+            self._update_content = self._update_content_image
+        else:
+            self._update_content = self._update_content_default
 
         journals = self.config("journals", "html")
         if journals == "html":
@@ -1462,6 +1463,8 @@ class DeviantartOAuthAPI():
                 return
 
             if "next_cursor" in data:
+                if not data["next_cursor"]:
+                    return
                 params["offset"] = None
                 params["cursor"] = data["next_cursor"]
             elif data["next_offset"] is not None:
diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py
index af963bc..553ec22 100644
--- a/gallery_dl/extractor/e621.py
+++ b/gallery_dl/extractor/e621.py
@@ -10,6 +10,7 @@
 
 from .common import Message
 from . import danbooru
+from ..cache import memcache
 from .. import text, util
 
 
@@ -44,16 +45,11 @@ class E621Extractor(danbooru.DanbooruExtractor):
                     self.root[8:], md5[0:2], md5[2:4], md5, file["ext"])
 
             if notes and post.get("has_notes"):
-                url = "{}/notes.json?search[post_id]={}".format(
-                    self.root, post["id"])
-                post["notes"] = self.request(url).json()
+                post["notes"] = self._get_notes(post["id"])
 
             if pools and post["pools"]:
-                url = "{}/pools.json?search[id]={}".format(
-                    self.root, ",".join(map(str, post["pools"])))
-                post["pools"] = _pools = self.request(url).json()
-                for pool in _pools:
-                    pool["name"] = pool["name"].replace("_", " ")
+                post["pools"] = self._get_pools(
+                    ",".join(map(str, post["pools"])))
 
             post["filename"] = file["md5"]
             post["extension"] = file["ext"]
@@ -64,6 +60,18 @@ class E621Extractor(danbooru.DanbooruExtractor):
             yield Message.Directory, post
             yield Message.Url, file["url"], post
 
+    def _get_notes(self, id):
+        return self.request(
+            "{}/notes.json?search[post_id]={}".format(self.root, id)).json()
+
+    @memcache(keyarg=1)
+    def _get_pools(self, ids):
+        pools = self.request(
+            "{}/pools.json?search[id]={}".format(self.root, ids)).json()
+        for pool in pools:
+            pool["name"] = pool["name"].replace("_", " ")
+        return pools
+
 
 BASE_PATTERN = E621Extractor.update({
     "e621": {
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 1b4f995..01af7a4 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -430,7 +430,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
         }
 
         page = self.request(url, cookies=cookies).text
-        current = text.extr(page, "<strong>", "</strong>")
+        current = text.extr(page, "<strong>", "</strong>").replace(",", "")
         self.log.debug("Image Limits: %s/%s", current, self.limits)
         self._remaining = self.limits - text.parse_int(current)
diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py
index c94a110..1b4971c 100644
--- a/gallery_dl/extractor/flickr.py
+++ b/gallery_dl/extractor/flickr.py
@@ -75,11 +75,8 @@ class FlickrImageExtractor(FlickrExtractor):
     def items(self):
         photo = self.api.photos_getInfo(self.item_id)
 
-        if self.api.exif:
-            photo.update(self.api.photos_getExif(self.item_id))
-        if self.api.contexts:
-            photo.update(self.api.photos_getAllContexts(self.item_id))
+        self.api._extract_metadata(photo)
 
         if photo["media"] == "video" and self.api.videos:
             self.api._extract_video(photo)
         else:
@@ -135,8 +132,13 @@ class FlickrAlbumExtractor(FlickrExtractor):
 
     def metadata(self):
         data = FlickrExtractor.metadata(self)
-        data["album"] = self.api.photosets_getInfo(
-            self.album_id, self.user["nsid"])
+        try:
+            data["album"] = self.api.photosets_getInfo(
+                self.album_id, self.user["nsid"])
+        except Exception:
+            data["album"] = {}
+            self.log.warning("%s: Unable to retrieve album metadata",
+                             self.album_id)
         return data
 
     def photos(self):
@@ -407,6 +409,8 @@ class FlickrAPI(oauth.OAuth1API):
             self.log.debug("Server response: %s", data)
             if data["code"] == 1:
                 raise exception.NotFoundError(self.extractor.subcategory)
+            elif data["code"] == 2:
+                raise exception.AuthorizationError(msg)
             elif data["code"] == 98:
                 raise exception.AuthenticationError(msg)
             elif data["code"] == 99:
@@ -453,10 +457,7 @@ class FlickrAPI(oauth.OAuth1API):
             photo["date"] = text.parse_timestamp(photo["dateupload"])
             photo["tags"] = photo["tags"].split()
 
-            if self.exif:
-                photo.update(self.photos_getExif(photo["id"]))
-            if self.contexts:
-                photo.update(self.photos_getAllContexts(photo["id"]))
+            self._extract_metadata(photo)
 
             photo["id"] = text.parse_int(photo["id"])
             if "owner" in photo:
@@ -512,6 +513,23 @@ class FlickrAPI(oauth.OAuth1API):
             photo["width"] = photo["height"] = 0
         return photo
 
+    def _extract_metadata(self, photo):
+        if self.exif:
+            try:
+                photo.update(self.photos_getExif(photo["id"]))
+            except Exception as exc:
+                self.log.warning(
+                    "Unable to retrieve 'exif' data for %s (%s: %s)",
+                    photo["id"], exc.__class__.__name__, exc)
+
+        if self.contexts:
+            try:
+                photo.update(self.photos_getAllContexts(photo["id"]))
+            except Exception as exc:
+                self.log.warning(
+                    "Unable to retrieve 'contexts' data for %s (%s: %s)",
+                    photo["id"], exc.__class__.__name__, exc)
+
     @staticmethod
     def _clean_info(info):
         info["title"] = info["title"]["_content"]
diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py
index 3055426..d253582 100644
--- a/gallery_dl/extractor/furaffinity.py
+++ b/gallery_dl/extractor/furaffinity.py
@@ -179,6 +179,11 @@ class FuraffinityExtractor(Extractor):
                     break
                 self._favorite_id = text.parse_int(extr('data-fav-id="', '"'))
                 yield post_id
+
+            pos = page.find('type="submit">Next</button>')
+            if pos >= 0:
+                path = text.rextract(page, '<form action="', '"', pos)[0]
+                continue
             path = text.extr(page, 'right" href="', '"')
 
     def _pagination_search(self, query):
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py
index 16d4340..a6c1d5a 100644
--- a/gallery_dl/extractor/generic.py
+++ b/gallery_dl/extractor/generic.py
@@ -15,7 +15,7 @@ import re
 class GenericExtractor(Extractor):
     """Extractor for images in a generic web page."""
     category = "generic"
-    directory_fmt = ("{category}", "{pageurl}")
+    directory_fmt = ("{category}", "{subcategory}", "{path}")
     archive_fmt = "{imageurl}"
 
     # By default, the generic extractor is disabled
@@ -52,7 +52,10 @@ class GenericExtractor(Extractor):
             self.scheme = match.group('scheme')
         else:
             self.scheme = 'https://'
-            self.url = self.scheme + self.url
+        self.url = text.ensure_http_scheme(self.url, self.scheme)
+
+        self.subcategory = match.group('domain')
+        self.path = match.group('path')
 
         # Used to resolve relative image urls
         self.root = self.scheme + match.group('domain')
@@ -87,6 +90,7 @@ class GenericExtractor(Extractor):
     def metadata(self, page):
         """Extract generic webpage metadata, return them in a dict."""
         data = {}
+        data['path'] = self.path.replace("/", "")
        data['pageurl'] = self.url
         data['title'] = text.extr(page, '<title>', "</title>")
         data['description'] = text.extr(
diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py
index f0eb4e9..52b4ae6 100644
--- a/gallery_dl/extractor/gofile.py
+++ b/gallery_dl/extractor/gofile.py
@@ -47,8 +47,7 @@ class GofileFolderExtractor(Extractor):
             raise exception.AuthorizationError("Password required")
 
         num = 0
-        for content_id in folder["childrenIds"]:
-            content = contents[content_id]
+        for content in contents.values():
             content["folder"] = folder
 
             if content["type"] == "file":
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index 9b74700..18df9df 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -89,6 +89,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
                 path = ext = "webp"
             ihash = image["hash"]
             idata = text.nameext_from_url(image["name"])
+            idata["extension_original"] = idata["extension"]
             if ext:
                 idata["extension"] = ext
 
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index c05fe72..422c865 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -12,6 +12,7 @@
 from .common import Extractor, Message
 from .. import text, util, exception
 from ..cache import cache, memcache
+import itertools
 import binascii
 import json
 import re
@@ -57,12 +58,17 @@ class InstagramExtractor(Extractor):
         data = self.metadata()
         videos = self.config("videos", True)
         previews = self.config("previews", False)
+        max_posts = self.config("max-posts")
         video_headers = {"User-Agent": "Mozilla/5.0"}
         order = self.config("order-files")
         reverse = order[0] in ("r", "d") if order else False
 
-        for post in self.posts():
+        posts = self.posts()
+        if max_posts:
+            posts = itertools.islice(posts, max_posts)
+
+        for post in posts:
 
             if "__typename" in post:
                 post = self._parse_post_graphql(post)
@@ -159,15 +165,19 @@ class InstagramExtractor(Extractor):
             if "title" in post:
                 data["highlight_title"] = post["title"]
             if "created_at" in post:
-                data["date"] = text.parse_timestamp(post.get("created_at"))
+                data["post_date"] = data["date"] = text.parse_timestamp(
+                    post.get("created_at"))
 
         else:  # regular image/video post
+            date = text.parse_timestamp(post.get("taken_at"))
             data = {
                 "post_id" : post["pk"],
                 "post_shortcode": post["code"],
+                "post_url": "{}/p/{}/".format(self.root, post["code"]),
+                "post_date": date,
+                "date": date,
                 "likes": post.get("like_count", 0),
                 "pinned": post.get("timeline_pinned_user_ids", ()),
-                "date": text.parse_timestamp(post.get("taken_at")),
                 "liked": post.get("has_liked", False),
             }
 
@@ -206,7 +216,6 @@ class InstagramExtractor(Extractor):
         data["owner_id"] = owner["pk"]
         data["username"] = owner.get("username")
         data["fullname"] = owner.get("full_name")
-        data["post_url"] = "{}/p/{}/".format(self.root, data["post_shortcode"])
         data["_files"] = files = []
 
         for num, item in enumerate(items, 1):
@@ -269,7 +278,6 @@ class InstagramExtractor(Extractor):
         owner = post["owner"]
         data = {
             "typename" : typename,
-            "date" : text.parse_timestamp(post["taken_at_timestamp"]),
             "likes" : post["edge_media_preview_like"]["count"],
             "liked" : post.get("viewer_has_liked", False),
             "pinned" : pinned,
@@ -279,11 +287,13 @@ class InstagramExtractor(Extractor):
             "post_id" : post["id"],
             "post_shortcode": post["shortcode"],
             "post_url" : "{}/p/{}/".format(self.root, post["shortcode"]),
+            "post_date" : text.parse_timestamp(post["taken_at_timestamp"]),
             "description": text.parse_unicode_escapes("\n".join(
                 edge["node"]["text"]
                 for edge in post["edge_media_to_caption"]["edges"]
             )),
         }
+        data["date"] = data["post_date"]
 
         tags = self._find_tags(data["description"])
         if tags:
@@ -313,6 +323,7 @@ class InstagramExtractor(Extractor):
             media = {
                 "num": num,
                 "media_id" : node["id"],
+                "date" : data["date"],
                 "shortcode" : (node.get("shortcode") or
                                shortcode_from_id(node["id"])),
                 "display_url": node["display_url"],
@@ -328,6 +339,7 @@ class InstagramExtractor(Extractor):
             dimensions = post["dimensions"]
             media = {
                 "media_id" : post["id"],
+                "date" : data["date"],
                 "shortcode" : post["shortcode"],
                 "display_url": post["display_url"],
                 "video_url" : post.get("video_url"),
@@ -378,7 +390,11 @@ class InstagramExtractor(Extractor):
             "full_name": user["full_name"]})
 
     def _init_cursor(self):
-        return self.config("cursor") or None
+        cursor = self.config("cursor", True)
+        if not cursor:
+            self._update_cursor = util.identity
+        elif isinstance(cursor, str):
+            return cursor
 
     def _update_cursor(self, cursor):
         self.log.debug("Cursor: %s", cursor)
@@ -418,6 +434,7 @@ class InstagramUserExtractor(InstagramExtractor):
         base = "{}/{}/".format(self.root, self.item)
         stories = "{}/stories/{}/".format(self.root, self.item)
         return self._dispatch_extractors((
+            (InstagramInfoExtractor      , base + "info/"),
             (InstagramAvatarExtractor    , base + "avatar/"),
             (InstagramStoriesExtractor   , stories),
             (InstagramHighlightsExtractor, base + "highlights/"),
diff --git a/gallery_dl/extractor/koharu.py b/gallery_dl/extractor/koharu.py
index 979b1a2..cacf504 100644
--- a/gallery_dl/extractor/koharu.py
+++ b/gallery_dl/extractor/koharu.py
@@ -161,16 +161,29 @@ class KoharuGalleryExtractor(KoharuExtractor, GalleryExtractor):
         return results
 
     def _select_format(self, formats):
-        if not self.fmt or self.fmt == "original":
-            fmtid = "0"
+        fmt = self.fmt
+
+        if not fmt or fmt == "best":
+            fmtids = ("0", "1600", "1280", "980", "780")
+        elif isinstance(fmt, str):
+            fmtids = fmt.split(",")
+        elif isinstance(fmt, list):
+            fmtids = fmt
         else:
-            fmtid = str(self.fmt)
+            fmtids = (str(self.fmt),)
 
-        try:
-            fmt = formats[fmtid]
-        except KeyError:
+        for fmtid in fmtids:
+            try:
+                fmt = formats[fmtid]
+                if fmt["id"]:
+                    break
+            except KeyError:
+                self.log.debug("%s: Format %s is not available",
+                               self.groups[0], fmtid)
+        else:
             raise exception.NotFoundError("format")
 
+        self.log.debug("%s: Selected format %s", self.groups[0], fmtid)
         fmt["w"] = fmtid
         return fmt
diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py
index 3d7d685..117b88b 100644
--- a/gallery_dl/extractor/lolisafe.py
+++ b/gallery_dl/extractor/lolisafe.py
@@ -34,7 +34,7 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
 
     def __init__(self, match):
         LolisafeExtractor.__init__(self, match)
-        self.album_id = match.group(match.lastindex)
+        self.album_id = self.groups[-1]
 
     def _init(self):
         domain = self.config("domain")
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index ecd6619..5fc0ce5 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -171,15 +171,17 @@ class NewgroundsExtractor(Extractor):
         if self.flash:
             url += "/format/flash"
 
-        with self.request(url, fatal=False) as response:
-            if response.status_code >= 400:
-                return {}
-            page = response.text
+        response = self.request(url, fatal=False)
+        page = response.text
 
         pos = page.find('id="adults_only"')
         if pos >= 0:
             msg = text.extract(page, 'class="highlight">', '<', pos)[0]
             self.log.warning('"%s"', msg)
+            return {}
+
+        if response.status_code >= 400:
+            return {}
 
         extr = text.extract_from(page)
         data = extract_data(extr, post_url)
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index d732894..3479b88 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -94,12 +94,39 @@ class PixivExtractor(Extractor):
                         work.get("id"), exc.message)
                     continue
 
-                url = ugoira["zip_urls"]["medium"].replace(
-                    "_ugoira600x600", "_ugoira1920x1080")
-                work["frames"] = ugoira["frames"]
+                url = ugoira["zip_urls"]["medium"]
+                work["frames"] = frames = ugoira["frames"]
                 work["date_url"] = self._date_from_url(url)
                 work["_http_adjust_extension"] = False
-                yield Message.Url, url, text.nameext_from_url(url, work)
+
+                if self.load_ugoira == "original":
+                    base, sep, _ = url.rpartition("_ugoira")
+                    base = base.replace(
+                        "/img-zip-ugoira/", "/img-original/", 1) + sep
+
+                    for ext in ("jpg", "png", "gif"):
+                        try:
+                            url = ("{}0.{}".format(base, ext))
+                            self.request(url, method="HEAD")
+                            break
+                        except exception.HttpError:
+                            pass
+                    else:
+                        self.log.warning(
+                            "Unable to find Ugoira frame URLs (%s)",
+                            work.get("id"))
+                        continue
+
+                    for num, frame in enumerate(frames):
+                        url = ("{}{}.{}".format(base, num, ext))
+                        work["num"] = work["_ugoira_frame_index"] = num
+                        work["suffix"] = "_p{:02}".format(num)
+                        text.nameext_from_url(url, work)
+                        yield Message.Url, url, work
+
+                else:
+                    url = url.replace("_ugoira600x600", "_ugoira1920x1080")
+                    yield Message.Url, url, text.nameext_from_url(url, work)
 
             elif work["page_count"] == 1:
                 url = meta_single_page["original_image_url"]
@@ -551,9 +578,6 @@ class PixivSeriesExtractor(PixivExtractor):
     directory_fmt = ("{category}", "{user[id]} {user[account]}",
                      "{series[id]} {series[title]}")
     filename_fmt = "{num_series:>03}_{id}_p{num}.{extension}"
-    cookies_domain = ".pixiv.net"
-    browser = "firefox"
-    tls12 = False
     pattern = BASE_PATTERN + r"/user/(\d+)/series/(\d+)"
     example = "https://www.pixiv.net/user/12345/series/12345"
 
@@ -562,34 +586,18 @@ class PixivSeriesExtractor(PixivExtractor):
         self.user_id, self.series_id = match.groups()
 
     def works(self):
-        url = self.root + "/ajax/series/" + self.series_id
-        params = {"p": 1}
-        headers = {
-            "Accept": "application/json",
-            "Referer": "{}/user/{}/series/{}".format(
-                self.root, self.user_id, self.series_id),
-            "Alt-Used": "www.pixiv.net",
-        }
+        series = None
 
-        while True:
-            data = self.request(url, params=params, headers=headers).json()
-            body = data["body"]
-            page = body["page"]
-
-            series = body["extraData"]["meta"]
-            series["id"] = self.series_id
-            series["total"] = page["total"]
-            series["title"] = text.extr(series["title"], '"', '"')
-
-            for info in page["series"]:
-                work = self.api.illust_detail(info["workId"])
-                work["num_series"] = info["order"]
-                work["series"] = series
-                yield work
-
-            if len(page["series"]) < 10:
-                return
-            params["p"] += 1
+        for work in self.api.illust_series(self.series_id):
+            if series is None:
+                series = self.api.data
+                series["total"] = num_series = series.pop("series_work_count")
+            else:
+                num_series -= 1
+
+            work["num_series"] = num_series
+            work["series"] = series
+            yield work
 
 
 class PixivNovelExtractor(PixivExtractor):
@@ -916,6 +924,11 @@ class PixivAppAPI():
         params = {"illust_id": illust_id}
         return self._pagination("/v2/illust/related", params)
 
+    def illust_series(self, series_id, offset=0):
+        params = {"illust_series_id": series_id, "offset": offset}
+        return self._pagination("/v1/illust/series", params,
+                                key_data="illust_series_detail")
+
     def novel_bookmark_detail(self, novel_id):
         params = {"novel_id": novel_id}
         return self._call(
@@ -1013,10 +1026,15 @@ class PixivAppAPI():
             raise exception.StopExtraction(
                 "API request failed: %s", error)
 
-    def _pagination(self, endpoint, params, key="illusts"):
+    def _pagination(self, endpoint, params,
+                    key_items="illusts", key_data=None):
         while True:
             data = self._call(endpoint, params)
-            yield from data[key]
+
+            if key_data:
+                self.data = data.get(key_data)
+                key_data = None
+            yield from data[key_items]
 
             if not data["next_url"]:
                 return
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index ad3efa7..7db8172 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -66,7 +66,8 @@ class SankakuExtractor(BooruExtractor):
     def _prepare(self, post):
         post["created_at"] = post["created_at"]["s"]
         post["date"] = text.parse_timestamp(post["created_at"])
-        post["tags"] = [tag["name"] for tag in post["tags"] if tag["name"]]
+        post["tags"] = [tag["name"].lower().replace(" ", "_")
+                        for tag in post["tags"] if tag["name"]]
         post["tag_string"] = " ".join(post["tags"])
         post["_http_validate"] = self._check_expired
 
@@ -79,7 +80,7 @@ class SankakuExtractor(BooruExtractor):
         for tag in post["tags"]:
             name = tag["name"]
             if name:
-                tags[types[tag["type"]]].append(name)
+                tags[types[tag["type"]]].append(name.lower().replace(" ", "_"))
         for key, value in tags.items():
             post["tags_" + key] = value
             post["tag_string_" + key] = " ".join(value)
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
index 80f2aea..7708b5c 100644
--- a/gallery_dl/extractor/sexcom.py
+++ b/gallery_dl/extractor/sexcom.py
@@ -152,6 +152,25 @@ class SexcomPinsExtractor(SexcomExtractor):
         return self._pagination(url)
 
 
+class SexcomLikesExtractor(SexcomExtractor):
+    """Extractor for a user's liked pins on www.sex.com"""
+    subcategory = "likes"
+    directory_fmt = ("{category}", "{user}", "Likes")
+    pattern = r"(?:https?://)?(?:www\.)?sex\.com/user/([^/?#]+)/likes/"
+    example = "https://www.sex.com/user/USER/likes/"
+
+    def __init__(self, match):
+        SexcomExtractor.__init__(self, match)
+        self.user = match.group(1)
+
+    def metadata(self):
+        return {"user": text.unquote(self.user)}
+
+    def pins(self):
+        url = "{}/user/{}/likes/".format(self.root, self.user)
+        return self._pagination(url)
+
+
 class SexcomBoardExtractor(SexcomExtractor):
     """Extractor for pins from a board on www.sex.com"""
     subcategory = "board"
diff --git a/gallery_dl/extractor/szurubooru.py b/gallery_dl/extractor/szurubooru.py
index bba1ece..b6917cc 100644
--- a/gallery_dl/extractor/szurubooru.py
+++ b/gallery_dl/extractor/szurubooru.py
@@ -86,6 +86,7 @@ BASE_PATTERN = SzurubooruExtractor.update({
     "bcbnsfw": {
         "root": "https://booru.bcbnsfw.space",
         "pattern": r"booru\.bcbnsfw\.space",
+        "query-all": "*",
     },
     "snootbooru": {
         "root": "https://snootbooru.com",
@@ -110,7 +111,12 @@ class SzurubooruTagExtractor(SzurubooruExtractor):
         return {"search_tags": self.query}
 
     def posts(self):
-        return self._pagination("/posts/", {"query": self.query})
+        if self.query.strip():
+            query = self.query
+        else:
+            query = self.config_instance("query-all")
+
+        return self._pagination("/posts/", {"query": query})
 
 
 class SzurubooruPostExtractor(SzurubooruExtractor):
diff --git a/gallery_dl/extractor/toyhouse.py b/gallery_dl/extractor/toyhouse.py
index 64fa951..44d87ee 100644
--- a/gallery_dl/extractor/toyhouse.py
+++ b/gallery_dl/extractor/toyhouse.py
@@ -123,4 +123,5 @@ class ToyhouseImageExtractor(ToyhouseExtractor):
 
     def posts(self):
         url = "{}/~images/{}".format(self.root, self.user)
-        return (self._parse_post(self.request(url).text, '<img src="'),)
+        return (self._parse_post(
+            self.request(url).text, '<img class="mw-100" src="'),)
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index ff29c04..73455d2 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -400,6 +400,9 @@ class TumblrAPI(oauth.OAuth1API):
         """Retrieve liked posts"""
         endpoint = "/v2/blog/{}/likes".format(blog)
         params = {"limit": "50", "before": self.before}
+        if self.api_key:
+            params["api_key"] = self.api_key
+
         while True:
             posts = self._call(endpoint, params)["liked_posts"]
             if not posts:
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index ea57d76..d4ec343 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -155,6 +155,7 @@ class TwitterExtractor(Extractor):
                 if not self.unavailable:
                     continue
 
+            mtype = media.get("type")
             descr = media.get("ext_alt_text")
             width = media["original_info"].get("width", 0)
             height = media["original_info"].get("height", 0)
@@ -164,6 +165,7 @@ class TwitterExtractor(Extractor):
                 files.append({
                     "url": "ytdl:{}/i/web/status/{}".format(
                         self.root, tweet["id_str"]),
+                    "type" : mtype,
                     "width" : width,
                     "height" : height,
                     "extension" : None,
@@ -177,6 +179,7 @@ class TwitterExtractor(Extractor):
                 )
                 files.append({
                     "url" : variant["url"],
+                    "type" : mtype,
                     "width" : width,
                     "height" : height,
                     "bitrate" : variant.get("bitrate", 0),
@@ -193,6 +196,7 @@ class TwitterExtractor(Extractor):
             base = url.rpartition("=")[0] + "="
             files.append(text.nameext_from_url(url, {
                 "url" : base + self._size_image,
+                "type" : mtype,
                 "width" : width,
                 "height" : height,
                 "_fallback" : self._image_fallback(base),
@@ -504,7 +508,11 @@ class TwitterExtractor(Extractor):
         }
 
     def _init_cursor(self):
-        return self.config("cursor") or None
+        cursor = self.config("cursor", True)
+        if not cursor:
+            self._update_cursor = util.identity
+        elif isinstance(cursor, str):
+            return cursor
 
     def _update_cursor(self, cursor):
         self.log.debug("Cursor: %s", cursor)
@@ -560,6 +568,7 @@ class TwitterUserExtractor(TwitterExtractor):
     def items(self):
         base = "{}/{}/".format(self.root, self.user)
         return self._dispatch_extractors((
+            (TwitterInfoExtractor      , base + "info"),
             (TwitterAvatarExtractor    , base + "photo"),
             (TwitterBackgroundExtractor, base + "header_photo"),
             (TwitterTimelineExtractor  , base + "timeline"),
@@ -590,9 +599,16 @@ class TwitterTimelineExtractor(TwitterExtractor):
         return cursor
 
     def tweets(self):
-        self._cursor = cursor = self.config("cursor") or None
         reset = False
 
+        cursor = self.config("cursor", True)
+        if not cursor:
+            self._update_cursor = util.identity
+        elif isinstance(cursor, str):
+            self._cursor = cursor
+        else:
+            cursor = None
+
         if cursor:
             state = cursor.partition("/")[0]
             state, _, tweet_id = state.partition("_")
@@ -1612,6 +1628,9 @@ class TwitterAPI():
                 entries = instr["entries"]
             elif instr_type == "TimelineAddToModule":
                 entries = instr["moduleItems"]
+            elif instr_type == "TimelinePinEntry":
+                if pinned_tweet:
+                    pinned_tweet = instr["entry"]
             elif instr_type == "TimelineReplaceEntry":
                 entry = instr["entry"]
                 if entry["entryId"].startswith("cursor-bottom-"):
@@ -1650,9 +1669,11 @@ class TwitterAPI():
                 tweet = None
 
         if pinned_tweet:
-            pinned_tweet = False
-            if instructions[-1]["type"] == "TimelinePinEntry":
+            if isinstance(pinned_tweet, dict):
+                tweets.append(pinned_tweet)
+            elif instructions[-1]["type"] == "TimelinePinEntry":
                 tweets.append(instructions[-1]["entry"])
+            pinned_tweet = False
 
         for entry in entries:
             esw = entry["entryId"].startswith
diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py
index 9370cfb..7a62e01 100644
--- a/gallery_dl/extractor/wikimedia.py
+++ b/gallery_dl/extractor/wikimedia.py
@@ -17,13 +17,11 @@ class WikimediaExtractor(BaseExtractor):
     """Base class for wikimedia extractors"""
     basecategory = "wikimedia"
     filename_fmt = "{filename} ({sha1[:8]}).{extension}"
-    directory_fmt = ("{category}", "{page}")
     archive_fmt = "{sha1}"
     request_interval = (1.0, 2.0)
 
     def __init__(self, match):
         BaseExtractor.__init__(self, match)
-        path = match.group(match.lastindex)
 
         if self.category == "wikimedia":
             self.category = self.root.split(".")[-2]
@@ -31,31 +29,7 @@ class WikimediaExtractor(BaseExtractor):
             self.category = "{}-{}".format(
                 self.category, self.root.partition(".")[0].rpartition("/")[2])
 
-        if path.startswith("wiki/"):
-            path = path[5:]
-
-        pre, sep, _ = path.partition(":")
-        prefix = pre.lower() if sep else None
-
-        self.title = path = text.unquote(path)
-        if prefix:
-            self.subcategory = prefix
-
-        if prefix == "category":
-            self.params = {
-                "generator": "categorymembers",
-                "gcmtitle" : path,
-                "gcmtype"  : "file",
-            }
-        elif prefix == "file":
-            self.params = {
-                "titles"   : path,
-            }
-        else:
-            self.params = {
-                "generator": "images",
-                "titles"   : path,
-            }
+        self.per_page = self.config("limit", 50)
 
     def _init(self):
         api_path = self.config_instance("api-path")
@@ -67,6 +41,22 @@ class WikimediaExtractor(BaseExtractor):
         else:
             self.api_url = self.root + "/api.php"
 
+    @staticmethod
+    def prepare(image):
+        """Adjust the content of a image object"""
+        image["metadata"] = {
+            m["name"]: m["value"]
+            for m in image["metadata"] or ()}
+        image["commonmetadata"] = {
+            m["name"]: m["value"]
+            for m in image["commonmetadata"] or ()}
+
+        filename = image["canonicaltitle"]
+        image["filename"], _, image["extension"] = \
+            filename.partition(":")[2].rpartition(".")
+        image["date"] = text.parse_datetime(
+            image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
+
     def items(self):
         for info in self._pagination(self.params):
             try:
@@ -75,20 +65,7 @@ class WikimediaExtractor(BaseExtractor):
                 self.log.debug("Missing 'imageinfo' for %s", info)
                 continue
 
-            image["metadata"] = {
-                m["name"]: m["value"]
-                for m in image["metadata"] or ()}
-            image["commonmetadata"] = {
-                m["name"]: m["value"]
-                for m in image["commonmetadata"] or ()}
-
-            filename = image["canonicaltitle"]
-            image["filename"], _, image["extension"] = \
-                filename.partition(":")[2].rpartition(".")
-            image["date"] = text.parse_datetime(
-                image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
-            image["page"] = self.title
-
+            self.prepare(image)
             yield Message.Directory, image
             yield Message.Url, image["url"], image
 
@@ -110,6 +87,17 @@ class WikimediaExtractor(BaseExtractor):
         while True:
             data = self.request(url, params=params).json()
 
+            # ref: https://www.mediawiki.org/wiki/API:Errors_and_warnings
+            error = data.get("error")
+            if error:
+                self.log.error("%s: %s", error["code"], error["info"])
+                return
+            # MediaWiki will emit warnings for non-fatal mistakes such as
+            # invalid parameter instead of raising an error
+            warnings = data.get("warnings")
+            if warnings:
+                self.log.debug("MediaWiki returned warnings: %s", warnings)
+
             try:
                 pages = data["query"]["pages"]
             except KeyError:
@@ -181,5 +169,59 @@ BASE_PATTERN = WikimediaExtractor.update({
 class WikimediaArticleExtractor(WikimediaExtractor):
     """Extractor for wikimedia articles"""
     subcategory = "article"
+    directory_fmt = ("{category}", "{page}")
     pattern = BASE_PATTERN + r"/(?!static/)([^?#]+)"
     example = "https://en.wikipedia.org/wiki/TITLE"
+
+    def __init__(self, match):
+        WikimediaExtractor.__init__(self, match)
+
+        path = match.group(match.lastindex)
+        if path.startswith("wiki/"):
+            path = path[5:]
+
+        pre, sep, _ = path.partition(":")
+        prefix = pre.lower() if sep else None
+
+        self.title = path = text.unquote(path)
+        if prefix:
+            self.subcategory = prefix
+
+        if prefix == "category":
+            self.params = {
+                "generator": "categorymembers",
+                "gcmtitle" : path,
+                "gcmtype"  : "file",
+                "gcmlimit" : self.per_page,
+            }
+        elif prefix == "file":
+            self.params = {
+                "titles"   : path,
+            }
+        else:
+            self.params = {
+                "generator": "images",
+                "gimlimit" : self.per_page,
+                "titles"   : path,
+            }
+
+    def prepare(self, image):
+        WikimediaExtractor.prepare(image)
+        image["page"] = self.title
+
+
+class WikimediaWikiExtractor(WikimediaExtractor):
+    """Extractor for all files on a MediaWiki instance"""
+    subcategory = "wiki"
+    pattern = BASE_PATTERN + r"/?$"
+    example = "https://en.wikipedia.org/"
+
+    def __init__(self, match):
+        WikimediaExtractor.__init__(self, match)
+
+        # ref: https://www.mediawiki.org/wiki/API:Allpages
+        self.params = {
+            "generator"   : "allpages",
+            "gapnamespace": 6,  # "File" namespace
+            "gaplimit"    : self.per_page,
+        }
diff --git a/gallery_dl/extractor/ytdl.py b/gallery_dl/extractor/ytdl.py
index cb3c74c..168845e 100644
--- a/gallery_dl/extractor/ytdl.py
+++ b/gallery_dl/extractor/ytdl.py
@@ -116,21 +116,20 @@ class YoutubeDLExtractor(Extractor):
         for entry in entries:
             if not entry:
                 continue
-            elif entry.get("_type") in ("url", "url_transparent"):
+
+            if entry.get("_type") in ("url", "url_transparent"):
                 try:
-                    info_dict = ytdl_instance.extract_info(
+                    entry = ytdl_instance.extract_info(
                         entry["url"], False, ie_key=entry.get("ie_key"))
                 except ytdl_module.utils.YoutubeDLError:
                     continue
-
-                if not info_dict:
+                if not entry:
                     continue
-                elif "entries" in info_dict:
-                    yield from self._process_entries(
-                        ytdl_module, ytdl_instance, info_dict["entries"])
-                else:
-                    yield info_dict
+
+            if "entries" in entry:
+                yield from self._process_entries(
+                    ytdl_module, ytdl_instance, entry["entries"])
             else:
                 yield entry
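
Several of the changes above hinge on options that the extractors read via `config()` lookups (for example bunkr's `tlds` switch, Instagram's `max-posts` and `cursor` handling, and the Wikimedia per-page `limit`). The snippet below is a minimal, hypothetical sketch of setting such options through gallery-dl's Python configuration API; the option names mirror the lookups visible in this diff, while the values and the example URL are illustrative only and should be checked against the 1.27.4 documentation.

```python
# Hypothetical usage sketch -- option names mirror the config() lookups in
# this diff; the values are illustrative, not recommendations.
from gallery_dl import config, job

config.set(("extractor", "bunkr"), "tlds", True)         # match any bunkr TLD
config.set(("extractor", "instagram"), "max-posts", 50)  # stop after 50 posts
config.set(("extractor", "instagram"), "cursor", True)   # keep cursor support
config.set(("extractor", "wikimedia"), "limit", 50)      # API results per page

# Run a download job for one of the example URLs used in the diff
job.DownloadJob("https://bunkr.si/a/ID").run()
```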
