author    Unit 193 <unit193@unit193.net>  2025-10-07 02:11:52 -0400
committer Unit 193 <unit193@unit193.net>  2025-10-07 02:11:52 -0400
commit    83e1e051b8c0e622ef5f61c1955c47b4bde95b57 (patch)
tree      544a434cb398d2adb8b8a2d553dc1c9a44b4ee1d /gallery_dl/extractor
parent    f1612851ae9fe68c7444fb31e786503868aeaa7c (diff)
parent    bbe7fac03d881662a458e7fbf870c9d71f5257f4 (diff)
Update upstream source from tag 'upstream/1.30.9'
Update to upstream version '1.30.9' with Debian dir 46cc56e13f05f4465cc64f67b4d7b775a95bd87a
Diffstat (limited to 'gallery_dl/extractor')
 gallery_dl/extractor/__init__.py       |   3
 gallery_dl/extractor/chevereto.py      |  20
 gallery_dl/extractor/imagehosts.py     |  14
 gallery_dl/extractor/instagram.py      |  75
 gallery_dl/extractor/mangadex.py       | 119
 gallery_dl/extractor/mangafire.py      | 168
 gallery_dl/extractor/mangareader.py    | 173
 gallery_dl/extractor/misskey.py        |   6
 gallery_dl/extractor/nozomi.py         |   2
 gallery_dl/extractor/paheal.py         |  16
 gallery_dl/extractor/patreon.py        |  37
 gallery_dl/extractor/pixiv.py          |  44
 gallery_dl/extractor/s3ndpics.py       | 101
 gallery_dl/extractor/schalenetwork.py  |  57
 gallery_dl/extractor/simpcity.py       |   7
 gallery_dl/extractor/thehentaiworld.py |  26
 gallery_dl/extractor/twitter.py        |   3
 gallery_dl/extractor/weibo.py          |  36
 gallery_dl/extractor/wikimedia.py      |  34
 gallery_dl/extractor/zerochan.py       |  35
 20 files changed, 801 insertions(+), 175 deletions(-)
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index abdb6cc..a3df634 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -115,11 +115,13 @@ modules = [
"lynxchan",
"madokami",
"mangadex",
+ "mangafire",
"mangafox",
"mangahere",
"manganelo",
"mangapark",
"mangaread",
+ "mangareader",
"mangataro",
"mangoxo",
"misskey",
@@ -166,6 +168,7 @@ modules = [
"rule34us",
"rule34vault",
"rule34xyz",
+ "s3ndpics",
"saint",
"sankaku",
"sankakucomplex",
diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py
index 67fdb39..1552899 100644
--- a/gallery_dl/extractor/chevereto.py
+++ b/gallery_dl/extractor/chevereto.py
@@ -40,19 +40,15 @@ class CheveretoExtractor(BaseExtractor):
BASE_PATTERN = CheveretoExtractor.update({
"jpgfish": {
"root": "https://jpg6.su",
- "pattern": r"jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)",
- },
- "imgkiwi": {
- "root": "https://img.kiwi",
- "pattern": r"img\.kiwi",
+ "pattern": r"(?:www\.)?jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)",
},
"imagepond": {
"root": "https://imagepond.net",
- "pattern": r"imagepond\.net",
+ "pattern": r"(?:www\.)?imagepond\.net",
},
"imglike": {
"root": "https://imglike.com",
- "pattern": r"imglike\.com",
+ "pattern": r"(?:www\.)?imglike\.com",
},
})
@@ -79,7 +75,7 @@ class CheveretoImageExtractor(CheveretoExtractor):
fromhex=True)
file = {
- "id" : self.path.rpartition(".")[2],
+ "id" : self.path.rpartition("/")[2].rpartition(".")[2],
"url" : url,
"album": text.remove_html(extr(
"Added to <a", "</a>").rpartition(">")[2]),
@@ -144,7 +140,8 @@ class CheveretoAlbumExtractor(CheveretoExtractor):
def items(self):
url = self.root + self.path
- data = {"_extractor": CheveretoImageExtractor}
+ data_image = {"_extractor": CheveretoImageExtractor}
+ data_video = {"_extractor": CheveretoVideoExtractor}
if self.path.endswith("/sub"):
albums = self._pagination(url)
@@ -152,8 +149,9 @@ class CheveretoAlbumExtractor(CheveretoExtractor):
albums = (url,)
for album in albums:
- for image in self._pagination(album):
- yield Message.Queue, image, data
+ for item_url in self._pagination(album):
+ data = data_video if "/video/" in item_url else data_image
+ yield Message.Queue, item_url, data
class CheveretoCategoryExtractor(CheveretoExtractor):
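
The album extractor now routes each queued entry to the matching child extractor instead of assuming everything is an image. The dispatch idea in isolation (a sketch; the string kinds stand in for the extractor classes):

# Sketch of the per-item routing above: "video" vs. "image" is decided
# by the URL path, and the chosen handler rides along in the metadata.
def route(item_urls):
    for url in item_urls:
        kind = "video" if "/video/" in url else "image"
        yield url, {"_extractor": kind}  # stand-in for the extractor class

for url, data in route(("https://jpg6.su/img/a.1", "https://jpg6.su/video/b.2")):
    print(url, data)
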
diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py
index fccc466..817d2c4 100644
--- a/gallery_dl/extractor/imagehosts.py
+++ b/gallery_dl/extractor/imagehosts.py
@@ -125,8 +125,18 @@ class ImxtoGalleryExtractor(ImagehostImageExtractor):
"title": text.unescape(title.partition(">")[2]).strip(),
}
- for url in text.extract_iter(page, "<a href=", " ", pos):
- yield Message.Queue, url.strip("\"'"), data
+ params = {"page": 1}
+ while True:
+ for url in text.extract_iter(page, "<a href=", " ", pos):
+ if "/i/" in url:
+ yield Message.Queue, url.strip("\"'"), data
+
+ if 'class="pagination' not in page or \
+ 'class="disabled">Last' in page:
+ return
+
+ params["page"] += 1
+ page = self.request(self.page_url, params=params).text
class AcidimgImageExtractor(ImagehostImageExtractor):
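
The imx.to gallery extractor previously stopped after the first page; the new loop keeps requesting ?page=N until the pagination widget disappears or its "Last" link is disabled. The same loop reduced to its skeleton (request_page is a hypothetical stand-in for self.request(...).text):

# Sketch of the new gallery pagination: yield pages until the pagination
# widget is absent or its "Last" link carries the "disabled" class.
def pages(request_page):
    params = {"page": 1}
    page = request_page(params)
    while True:
        yield page
        if 'class="pagination' not in page or 'class="disabled">Last' in page:
            return
        params["page"] += 1
        page = request_page(params)
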
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 00e06b5..0e6c480 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -39,7 +39,6 @@ class InstagramExtractor(Extractor):
self.www_claim = "0"
self.csrf_token = util.generate_token()
self._find_tags = util.re(r"#\w+").findall
- self._warn_video_ua = True
self._logged_in = True
self._cursor = None
self._user = None
@@ -52,6 +51,12 @@ class InstagramExtractor(Extractor):
else:
self.api = InstagramRestAPI(self)
+ self._warn_video = True if self.config("warn-videos", True) else False
+ self._warn_image = (
+ 9 if not (wi := self.config("warn-images", True)) else
+ 1 if wi in ("all", "both") else
+ 0)
+
def items(self):
self.login()
@@ -172,6 +177,7 @@ class InstagramExtractor(Extractor):
"post_id": reel_id,
"post_shortcode": shortcode_from_id(reel_id),
"post_url": post_url,
+ "type": "story" if expires else "highlight",
}
if "title" in post:
data["highlight_title"] = post["title"]
@@ -182,7 +188,6 @@ class InstagramExtractor(Extractor):
data = {
"post_id" : post["pk"],
"post_shortcode": post["code"],
- "post_url": f"{self.root}/p/{post['code']}/",
"likes": post.get("like_count", 0),
"liked": post.get("has_liked", False),
"pinned": self._extract_pinned(post),
@@ -239,8 +244,8 @@ class InstagramExtractor(Extractor):
manifest = item.get("video_dash_manifest")
media = video
- if self._warn_video_ua:
- self._warn_video_ua = False
+ if self._warn_video:
+ self._warn_video = False
pattern = text.re(
r"Chrome/\d{3,}\.\d+\.\d+\.\d+(?!\d* Mobile)")
if not pattern.search(self.session.headers["User-Agent"]):
@@ -250,8 +255,9 @@ class InstagramExtractor(Extractor):
video = manifest = None
media = image
- if image["width"] < item.get("original_width", 0) or \
- image["height"] < item.get("original_height", 0):
+ if self._warn_image < (
+ (image["width"] < item.get("original_width", 0)) +
+ (image["height"] < item.get("original_height", 0))):
self.log.warning(
"%s: Available image resolutions lower than the "
"original (%sx%s < %sx%s). "
@@ -278,7 +284,7 @@ class InstagramExtractor(Extractor):
if manifest is not None:
media["_ytdl_manifest_data"] = manifest
if "owner" in item:
- media["owner2"] = item["owner"]
+ media["owner"] = item["owner"]
if "reshared_story_media_author" in item:
media["author"] = item["reshared_story_media_author"]
if "expiring_at" in item:
@@ -287,6 +293,14 @@ class InstagramExtractor(Extractor):
self._extract_tagged_users(item, media)
files.append(media)
+ if "type" not in data:
+ if len(files) == 1 and files[0]["video_url"]:
+ data["type"] = "reel"
+ data["post_url"] = f"{self.root}/reel/{post['code']}/"
+ else:
+ data["type"] = "post"
+ data["post_url"] = f"{self.root}/p/{post['code']}/"
+
return data
def _parse_post_graphql(self, post):
@@ -443,6 +457,32 @@ class InstagramExtractor(Extractor):
user[key] = 0
+class InstagramPostExtractor(InstagramExtractor):
+ """Extractor for an Instagram post"""
+ subcategory = "post"
+ pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
+ r"/(?:share/()|[^/?#]+/)?(?:p|tv|reels?())/([^/?#]+)")
+ example = "https://www.instagram.com/p/abcdefg/"
+
+ def __init__(self, match):
+ if match[2] is not None:
+ self.subcategory = "reel"
+ InstagramExtractor.__init__(self, match)
+
+ def posts(self):
+ share, reel, shortcode = self.groups
+ if share is not None:
+ url = text.ensure_http_scheme(self.url)
+ headers = {
+ "Sec-Fetch-Dest": "empty",
+ "Sec-Fetch-Mode": "navigate",
+ "Sec-Fetch-Site": "same-origin",
+ }
+ location = self.request_location(url, headers=headers)
+ shortcode = location.split("/")[-2]
+ return self.api.media(shortcode)
+
+
class InstagramUserExtractor(Dispatch, InstagramExtractor):
"""Extractor for an Instagram user profile"""
pattern = USER_PATTERN + r"/?(?:$|[?#])"
@@ -740,27 +780,6 @@ class InstagramAvatarExtractor(InstagramExtractor):
},)
-class InstagramPostExtractor(InstagramExtractor):
- """Extractor for an Instagram post"""
- subcategory = "post"
- pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
- r"/(?:share/()|[^/?#]+/)?(?:p|tv|reel)/([^/?#]+)")
- example = "https://www.instagram.com/p/abcdefg/"
-
- def posts(self):
- share, shortcode = self.groups
- if share is not None:
- url = text.ensure_http_scheme(self.url)
- headers = {
- "Sec-Fetch-Dest": "empty",
- "Sec-Fetch-Mode": "navigate",
- "Sec-Fetch-Site": "same-origin",
- }
- location = self.request_location(url, headers=headers)
- shortcode = location.split("/")[-2]
- return self.api.media(shortcode)
-
-
class InstagramRestAPI():
def __init__(self, extractor):
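
The new 'warn-images' option replaces the unconditional low-resolution warning: false maps to threshold 9 (never warn), "all"/"both" to 1 (warn only when both dimensions fall short), and any other truthy value to 0 (warn when either does). The later comparison counts undersized dimensions against that threshold; a self-contained sketch of the arithmetic:

# Sketch of the 'warn-images' threshold logic above.
def warn_threshold(wi=True):
    return 9 if not wi else 1 if wi in ("all", "both") else 0

def should_warn(wi, width, height, orig_width, orig_height):
    undersized = (width < orig_width) + (height < orig_height)  # 0, 1, or 2
    return warn_threshold(wi) < undersized

assert should_warn(True, 1080, 1350, 1440, 1350)        # either dimension lower
assert not should_warn("all", 1080, 1350, 1440, 1350)   # needs both lower
assert not should_warn(False, 100, 100, 4000, 4000)     # warnings disabled
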
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index fbed328..30d6848 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -39,7 +39,7 @@ class MangadexExtractor(Extractor):
data = self._transform(chapter)
data["_extractor"] = MangadexChapterExtractor
self._cache[uuid] = data
- yield Message.Queue, self.root + "/chapter/" + uuid, data
+ yield Message.Queue, f"{self.root}/chapter/{uuid}", data
def _items_manga(self):
data = {"_extractor": MangadexMangaExtractor}
@@ -51,13 +51,8 @@ class MangadexExtractor(Extractor):
relationships = defaultdict(list)
for item in chapter["relationships"]:
relationships[item["type"]].append(item)
- manga = self.api.manga(relationships["manga"][0]["id"])
- for item in manga["relationships"]:
- relationships[item["type"]].append(item)
cattributes = chapter["attributes"]
- mattributes = manga["attributes"]
-
if lang := cattributes.get("translatedLanguage"):
lang = lang.partition("-")[0]
@@ -66,35 +61,21 @@ class MangadexExtractor(Extractor):
else:
chnum, sep, minor = 0, "", ""
- data = {
- "manga" : (mattributes["title"].get("en") or
- next(iter(mattributes["title"].values()))),
- "manga_id": manga["id"],
+ return {
+ **_manga_info(self, relationships["manga"][0]["id"]),
"title" : cattributes["title"],
"volume" : text.parse_int(cattributes["volume"]),
"chapter" : text.parse_int(chnum),
- "chapter_minor": sep + minor,
+ "chapter_minor": f"{sep}{minor}",
"chapter_id": chapter["id"],
"date" : text.parse_datetime(cattributes["publishAt"]),
+ "group" : [group["attributes"]["name"]
+ for group in relationships["scanlation_group"]],
"lang" : lang,
- "language": util.code_to_language(lang),
"count" : cattributes["pages"],
"_external_url": cattributes.get("externalUrl"),
}
- data["artist"] = [artist["attributes"]["name"]
- for artist in relationships["artist"]]
- data["author"] = [author["attributes"]["name"]
- for author in relationships["author"]]
- data["group"] = [group["attributes"]["name"]
- for group in relationships["scanlation_group"]]
-
- data["status"] = mattributes["status"]
- data["tags"] = [tag["attributes"]["name"]["en"]
- for tag in mattributes["tags"]]
-
- return data
-
class MangadexCoversExtractor(MangadexExtractor):
"""Extractor for mangadex manga covers"""
@@ -103,7 +84,7 @@ class MangadexCoversExtractor(MangadexExtractor):
filename_fmt = "{volume:>02}_{lang}.{extension}"
archive_fmt = "c_{cover_id}"
pattern = (rf"{BASE_PATTERN}/(?:title|manga)/(?!follows|feed$)([0-9a-f-]+)"
- r"(?:/[^/?#]+)?\?tab=art")
+ rf"(?:/[^/?#]+)?\?tab=art")
example = ("https://mangadex.org/title"
"/01234567-89ab-cdef-0123-456789abcdef?tab=art")
@@ -121,24 +102,10 @@ class MangadexCoversExtractor(MangadexExtractor):
relationships = defaultdict(list)
for item in cover["relationships"]:
relationships[item["type"]].append(item)
- manga = self.api.manga(relationships["manga"][0]["id"])
- for item in manga["relationships"]:
- relationships[item["type"]].append(item)
-
cattributes = cover["attributes"]
- mattributes = manga["attributes"]
return {
- "manga" : (mattributes["title"].get("en") or
- next(iter(mattributes["title"].values()))),
- "manga_id": manga["id"],
- "status" : mattributes["status"],
- "author" : [author["attributes"]["name"]
- for author in relationships["author"]],
- "artist" : [artist["attributes"]["name"]
- for artist in relationships["artist"]],
- "tags" : [tag["attributes"]["name"]["en"]
- for tag in mattributes["tags"]],
+ **_manga_info(self, relationships["manga"][0]["id"]),
"cover" : cattributes["fileName"],
"lang" : cattributes.get("locale"),
"volume" : text.parse_int(cattributes["volume"]),
@@ -150,7 +117,7 @@ class MangadexCoversExtractor(MangadexExtractor):
class MangadexChapterExtractor(MangadexExtractor):
"""Extractor for manga-chapters from mangadex.org"""
subcategory = "chapter"
- pattern = BASE_PATTERN + r"/chapter/([0-9a-f-]+)"
+ pattern = rf"{BASE_PATTERN}/chapter/([0-9a-f-]+)"
example = ("https://mangadex.org/chapter"
"/01234567-89ab-cdef-0123-456789abcdef")
@@ -177,13 +144,13 @@ class MangadexChapterExtractor(MangadexExtractor):
"page-reverse") else enumerate
for data["page"], page in enum(chapter["data"], 1):
text.nameext_from_url(page, data)
- yield Message.Url, base + page, data
+ yield Message.Url, f"{base}{page}", data
class MangadexMangaExtractor(MangadexExtractor):
"""Extractor for manga from mangadex.org"""
subcategory = "manga"
- pattern = BASE_PATTERN + r"/(?:title|manga)/(?!follows|feed$)([0-9a-f-]+)"
+ pattern = rf"{BASE_PATTERN}/(?:title|manga)/(?!follows|feed$)([0-9a-f-]+)"
example = ("https://mangadex.org/title"
"/01234567-89ab-cdef-0123-456789abcdef")
@@ -194,7 +161,7 @@ class MangadexMangaExtractor(MangadexExtractor):
class MangadexFeedExtractor(MangadexExtractor):
"""Extractor for chapters from your Updates Feed"""
subcategory = "feed"
- pattern = BASE_PATTERN + r"/titles?/feed$()"
+ pattern = rf"{BASE_PATTERN}/titles?/feed$()"
example = "https://mangadex.org/title/feed"
def chapters(self):
@@ -204,7 +171,7 @@ class MangadexFeedExtractor(MangadexExtractor):
class MangadexFollowingExtractor(MangadexExtractor):
"""Extractor for followed manga from your Library"""
subcategory = "following"
- pattern = BASE_PATTERN + r"/titles?/follows(?:\?([^#]+))?$"
+ pattern = rf"{BASE_PATTERN}/titles?/follows(?:\?([^#]+))?$"
example = "https://mangadex.org/title/follows"
items = MangadexExtractor._items_manga
@@ -216,8 +183,8 @@ class MangadexFollowingExtractor(MangadexExtractor):
class MangadexListExtractor(MangadexExtractor):
"""Extractor for mangadex MDLists"""
subcategory = "list"
- pattern = (BASE_PATTERN +
- r"/list/([0-9a-f-]+)(?:/[^/?#]*)?(?:\?tab=(\w+))?")
+ pattern = (rf"{BASE_PATTERN}"
+ rf"/list/([0-9a-f-]+)(?:/[^/?#]*)?(?:\?tab=(\w+))?")
example = ("https://mangadex.org/list"
"/01234567-89ab-cdef-0123-456789abcdef/NAME")
@@ -242,7 +209,7 @@ class MangadexListExtractor(MangadexExtractor):
class MangadexAuthorExtractor(MangadexExtractor):
"""Extractor for mangadex authors"""
subcategory = "author"
- pattern = BASE_PATTERN + r"/author/([0-9a-f-]+)"
+ pattern = rf"{BASE_PATTERN}/author/([0-9a-f-]+)"
example = ("https://mangadex.org/author"
"/01234567-89ab-cdef-0123-456789abcdef/NAME")
@@ -280,30 +247,30 @@ class MangadexAPI():
else text.ensure_http_scheme(server).rstrip("/"))
def athome_server(self, uuid):
- return self._call("/at-home/server/" + uuid)
+ return self._call(f"/at-home/server/{uuid}")
def author(self, uuid, manga=False):
params = {"includes[]": ("manga",)} if manga else None
- return self._call("/author/" + uuid, params)["data"]
+ return self._call(f"/author/{uuid}", params)["data"]
def chapter(self, uuid):
params = {"includes[]": ("scanlation_group",)}
- return self._call("/chapter/" + uuid, params)["data"]
+ return self._call(f"/chapter/{uuid}", params)["data"]
def covers_manga(self, uuid):
params = {"manga[]": uuid}
return self._pagination_covers("/cover", params)
def list(self, uuid):
- return self._call("/list/" + uuid, None, True)["data"]
+ return self._call(f"/list/{uuid}", None, True)["data"]
def list_feed(self, uuid):
- return self._pagination_chapters("/list/" + uuid + "/feed", None, True)
+ return self._pagination_chapters(f"/list/{uuid}/feed", None, True)
@memcache(keyarg=1)
def manga(self, uuid):
params = {"includes[]": ("artist", "author")}
- return self._call("/manga/" + uuid, params)["data"]
+ return self._call(f"/manga/{uuid}", params)["data"]
def manga_author(self, uuid_author):
params = {"authorOrArtist": uuid_author}
@@ -315,7 +282,7 @@ class MangadexAPI():
"order[volume]" : order,
"order[chapter]": order,
}
- return self._pagination_chapters("/manga/" + uuid + "/feed", params)
+ return self._pagination_chapters(f"/manga/{uuid}/feed", params)
def user_follows_manga(self):
params = {"contentRating": None}
@@ -366,17 +333,17 @@ class MangadexAPI():
_refresh_token_cache.update(
(username, "personal"), data["refresh_token"])
- return "Bearer " + access_token
+ return f"Bearer {access_token}"
@cache(maxage=900, keyarg=1)
def _authenticate_impl_legacy(self, username, password):
if refresh_token := _refresh_token_cache(username):
self.extractor.log.info("Refreshing access token")
- url = self.root + "/auth/refresh"
+ url = f"{self.root}/auth/refresh"
json = {"token": refresh_token}
else:
self.extractor.log.info("Logging in as %s", username)
- url = self.root + "/auth/login"
+ url = f"{self.root}/auth/login"
json = {"username": username, "password": password}
self.extractor.log.debug("Using legacy login method")
@@ -387,10 +354,10 @@ class MangadexAPI():
if refresh_token != data["token"]["refresh"]:
_refresh_token_cache.update(username, data["token"]["refresh"])
- return "Bearer " + data["token"]["session"]
+ return f"Bearer {data['token']['session']}"
def _call(self, endpoint, params=None, auth=False):
- url = self.root + endpoint
+ url = f"{self.root}{endpoint}"
headers = self.headers_auth if auth else self.headers
while True:
@@ -470,3 +437,33 @@ class MangadexAPI():
@cache(maxage=90*86400, keyarg=0)
def _refresh_token_cache(username):
return None
+
+
+@memcache(keyarg=1)
+def _manga_info(self, uuid):
+ manga = self.api.manga(uuid)
+
+ rel = defaultdict(list)
+ for item in manga["relationships"]:
+ rel[item["type"]].append(item)
+ mattr = manga["attributes"]
+
+ return {
+ "manga" : (mattr["title"].get("en") or
+ next(iter(mattr["title"].values()))),
+ "manga_id": manga["id"],
+ "manga_titles": [t.popitem()[1]
+ for t in mattr.get("altTitles") or ()],
+ "manga_date" : text.parse_datetime(mattr.get("createdAt")),
+ "description" : (mattr["description"].get("en") or
+ next(iter(mattr["description"].values()))),
+ "demographic": mattr.get("publicationDemographic"),
+ "origin": mattr.get("originalLanguage"),
+ "status": mattr.get("status"),
+ "year" : mattr.get("year"),
+ "rating": mattr.get("contentRating"),
+ "links" : mattr.get("links"),
+ "tags" : [tag["attributes"]["name"]["en"] for tag in mattr["tags"]],
+ "artist": [artist["attributes"]["name"] for artist in rel["artist"]],
+ "author": [author["attributes"]["name"] for author in rel["author"]],
+ }
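
Factoring the manga lookup into a module-level _manga_info() wrapped in @memcache(keyarg=1) means every chapter and cover of the same manga shares a single /manga/{uuid} API call. A minimal stand-in for that decorator (a sketch; the real implementation lives in gallery_dl.cache and handles more cases):

# Minimal memoization keyed on one positional argument, approximating
# what @memcache(keyarg=1) does for _manga_info().
import functools

def memcache(keyarg):
    def decorator(func):
        cache = {}
        @functools.wraps(func)
        def wrapper(*args):
            key = args[keyarg]
            if key not in cache:
                cache[key] = func(*args)
            return cache[key]
        return wrapper
    return decorator

@memcache(keyarg=1)
def manga_info(self, uuid):       # hypothetical stand-in for _manga_info
    print("fetching", uuid)       # the side effect shows one call per uuid
    return {"manga_id": uuid}

manga_info(None, "abc")   # performs the "API call"
manga_info(None, "abc")   # served from the cache
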
diff --git a/gallery_dl/extractor/mangafire.py b/gallery_dl/extractor/mangafire.py
new file mode 100644
index 0000000..5ccb732
--- /dev/null
+++ b/gallery_dl/extractor/mangafire.py
@@ -0,0 +1,168 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://mangafire.to/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text, exception
+from ..cache import memcache
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?mangafire\.to"
+
+
+class MangafireBase():
+ """Base class for mangafire extractors"""
+ category = "mangafire"
+ root = "https://mangafire.to"
+
+
+class MangafireChapterExtractor(MangafireBase, ChapterExtractor):
+ """Extractor for mangafire manga chapters"""
+ directory_fmt = (
+ "{category}", "{manga}",
+ "{volume:?v/ />02}{chapter:?c//>03}{chapter_minor:?//}{title:?: //}")
+ filename_fmt = (
+ "{manga}{volume:?_v//>02}{chapter:?_c//>03}{chapter_minor:?//}_"
+ "{page:>03}.{extension}")
+ archive_fmt = (
+ "{manga_id}_{chapter_id}_{page}")
+ pattern = (rf"{BASE_PATTERN}/read/([\w-]+\.(\w+))/([\w-]+)"
+ rf"/((chapter|volume)-\d+(?:\D.*)?)")
+ example = "https://mangafire.to/read/MANGA.ID/LANG/chapter-123"
+
+ def metadata(self, _):
+ manga_path, manga_id, lang, chapter_info, self.type = self.groups
+
+ try:
+ chapters = _manga_chapters(self, (manga_id, self.type, lang))
+ anchor = chapters[chapter_info]
+ except KeyError:
+ raise exception.NotFoundError("chapter")
+ self.chapter_id = text.extr(anchor, 'data-id="', '"')
+
+ return {
+ **_manga_info(self, manga_path),
+ **_chapter_info(anchor),
+ }
+
+ def images(self, page):
+ url = f"{self.root}/ajax/read/{self.type}/{self.chapter_id}"
+ headers = {"x-requested-with": "XMLHttpRequest"}
+ data = self.request_json(url, headers=headers)
+
+ return [
+ (image[0], None)
+ for image in data["result"]["images"]
+ ]
+
+
+class MangafireMangaExtractor(MangafireBase, MangaExtractor):
+ """Extractor for mangafire manga"""
+ chapterclass = MangafireChapterExtractor
+ pattern = rf"{BASE_PATTERN}/manga/([\w-]+)\.(\w+)"
+ example = "https://mangafire.to/manga/MANGA.ID"
+
+ def chapters(self, page):
+ manga_slug, manga_id = self.groups
+ lang = self.config("lang") or "en"
+
+ manga = _manga_info(self, f"{manga_slug}.{manga_id}")
+ chapters = _manga_chapters(self, (manga_id, "chapter", lang))
+
+ return [
+ (f"""{self.root}{text.extr(anchor, 'href="', '"')}""", {
+ **manga,
+ **_chapter_info(anchor),
+ })
+ for anchor in chapters.values()
+ ]
+
+
+@memcache(keyarg=1)
+def _manga_info(self, manga_path, page=None):
+ if page is None:
+ url = f"{self.root}/manga/{manga_path}"
+ page = self.request(url).text
+ slug, _, mid = manga_path.rpartition(".")
+
+ extr = text.extract_from(page)
+ manga = {
+ "cover": text.extr(extr(
+ 'class="poster">', '</div>'), 'src="', '"'),
+ "status": extr("<p>", "<").replace("_", " ").title(),
+ "manga" : text.unescape(extr(
+ 'itemprop="name">', "<")),
+ "manga_id": mid,
+ "manga_slug": slug,
+ "manga_titles": text.unescape(extr(
+ "<h6>", "<")).split("; "),
+ "type": text.remove_html(extr(
+ 'class="min-info">', "</a>")),
+ "author": text.unescape(text.remove_html(extr(
+ "<span>Author:</span>", "</div>"))).split(" , "),
+ "published": text.remove_html(extr(
+ "<span>Published:</span>", "</div>")),
+ "tags": text.split_html(extr(
+ "<span>Genres:</span>", "</div>"))[::2],
+ "publisher": text.unescape(text.remove_html(extr(
+ "<span>Mangazines:</span>", "</div>"))).split(" , "),
+ "score": text.parse_float(text.remove_html(extr(
+ 'class="score">', " / "))),
+ "description": text.remove_html(extr(
+ 'id="synopsis">', "<script>")),
+ }
+
+ if len(lst := manga["author"]) == 1 and not lst[0]:
+ manga["author"] = ()
+ if len(lst := manga["publisher"]) == 1 and not lst[0]:
+ manga["publisher"] = ()
+
+ return manga
+
+
+@memcache(keyarg=1)
+def _manga_chapters(self, manga_info):
+ manga_id, type, lang = manga_info
+ url = f"{self.root}/ajax/read/{manga_id}/{type}/{lang}"
+ headers = {"x-requested-with": "XMLHttpRequest"}
+ data = self.request_json(url, headers=headers)
+
+ needle = f"{manga_id}/{lang}/"
+ return {
+ text.extr(anchor, needle, '"'): anchor
+ for anchor in text.extract_iter(data["result"]["html"], "<a ", ">")
+ }
+
+
+@memcache(keyarg=0)
+def _chapter_info(info):
+ _, lang, chapter_info = text.extr(info, 'href="', '"').rsplit("/", 2)
+
+ if chapter_info.startswith("vol"):
+ volume = text.extr(info, 'data-number="', '"')
+ volume_id = text.parse_int(text.extr(info, 'data-id="', '"'))
+ return {
+ "volume" : text.parse_int(volume),
+ "volume_id" : volume_id,
+ "chapter" : 0,
+ "chapter_minor" : "",
+ "chapter_string": chapter_info,
+ "chapter_id" : volume_id,
+ "title" : text.unescape(text.extr(info, 'title="', '"')),
+ "lang" : lang,
+ }
+
+ chapter, sep, minor = text.extr(info, 'data-number="', '"').partition(".")
+ return {
+ "chapter" : text.parse_int(chapter),
+ "chapter_minor" : f"{sep}{minor}",
+ "chapter_string": chapter_info,
+ "chapter_id" : text.parse_int(text.extr(info, 'data-id="', '"')),
+ "title" : text.unescape(text.extr(info, 'title="', '"')),
+ "lang" : lang,
+ }
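
Chapter numbers arrive as a single data-number attribute, so _chapter_info() splits them with partition(".") to keep the integer part and a ".5"-style minor part separate. The parsing step on its own:

# Sketch of the data-number split used by _chapter_info() above.
def split_chapter(number):
    chapter, sep, minor = number.partition(".")
    return int(chapter or 0), f"{sep}{minor}"

assert split_chapter("12")   == (12, "")
assert split_chapter("12.5") == (12, ".5")
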
diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py
new file mode 100644
index 0000000..eb53998
--- /dev/null
+++ b/gallery_dl/extractor/mangareader.py
@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://mangareader.to/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text, util
+from ..cache import memcache
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?mangareader\.to"
+
+
+class MangareaderBase():
+ """Base class for mangareader extractors"""
+ category = "mangareader"
+ root = "https://mangareader.to"
+
+
+class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor):
+ """Extractor for mangareader manga chapters"""
+ directory_fmt = (
+ "{category}", "{manga}",
+ "{volume:?v/ />02}{chapter:?c//>03}{chapter_minor:?//}{title:?: //}")
+ filename_fmt = (
+ "{manga}{volume:?_v//>02}{chapter:?_c//>03}{chapter_minor:?//}_"
+ "{page:>03}.{extension}")
+ archive_fmt = (
+ "{manga_id}_{chapter_id}_{page}")
+ pattern = (rf"{BASE_PATTERN}/read/([\w-]+-\d+)/([^/?#]+)"
+ rf"/(chapter|volume)-(\d+[^/?#]*)")
+ example = "https://mangareader.to/read/MANGA-123/LANG/chapter-123"
+
+ def metadata(self, _):
+ path, lang, type, chstr = self.groups
+
+ settings = util.json_dumps({
+ "readingMode" : "vertical",
+ "readingDirection": "rtl",
+ "quality" : "high",
+ })
+ self.cookies.set("mr_settings", settings, domain="mangareader.to")
+
+ url = f"{self.root}/read/{path}/{lang}/{type}-{chstr}"
+ page = self.request(url).text
+ self.cid = cid = text.extr(page, 'data-reading-id="', '"')
+
+ manga = _manga_info(self, path)
+ return {
+ **manga,
+ **manga[f"_{type}s"][lang][chstr],
+ "chapter_id": text.parse_int(cid),
+ }
+
+ def images(self, page):
+ key = "chap" if self.groups[2] == "chapter" else "vol"
+ url = f"{self.root}/ajax/image/list/{key}/{self.cid}"
+ params = {
+ "mode" : "vertical,",
+ "quality" : "high,",
+ "hozPageSize": "1,",
+ }
+ headers = {
+ "X-Requested-With": "XMLHttpRequest",
+ "Sec-Fetch-Dest" : "empty",
+ "Sec-Fetch-Mode" : "cors",
+ "Sec-Fetch-Site" : "same-origin",
+ }
+ html = self.request_json(url, params=params, headers=headers)["html"]
+
+ return [
+ (url, None)
+ for url in text.extract_iter(html, 'data-url="', '"')
+ ]
+
+
+class MangareaderMangaExtractor(MangareaderBase, MangaExtractor):
+ """Extractor for mangareader manga"""
+ chapterclass = MangareaderChapterExtractor
+ pattern = rf"{BASE_PATTERN}/([\w-]+-\d+)"
+ example = "https://mangareader.to/MANGA-123"
+
+ def chapters(self, page):
+ manga = _manga_info(self, self.groups[0])
+ lang = self.config("lang") or "en"
+
+ return [
+ (info["chapter_url"], {**manga, **info})
+ for info in manga["_chapters"][lang].values()
+ ]
+
+
+@memcache(keyarg=1)
+def _manga_info(self, manga_path):
+ url = f"{self.root}/{manga_path}"
+ html = self.request(url).text
+
+ slug, _, mid = manga_path.rpartition("-")
+ extr = text.extract_from(html)
+ url = extr('property="og:url" content="', '"')
+ manga = {
+ "manga_url": url,
+ "manga_slug": url.rpartition("/")[2].rpartition("-")[0],
+ "manga_id": text.parse_int(mid),
+ "manga": text.unescape(extr('class="manga-name">', "<")),
+ "manga_alt": text.unescape(extr('class="manga-name-or">', "<")),
+ "tags": text.split_html(extr('class="genres">', "</div>")),
+ "type": text.remove_html(extr('>Type:', "</div>")),
+ "status": text.remove_html(extr('>Status:', "</div>")),
+ "author": text.split_html(extr('>Authors:', "</div>"))[0::2],
+ "published": text.remove_html(extr('>Published:', "</div>")),
+ "score": text.parse_float(text.remove_html(extr(
+ '>Score:', "</div>"))),
+ "views": text.parse_int(text.remove_html(extr(
+ '>Views:', "</div>")).replace(",", "")),
+ }
+
+ base = self.root
+
+ # extract all chapters
+ html = extr('class="chapters-list-ul">', " </div>")
+ manga["_chapters"] = chapters = {}
+ for group in text.extract_iter(html, "<ul", "</ul>"):
+ lang = text.extr(group, ' id="', '-chapters"')
+
+ chapters[lang] = current = {}
+ lang = lang.partition("-")[0]
+ for ch in text.extract_iter(group, "<li ", "</li>"):
+ path = text.extr(ch, 'href="', '"')
+ chap = text.extr(ch, 'data-number="', '"')
+ name = text.unescape(text.extr(ch, 'class="name">', "<"))
+
+ chapter, sep, minor = chap.partition(".")
+ current[chap] = {
+ "title" : name.partition(":")[2].strip(),
+ "chapter" : text.parse_int(chapter),
+ "chapter_minor" : f"{sep}{minor}",
+ "chapter_string": chap,
+ "chapter_url" : f"{base}{path}",
+ "lang" : lang,
+ }
+
+ # extract all volumes
+ html = extr('class="volume-list-ul">', "</section>")
+ manga["_volumes"] = volumes = {}
+ for group in html.split('<div class="manga_list-wrap')[1:]:
+ lang = text.extr(group, ' id="', '-volumes"')
+
+ volumes[lang] = current = {}
+ lang = lang.partition("-")[0]
+ for vol in text.extract_iter(group, 'class="item">', "</div>"):
+ path = text.extr(vol, 'href="', '"')
+ voln = text.extr(vol, 'tick-vol">', '<').rpartition(" ")[2]
+
+ current[voln] = {
+ "volume" : text.parse_int(voln),
+ "volume_cover" : text.extr(vol, ' src="', '"'),
+ "chapter" : 0,
+ "chapter_minor" : "",
+ "chapter_string": voln,
+ "chapter_url" : f"{base}{path}",
+ "lang" : lang,
+ }
+
+ # extract remaining metadata
+ manga["description"] = text.unescape(extr(
+ 'class="description-modal">', "</div>")).strip()
+
+ return manga
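
The chapter extractor pre-seeds a mr_settings cookie so the site renders the vertical, high-quality reader whose markup images() then parses. Reproducing that handshake with plain requests (a sketch; it assumes the site honors this cookie for anonymous sessions):

# Sketch of the mr_settings cookie handshake, using plain requests.
import json
import requests

session = requests.Session()
session.cookies.set("mr_settings", json.dumps({
    "readingMode"     : "vertical",
    "readingDirection": "rtl",
    "quality"         : "high",
}), domain="mangareader.to")
# page = session.get("https://mangareader.to/read/MANGA-123/en/chapter-1").text
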
diff --git a/gallery_dl/extractor/misskey.py b/gallery_dl/extractor/misskey.py
index 5ff601a..42eaeef 100644
--- a/gallery_dl/extractor/misskey.py
+++ b/gallery_dl/extractor/misskey.py
@@ -25,8 +25,8 @@ class MisskeyExtractor(BaseExtractor):
def _init(self):
self.api = MisskeyAPI(self)
self.instance = self.root.rpartition("://")[2]
- self.renotes = self.config("renotes", False)
- self.replies = self.config("replies", True)
+ self.renotes = True if self.config("renotes", False) else False
+ self.replies = True if self.config("replies", True) else False
def items(self):
for note in self.notes():
@@ -254,6 +254,8 @@ class MisskeyAPI():
def _pagination(self, endpoint, data):
data["limit"] = 100
+ data["withRenotes"] = self.extractor.renotes
+
while True:
notes = self._call(endpoint, data)
if not notes:
diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py
index 21c361c..528aff2 100644
--- a/gallery_dl/extractor/nozomi.py
+++ b/gallery_dl/extractor/nozomi.py
@@ -173,7 +173,7 @@ class NozomiSearchExtractor(NozomiExtractor):
for tag in self.tags:
(negative if tag[0] == "-" else positive).append(
- tag.replace("/", ""))
+ text.quote(tag.replace("/", "")))
for tag in positive:
ids = nozomi("nozomi/" + tag)
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index 5245f31..490243a 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -9,7 +9,7 @@
"""Extractors for https://rule34.paheal.net/"""
from .common import Extractor, Message
-from .. import text
+from .. import text, exception
class PahealExtractor(Extractor):
@@ -97,7 +97,12 @@ class PahealTagExtractor(PahealExtractor):
base = f"{self.root}/post/list/{self.groups[0]}/"
while True:
- page = self.request(base + str(pnum)).text
+ try:
+ page = self.request(f"{base}{pnum}").text
+ except exception.HttpError as exc:
+ if exc.status == 404:
+ return
+ raise
pos = page.find("id='image-list'")
for post in text.extract_iter(
@@ -146,4 +151,9 @@ class PahealPostExtractor(PahealExtractor):
example = "https://rule34.paheal.net/post/view/12345"
def get_posts(self):
- return (self._extract_post(self.groups[0]),)
+ try:
+ return (self._extract_post(self.groups[0]),)
+ except exception.HttpError as exc:
+ if exc.status == 404:
+ return ()
+ raise
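
Both hunks introduce the same idiom: a 404 from paheal means "past the last page" or "post deleted", so it ends iteration cleanly while every other HTTP error still propagates. The pattern in generic form (HttpError here is a stand-in for gallery_dl's exception.HttpError):

# Generic form of the 404-tolerant request pattern above.
class HttpError(Exception):
    def __init__(self, status):
        self.status = status

def fetch_or_stop(fetch):
    try:
        return fetch()
    except HttpError as exc:
        if exc.status == 404:
            return None   # signal "no more results" to the caller
        raise
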
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index fb2f32c..cf1a6d6 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -230,6 +230,16 @@ class PatreonExtractor(Extractor):
attr["created"], "%Y-%m-%dT%H:%M:%S.%f%z")
return attr
+ def _collection(self, collection_id):
+ url = f"{self.root}/api/collection/{collection_id}"
+ data = self.request_json(url)
+ coll = data["data"]
+ attr = coll["attributes"]
+ attr["id"] = coll["id"]
+ attr["date"] = text.parse_datetime(
+ attr["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ return attr
+
def _filename(self, url):
"""Fetch filename from an URL's Content-Disposition header"""
response = self.request(url, method="HEAD", fatal=False)
@@ -333,6 +343,33 @@ class PatreonExtractor(Extractor):
raise exception.AbortExtraction("Unable to extract bootstrap data")
+class PatreonCollectionExtractor(PatreonExtractor):
+ """Extractor for a patreon collection"""
+ subcategory = "collection"
+ directory_fmt = ("{category}", "{creator[full_name]}",
+ "Collections", "{collection[title]} ({collection[id]})")
+ pattern = r"(?:https?://)?(?:www\.)?patreon\.com/collection/(\d+)"
+ example = "https://www.patreon.com/collection/12345"
+
+ def posts(self):
+ collection_id = self.groups[0]
+ self.kwdict["collection"] = collection = \
+ self._collection(collection_id)
+ campaign_id = text.extr(
+ collection["thumbnail"]["url"], "/campaign/", "/")
+
+ url = self._build_url("posts", (
+ # patreon returns '400 Bad Request' without campaign_id filter
+ f"&filter[campaign_id]={campaign_id}"
+ "&filter[contains_exclusive_posts]=true"
+ "&filter[is_draft]=false"
+ f"&filter[collection_id]={collection_id}"
+ "&filter[include_drops]=true"
+ "&sort=collection_order"
+ ))
+ return self._pagination(url)
+
+
class PatreonCreatorExtractor(PatreonExtractor):
"""Extractor for a creator's works"""
subcategory = "creator"
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index a72042c..6276a2a 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -1232,7 +1232,7 @@ class PixivAppAPI():
params = {"word": word, "search_target": target,
"sort": sort, "duration": duration,
"start_date": date_start, "end_date": date_end}
- return self._pagination("/v1/search/illust", params)
+ return self._pagination_search("/v1/search/illust", params)
def user_bookmarks_illust(self, user_id, tag=None, restrict="public"):
"""Return illusts bookmarked by a user"""
@@ -1322,6 +1322,48 @@ class PixivAppAPI():
params = text.parse_query(query)
data = self._call(endpoint, params)
+ def _pagination_search(self, endpoint, params):
+ sort = params["sort"]
+ if sort == "date_desc":
+ date_key = "end_date"
+ date_off = timedelta(days=1)
+ date_cmp = lambda lhs, rhs: lhs >= rhs # noqa E731
+ elif sort == "date_asc":
+ date_key = "start_date"
+ date_off = timedelta(days=-1)
+ date_cmp = lambda lhs, rhs: lhs <= rhs # noqa E731
+ else:
+ date_key = None
+ date_last = None
+
+ while True:
+ data = self._call(endpoint, params)
+
+ if date_last is None:
+ yield from data["illusts"]
+ else:
+ works = data["illusts"]
+ if date_cmp(date_last, works[-1]["create_date"]):
+ for work in works:
+ if date_last is None:
+ yield work
+ elif date_cmp(date_last, work["create_date"]):
+ date_last = None
+
+ if not (next_url := data.get("next_url")):
+ return
+ query = next_url.rpartition("?")[2]
+ params = text.parse_query(query)
+
+ if date_key and text.parse_int(params.get("offset")) >= 5000:
+ date_last = data["illusts"][-1]["create_date"]
+ date_val = (text.parse_datetime(
+ date_last) + date_off).strftime("%Y-%m-%d")
+ self.log.info("Reached 'offset' >= 5000; "
+ "Updating '%s' to '%s'", date_key, date_val)
+ params[date_key] = date_val
+ params.pop("offset", None)
+
@cache(maxage=36500*86400, keyarg=0)
def _refresh_token_cache(username):
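
Pixiv's search API rejects offsets beyond 5000, so _pagination_search() restarts the query with a narrowed date window: for date_desc it moves end_date to one day after the last seen create_date, for date_asc it moves start_date to one day before, then uses date_cmp to skip the overlapping works. The window-advance step in isolation (a sketch):

# Sketch of the offset-cap workaround: once offset reaches 5000, move the
# date boundary to the last seen work and restart from offset 0.
from datetime import datetime, timedelta

def next_window(sort, last_create_date):
    date = datetime.strptime(last_create_date[:10], "%Y-%m-%d")
    if sort == "date_desc":
        return "end_date", (date + timedelta(days=1)).strftime("%Y-%m-%d")
    return "start_date", (date - timedelta(days=1)).strftime("%Y-%m-%d")

assert next_window("date_desc", "2024-05-01T12:00:00+09:00") == \
    ("end_date", "2024-05-02")
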
diff --git a/gallery_dl/extractor/s3ndpics.py b/gallery_dl/extractor/s3ndpics.py
new file mode 100644
index 0000000..215f160
--- /dev/null
+++ b/gallery_dl/extractor/s3ndpics.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://s3nd.pics/"""
+
+from .common import Extractor, Message
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?s3nd\.pics"
+
+
+class S3ndpicsExtractor(Extractor):
+ """Base class for s3ndpics extractors"""
+ category = "s3ndpics"
+ root = "https://s3nd.pics"
+ root_api = f"{root}/api"
+ directory_fmt = ("{category}", "{user[username]}",
+ "{date} {title:?/ /}({id})")
+ filename_fmt = "{num:>02}.{extension}"
+ archive_fmt = "{id}_{num}"
+
+ def items(self):
+ base = "https://s3.s3nd.pics/s3nd-pics/"
+
+ for post in self.posts():
+ post["id"] = post.pop("_id", None)
+ post["user"] = post.pop("userId", None)
+ post["date"] = text.parse_datetime(
+ post["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ post["date_updated"] = text.parse_datetime(
+ post["updatedAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
+
+ files = post.pop("files", ())
+ post["count"] = len(files)
+
+ yield Message.Directory, post
+ for post["num"], file in enumerate(files, 1):
+ post["type"] = file["type"]
+ path = file["url"]
+ text.nameext_from_url(path, post)
+ yield Message.Url, f"{base}{path}", post
+
+ def _pagination(self, url, params):
+ params["page"] = 1
+
+ while True:
+ data = self.request_json(url, params=params)
+
+ self.kwdict["total"] = data["pagination"]["total"]
+ yield from data["posts"]
+
+ if params["page"] >= data["pagination"]["pages"]:
+ return
+ params["page"] += 1
+
+
+class S3ndpicsPostExtractor(S3ndpicsExtractor):
+ subcategory = "post"
+ pattern = rf"{BASE_PATTERN}/post/([0-9a-f]+)"
+ example = "https://s3nd.pics/post/0123456789abcdef01234567"
+
+ def posts(self):
+ url = f"{self.root_api}/posts/{self.groups[0]}"
+ return (self.request_json(url)["post"],)
+
+
+class S3ndpicsUserExtractor(S3ndpicsExtractor):
+ subcategory = "user"
+ pattern = rf"{BASE_PATTERN}/user/(\w+)"
+ example = "https://s3nd.pics/user/USER"
+
+ def posts(self):
+ url = f"{self.root_api}/users/username/{self.groups[0]}"
+ self.kwdict["user"] = user = self.request_json(url)["user"]
+
+ url = f"{self.root_api}/posts"
+ params = {
+ "userId": user["_id"],
+ "limit" : "12",
+ "sortBy": "newest",
+ }
+ return self._pagination(url, params)
+
+
+class S3ndpicsSearchExtractor(S3ndpicsExtractor):
+ subcategory = "search"
+ pattern = rf"{BASE_PATTERN}/search/?\?([^#]+)"
+ example = "https://s3nd.pics/search?QUERY"
+
+ def posts(self):
+ url = f"{self.root_api}/posts"
+ params = text.parse_query(self.groups[0])
+ params.setdefault("limit", "20")
+ self.kwdict["search_tags"] = \
+ params.get("tag") or params.get("tags") or params.get("q")
+ return self._pagination(url, params)
diff --git a/gallery_dl/extractor/schalenetwork.py b/gallery_dl/extractor/schalenetwork.py
index dc42417..a4ef3b0 100644
--- a/gallery_dl/extractor/schalenetwork.py
+++ b/gallery_dl/extractor/schalenetwork.py
@@ -62,10 +62,11 @@ class SchalenetworkExtractor(Extractor):
pass
params["page"] += 1
- def _token(self):
+ def _token(self, required=True):
if token := self.config("token"):
return f"Bearer {token.rpartition(' ')[2]}"
- raise exception.AuthRequired("'token'", "your favorites")
+ if required:
+ raise exception.AuthRequired("'token'", "your favorites")
def _crt(self):
crt = self.config("crt")
@@ -88,7 +89,7 @@ class SchalenetworkExtractor(Extractor):
else:
msg = f"{exc.status} {exc.response.reason}"
raise exception.AuthRequired(
- "'crt' query parameter & matching '--user-agent'", None, msg)
+ "'crt' query parameter & matching 'user-agent'", None, msg)
class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
@@ -114,19 +115,26 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
10: "mixed",
11: "language",
12: "other",
+ 13: "reclass",
}
def metadata(self, _):
_, gid, gkey = self.groups
+
url = f"{self.root_api}/books/detail/{gid}/{gkey}"
- data = self.request_json(url, headers=self.headers)
- data["date"] = text.parse_timestamp(data["created_at"] // 1000)
+ headers = self.headers
+ data = self.request_json(url, headers=headers)
+
+ try:
+ data["date"] = text.parse_timestamp(data["created_at"] // 1000)
+ data["count"] = len(data["thumbnails"]["entries"])
+ del data["thumbnails"]
+ except Exception:
+ pass
tags = []
types = self.TAG_TYPES
- tags_data = data["tags"]
-
- for tag in tags_data:
+ for tag in data["tags"]:
name = tag["name"]
namespace = tag.get("namespace", 0)
tags.append(types[namespace] + ":" + name)
@@ -134,33 +142,34 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
if self.config("tags", False):
tags = collections.defaultdict(list)
- for tag in tags_data :
+ for tag in data["tags"]:
tags[tag.get("namespace", 0)].append(tag["name"])
for type, values in tags.items():
data["tags_" + types[type]] = values
+ url = f"{self.root_api}/books/detail/{gid}/{gkey}?crt={self._crt()}"
+ if token := self._token(False):
+ headers = headers.copy()
+ headers["Authorization"] = token
try:
- data["count"] = len(data["thumbnails"]["entries"])
- del data["thumbnails"]
- except Exception:
- pass
+ data_fmt = self.request_json(
+ url, method="POST", headers=headers)
+ except exception.HttpError as exc:
+ self._require_auth(exc)
+
+ self.fmt = self._select_format(data_fmt["data"])
+ data["source"] = data_fmt.get("source")
return data
def images(self, _):
- crt = self._crt()
_, gid, gkey = self.groups
- url = f"{self.root_api}/books/detail/{gid}/{gkey}?crt={crt}"
- try:
- data = self.request_json(url, method="POST", headers=self.headers)
- except exception.HttpError as exc:
- self._require_auth(exc)
-
- fmt = self._select_format(data["data"])
+ fmt = self.fmt
url = (f"{self.root_api}/books/data/{gid}/{gkey}"
- f"/{fmt['id']}/{fmt['key']}/{fmt['w']}?crt={crt}")
- data = self.request_json(url, headers=self.headers)
+ f"/{fmt['id']}/{fmt['key']}/{fmt['w']}?crt={self._crt()}")
+ headers = self.headers
+ data = self.request_json(url, headers=headers)
base = data["base"]
results = []
@@ -169,7 +178,7 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
info = {
"width" : dimensions[0],
"height": dimensions[1],
- "_http_headers": self.headers,
+ "_http_headers": headers,
}
results.append((base + entry["path"], info))
return results
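
With _token(required=False), the metadata request attaches an Authorization header only when a token is configured and otherwise proceeds anonymously. The header logic reduced to a sketch:

# Sketch of the optional-auth headers: copy and extend only when a token
# is configured, leaving the shared headers untouched otherwise.
def with_optional_token(headers, token=None):
    if token:
        headers = headers.copy()
        headers["Authorization"] = f"Bearer {token.rpartition(' ')[2]}"
    return headers

assert "Authorization" not in with_optional_token({"User-Agent": "x"})
assert with_optional_token({}, "Bearer abc")["Authorization"] == "Bearer abc"
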
diff --git a/gallery_dl/extractor/simpcity.py b/gallery_dl/extractor/simpcity.py
index 3354289..d8227fa 100644
--- a/gallery_dl/extractor/simpcity.py
+++ b/gallery_dl/extractor/simpcity.py
@@ -92,7 +92,7 @@ class SimpcityExtractor(Extractor):
author = schema["author"]
stats = schema["interactionStatistic"]
url_t = schema["url"]
- url_a = author["url"]
+ url_a = author.get("url") or ""
thread = {
"id" : url_t[url_t.rfind(".")+1:-1],
@@ -104,8 +104,9 @@ class SimpcityExtractor(Extractor):
"tags" : (schema["keywords"].split(", ")
if "keywords" in schema else ()),
"section" : schema["articleSection"],
- "author" : author["name"],
- "author_id" : url_a[url_a.rfind(".")+1:-1],
+ "author" : author.get("name") or "",
+ "author_id" : (url_a[url_a.rfind(".")+1:-1] if url_a else
+ (author.get("name") or "")[15:]),
"author_url": url_a,
}
diff --git a/gallery_dl/extractor/thehentaiworld.py b/gallery_dl/extractor/thehentaiworld.py
index 055d7d8..9a30654 100644
--- a/gallery_dl/extractor/thehentaiworld.py
+++ b/gallery_dl/extractor/thehentaiworld.py
@@ -60,14 +60,16 @@ class ThehentaiworldExtractor(Extractor):
"<li>Posted: ", "<"), "%Y-%m-%d"),
}
- if "/videos/" in url:
+ if (c := url[27]) == "v":
post["type"] = "video"
post["width"] = post["height"] = 0
post["votes"] = text.parse_int(extr("(<strong>", "</strong>"))
post["score"] = text.parse_float(extr("<strong>", "<"))
post["file_url"] = extr('<source src="', '"')
else:
- post["type"] = "image"
+ post["type"] = ("animated" if c == "g" else
+ "3d cgi" if c == "3" else
+ "image")
post["width"] = text.parse_int(extr("<li>Size: ", " "))
post["height"] = text.parse_int(extr("x ", "<"))
post["file_url"] = extr('a href="', '"')
@@ -109,16 +111,6 @@ class ThehentaiworldExtractor(Extractor):
pnum += 1
-class ThehentaiworldPostExtractor(ThehentaiworldExtractor):
- subcategory = "post"
- pattern = (rf"{BASE_PATTERN}"
- rf"(/(?:(?:3d-cgi-)?hentai-image|video)s/([^/?#]+))")
- example = "https://thehentaiworld.com/hentai-images/SLUG/"
-
- def posts(self):
- return (f"{self.root}{self.groups[0]}/",)
-
-
class ThehentaiworldTagExtractor(ThehentaiworldExtractor):
subcategory = "tag"
per_page = 24
@@ -137,3 +129,13 @@ class ThehentaiworldTagExtractor(ThehentaiworldExtractor):
self.page_start += pages
self.post_start += posts
return num
+
+
+class ThehentaiworldPostExtractor(ThehentaiworldExtractor):
+ subcategory = "post"
+ pattern = (rf"{BASE_PATTERN}("
+ rf"/(?:video|(?:[\w-]+-)?hentai-image)s/([^/?#]+))")
+ example = "https://thehentaiworld.com/hentai-images/SLUG/"
+
+ def posts(self):
+ return (f"{self.root}{self.groups[0]}/",)
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index e6c84d1..e7df4a3 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -1026,11 +1026,12 @@ class TwitterTweetExtractor(TwitterExtractor):
return
while True:
+ parent_id = tweet["rest_id"]
tweet_id = tweet["legacy"].get("quoted_status_id_str")
if not tweet_id:
break
tweet = self.api.tweet_result_by_rest_id(tweet_id)
- tweet["legacy"]["quoted_by_id_str"] = tweet_id
+ tweet["legacy"]["quoted_by_id_str"] = parent_id
yield tweet
def _tweets_detail(self, tweet_id):
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index 823e8e0..07bed79 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -86,16 +86,25 @@ class WeiboExtractor(Extractor):
status["count"] = len(files)
yield Message.Directory, status
- for num, file in enumerate(files, 1):
- if file["url"].startswith("http:"):
- file["url"] = "https:" + file["url"][5:]
+ num = 0
+ for file in files:
+ url = file["url"]
+ if not url:
+ continue
+ if url.startswith("http:"):
+ url = f"https:{url[5:]}"
if "filename" not in file:
- text.nameext_from_url(file["url"], file)
+ text.nameext_from_url(url, file)
if file["extension"] == "json":
file["extension"] = "mp4"
+ if file["extension"] == "m3u8":
+ url = f"ytdl:{url}"
+ file["_ytdl_manifest"] = "hls"
+ file["extension"] = "mp4"
+ num += 1
file["status"] = status
file["num"] = num
- yield Message.Url, file["url"], file
+ yield Message.Url, url, file
def _extract_status(self, status, files):
if "mix_media_info" in status:
@@ -143,10 +152,21 @@ class WeiboExtractor(Extractor):
media = max(info["playback_list"],
key=lambda m: m["meta"]["quality_index"])
except Exception:
- return {"url": (info.get("stream_url_hd") or
- info.get("stream_url") or "")}
+ video = {"url": (info.get("replay_hd") or
+ info.get("stream_url_hd") or
+ info.get("stream_url") or "")}
else:
- return media["play_info"].copy()
+ video = media["play_info"].copy()
+
+ if "//wblive-out." in video["url"] and \
+ not text.ext_from_url(video["url"]):
+ try:
+ video["url"] = self.request_location(video["url"])
+ except exception.HttpError as exc:
+ self.log.warning("%s: %s", exc.__class__.__name__, exc)
+ video["url"] = ""
+
+ return video
def _status_by_id(self, status_id):
url = f"{self.root}/ajax/statuses/show?id={status_id}"
diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py
index 00266bd..5ba47d2 100644
--- a/gallery_dl/extractor/wikimedia.py
+++ b/gallery_dl/extractor/wikimedia.py
@@ -46,6 +46,12 @@ class WikimediaExtractor(BaseExtractor):
else:
self.api_url = None
+ # note: image revisions are different from page revisions
+ # ref:
+ # https://www.mediawiki.org/wiki/API:Revisions
+ # https://www.mediawiki.org/wiki/API:Imageinfo
+ self.image_revisions = self.config("image-revisions", 1)
+
@cache(maxage=36500*86400, keyarg=1)
def _search_api_path(self, root):
self.log.debug("Probing possible API endpoints")
@@ -56,7 +62,10 @@ class WikimediaExtractor(BaseExtractor):
return url
raise exception.AbortExtraction("Unable to find API endpoint")
- def prepare(self, image):
+ def prepare_info(self, info):
+ """Adjust the content of an image info object"""
+
+ def prepare_image(self, image):
"""Adjust the content of an image object"""
image["metadata"] = {
m["name"]: m["value"]
@@ -74,14 +83,19 @@ class WikimediaExtractor(BaseExtractor):
def items(self):
for info in self._pagination(self.params):
try:
- image = info["imageinfo"][0]
- except LookupError:
+ images = info.pop("imageinfo")
+ except KeyError:
self.log.debug("Missing 'imageinfo' for %s", info)
- continue
+ images = ()
+
+ info["count"] = len(images)
+ self.prepare_info(info)
+ yield Message.Directory, info
- self.prepare(image)
- yield Message.Directory, image
- yield Message.Url, image["url"], image
+ for info["num"], image in enumerate(images, 1):
+ self.prepare_image(image)
+ image.update(info)
+ yield Message.Url, image["url"], image
if self.subcategories:
base = self.root + "/wiki/"
@@ -108,6 +122,7 @@ class WikimediaExtractor(BaseExtractor):
"timestamp|user|userid|comment|canonicaltitle|url|size|"
"sha1|mime|metadata|commonmetadata|extmetadata|bitdepth"
)
+ params["iilimit"] = self.image_revisions
while True:
data = self.request_json(url, params=params)
@@ -237,9 +252,8 @@ class WikimediaArticleExtractor(WikimediaExtractor):
"titles" : path,
}
- def prepare(self, image):
- WikimediaExtractor.prepare(self, image)
- image["page"] = self.title
+ def prepare_info(self, info):
+ info["page"] = self.title
class WikimediaWikiExtractor(WikimediaExtractor):
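
The new 'image-revisions' option is passed through as iilimit, letting one prop=imageinfo query return several file revisions, which items() now yields as one Directory plus one Url per revision. A standalone query showing the parameters involved (parameter names are the standard MediaWiki API ones, per API:Imageinfo):

# Minimal MediaWiki imageinfo query returning multiple revisions per file.
import requests

params = {
    "action" : "query",
    "format" : "json",
    "prop"   : "imageinfo",
    "iiprop" : "timestamp|user|url|size|sha1|mime",
    "iilimit": 5,   # corresponds to the new 'image-revisions' option
    "titles" : "File:Example.jpg",
}
data = requests.get(
    "https://commons.wikimedia.org/w/api.php", params=params).json()
for page in data["query"]["pages"].values():
    for revision in page.get("imageinfo", ()):
        print(revision["timestamp"], revision["url"])
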
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index e1b4897..98c9331 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -26,6 +26,7 @@ class ZerochanExtractor(BooruExtractor):
per_page = 250
cookies_domain = ".zerochan.net"
cookies_names = ("z_id", "z_hash")
+ useragent = util.USERAGENT
request_interval = (0.5, 1.5)
def login(self):
@@ -192,7 +193,13 @@ class ZerochanTagExtractor(ZerochanExtractor):
metadata = self.config("metadata")
while True:
- page = self.request(url, params=params, expected=(500,)).text
+ try:
+ page = self.request(
+ url, params=params, expected=(500,)).text
+ except exception.HttpError as exc:
+ if exc.status == 404:
+ return
+ raise
thumbs = text.extr(page, '<ul id="thumbs', '</ul>')
extr = text.extract_from(thumbs)
@@ -231,7 +238,13 @@ class ZerochanTagExtractor(ZerochanExtractor):
}
while True:
- response = self.request(url, params=params, allow_redirects=False)
+ try:
+ response = self.request(
+ url, params=params, allow_redirects=False)
+ except exception.HttpError as exc:
+ if exc.status == 404:
+ return
+ raise
if response.status_code >= 300:
url = text.urljoin(self.root, response.headers["location"])
@@ -275,12 +288,18 @@ class ZerochanImageExtractor(ZerochanExtractor):
pattern = BASE_PATTERN + r"/(\d+)"
example = "https://www.zerochan.net/12345"
- def __init__(self, match):
- ZerochanExtractor.__init__(self, match)
- self.image_id = match[1]
-
def posts(self):
- post = self._parse_entry_html(self.image_id)
+ image_id = self.groups[0]
+
+ try:
+ post = self._parse_entry_html(image_id)
+ except exception.HttpError as exc:
+ if exc.status in (404, 410):
+ if msg := text.extr(exc.response.text, "<h2>", "<"):
+ self.log.warning(f"'{msg}'")
+ return ()
+ raise
+
if self.config("metadata"):
- post.update(self._parse_entry_api(self.image_id))
+ post.update(self._parse_entry_api(image_id))
return (post,)