Diffstat (limited to 'gallery_dl')
-rw-r--r--  gallery_dl/extractor/2ch.py               31
-rw-r--r--  gallery_dl/extractor/4archive.py           6
-rw-r--r--  gallery_dl/extractor/__init__.py           4
-rw-r--r--  gallery_dl/extractor/bellazon.py          85
-rw-r--r--  gallery_dl/extractor/bunkr.py              2
-rw-r--r--  gallery_dl/extractor/chevereto.py         16
-rw-r--r--  gallery_dl/extractor/danbooru.py          17
-rw-r--r--  gallery_dl/extractor/facebook.py          10
-rw-r--r--  gallery_dl/extractor/hdoujin.py           42
-rw-r--r--  gallery_dl/extractor/imgpile.py          119
-rw-r--r--  gallery_dl/extractor/instagram.py         12
-rw-r--r--  gallery_dl/extractor/iwara.py             50
-rw-r--r--  gallery_dl/extractor/kemono.py            28
-rw-r--r--  gallery_dl/extractor/lensdump.py           3
-rw-r--r--  gallery_dl/extractor/mangadex.py          69
-rw-r--r--  gallery_dl/extractor/mangataro.py        105
-rw-r--r--  gallery_dl/extractor/pinterest.py        104
-rw-r--r--  gallery_dl/extractor/reddit.py            36
-rw-r--r--  gallery_dl/extractor/schalenetwork.py    149
-rw-r--r--  gallery_dl/extractor/simpcity.py          56
-rw-r--r--  gallery_dl/extractor/thehentaiworld.py   139
-rw-r--r--  gallery_dl/extractor/twitter.py            2
-rw-r--r--  gallery_dl/extractor/vipergirls.py        12
-rw-r--r--  gallery_dl/job.py                          6
-rw-r--r--  gallery_dl/postprocessor/common.py         4
-rw-r--r--  gallery_dl/postprocessor/metadata.py       9
-rw-r--r--  gallery_dl/postprocessor/python.py        20
-rw-r--r--  gallery_dl/util.py                        13
-rw-r--r--  gallery_dl/version.py                      2
-rw-r--r--  gallery_dl/ytdl.py                        18
30 files changed, 933 insertions, 236 deletions
diff --git a/gallery_dl/extractor/2ch.py b/gallery_dl/extractor/2ch.py
index f5bb7b7..912a251 100644
--- a/gallery_dl/extractor/2ch.py
+++ b/gallery_dl/extractor/2ch.py
@@ -4,37 +4,41 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://2ch.hk/"""
+"""Extractors for https://2ch.su/"""
from .common import Extractor, Message
from .. import text, util
+BASE_PATTERN = r"(?:https?://)?2ch\.(su|life|hk)"
+
class _2chThreadExtractor(Extractor):
"""Extractor for 2ch threads"""
category = "2ch"
subcategory = "thread"
- root = "https://2ch.hk"
+ root = "https://2ch.su"
directory_fmt = ("{category}", "{board}", "{thread} {title}")
filename_fmt = "{tim}{filename:? //}.{extension}"
archive_fmt = "{board}_{thread}_{tim}"
- pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/res/(\d+)"
- example = "https://2ch.hk/a/res/12345.html"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/res/(\d+)"
+ example = "https://2ch.su/a/res/12345.html"
def __init__(self, match):
+ tld = match[1]
+ self.root = f"https://2ch.{'su' if tld == 'hk' else tld}"
Extractor.__init__(self, match)
- self.board, self.thread = match.groups()
def items(self):
- url = f"{self.root}/{self.board}/res/{self.thread}.json"
+ _, board, thread = self.groups
+ url = f"{self.root}/{board}/res/{thread}.json"
posts = self.request_json(url)["threads"][0]["posts"]
op = posts[0]
title = op.get("subject") or text.remove_html(op["comment"])
thread = {
- "board" : self.board,
- "thread": self.thread,
+ "board" : board,
+ "thread": thread,
"title" : text.unescape(title)[:50],
}
@@ -61,16 +65,17 @@ class _2chBoardExtractor(Extractor):
"""Extractor for 2ch boards"""
category = "2ch"
subcategory = "board"
- root = "https://2ch.hk"
- pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/?$"
- example = "https://2ch.hk/a/"
+ root = "https://2ch.su"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/?$"
+ example = "https://2ch.su/a/"
def __init__(self, match):
+ tld = match[1]
+ self.root = f"https://2ch.{'su' if tld == 'hk' else tld}"
Extractor.__init__(self, match)
- self.board = match[1]
def items(self):
- base = f"{self.root}/{self.board}"
+ base = f"{self.root}/{self.groups[1]}"
# index page
url = f"{base}/index.json"
diff --git a/gallery_dl/extractor/4archive.py b/gallery_dl/extractor/4archive.py
index c9be2a4..4c43464 100644
--- a/gallery_dl/extractor/4archive.py
+++ b/gallery_dl/extractor/4archive.py
@@ -62,7 +62,8 @@ class _4archiveThreadExtractor(Extractor):
data = {
"name": extr('class="name">', "</span>"),
"date": text.parse_datetime(
- extr('class="dateTime postNum" >', "<").strip(),
+ (extr('class="dateTime">', "<") or
+ extr('class="dateTime postNum" >', "<")).strip(),
"%Y-%m-%d %H:%M:%S"),
"no" : text.parse_int(extr(">Post No.", "<")),
}
@@ -70,8 +71,7 @@ class _4archiveThreadExtractor(Extractor):
extr('class="fileText"', ">File: <a")
data.update({
"url" : extr('href="', '"'),
- "filename": extr(
- 'rel="noreferrer noopener"', "</a>").strip()[1:],
+ "filename": extr('alt="Image: ', '"'),
"size" : text.parse_bytes(extr(" (", ", ")[:-1]),
"width" : text.parse_int(extr("", "x")),
"height" : text.parse_int(extr("", "px")),
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index b32fcd1..abdb6cc 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -73,6 +73,7 @@ modules = [
"girlswithmuscle",
"gofile",
"hatenablog",
+ "hdoujin",
"hentai2read",
"hentaicosplays",
"hentaifoundry",
@@ -88,6 +89,7 @@ modules = [
"imagefap",
"imgbb",
"imgbox",
+ "imgpile",
"imgth",
"imgur",
"imhentai",
@@ -118,6 +120,7 @@ modules = [
"manganelo",
"mangapark",
"mangaread",
+ "mangataro",
"mangoxo",
"misskey",
"motherless",
@@ -188,6 +191,7 @@ modules = [
"tcbscans",
"telegraph",
"tenor",
+ "thehentaiworld",
"tiktok",
"tmohentai",
"toyhouse",
diff --git a/gallery_dl/extractor/bellazon.py b/gallery_dl/extractor/bellazon.py
index 5c9b9cd..5dcb6a5 100644
--- a/gallery_dl/extractor/bellazon.py
+++ b/gallery_dl/extractor/bellazon.py
@@ -20,32 +20,61 @@ class BellazonExtractor(Extractor):
root = "https://www.bellazon.com/main"
directory_fmt = ("{category}", "{thread[section]}",
"{thread[title]} ({thread[id]})")
- filename_fmt = "{post[id]}_{num:>02}_{id}.{extension}"
- archive_fmt = "{post[id]}/{filename}"
+ filename_fmt = "{post[id]}_{num:>02}_{id}_{filename}.{extension}"
+ archive_fmt = "{post[id]}/{id}_{filename}"
def items(self):
- extract_urls = text.re(r'<a ([^>]*?href="([^"]+)".*?)</a>').findall
- native = f"{self.root}/"
+ native = (f"{self.root}/", f"{self.root[6:]}/")
+ extract_urls = text.re(
+ r'(?s)<('
+ r'(?:video .*?<source src|a [^>]*?href)="([^"]+).*?</a>'
+ r'|img [^>]*?src="([^"]+)"[^>]*>'
+ r')'
+ ).findall
+
+ if self.config("quoted", False):
+ strip_quoted = None
+ else:
+ strip_quoted = text.re(r"(?s)<blockquote .*?</blockquote>").sub
for post in self.posts():
- urls = extract_urls(post["content"])
+ if strip_quoted is None:
+ urls = extract_urls(post["content"])
+ else:
+ urls = extract_urls(strip_quoted("", post["content"]))
+
data = {"post": post}
post["count"] = data["count"] = len(urls)
yield Message.Directory, data
- for data["num"], (info, url) in enumerate(urls, 1):
- url = text.unescape(url)
+ data["num"] = 0
+ for info, url, url_img in urls:
+ url = text.unescape(url or url_img)
+
if url.startswith(native):
+ if "/uploads/emoticons/" in url or "/profile/" in url:
+ continue
+ data["num"] += 1
if not (alt := text.extr(info, ' alt="', '"')) or (
alt.startswith("post-") and "_thumb." in alt):
name = url
else:
name = text.unescape(alt)
+
dc = text.nameext_from_url(name, data.copy())
dc["id"] = text.extr(info, 'data-fileid="', '"')
if ext := text.extr(info, 'data-fileext="', '"'):
dc["extension"] = ext
+ elif "/core/interface/file/attachment.php" in url:
+ if not dc["id"]:
+ dc["id"] = url.rpartition("?id=")[2]
+ if name := text.extr(info, ">", "<").strip():
+ text.nameext_from_url(name, dc)
+
+ if url[0] == "/":
+ url = f"https:{url}"
yield Message.Url, url, dc
+
else:
yield Message.Queue, url, data
@@ -70,6 +99,28 @@ class BellazonExtractor(Extractor):
pnum += 1
url = f"{base}/page/{pnum}/"
+ def _pagination_reverse(self, base, pnum=None):
+ base = f"{self.root}{base}"
+
+ url = f"{base}/page/9999/" # force redirect to highest page number
+ with self.request(url) as response:
+ parts = response.url.rsplit("/", 3)
+ pnum = text.parse_int(parts[2]) if parts[1] == "page" else 1
+ page = response.text
+
+ while True:
+ yield page
+
+ pnum -= 1
+ if pnum > 1:
+ url = f"{base}/page/{pnum}/"
+ elif pnum == 1:
+ url = f"{base}/"
+ else:
+ return
+
+ page = self.request(url).text
+
def _parse_thread(self, page):
schema = self._extract_jsonld(page)
author = schema["author"]
@@ -88,7 +139,7 @@ class BellazonExtractor(Extractor):
"posts": stats[1]["userInteractionCount"],
"date" : text.parse_datetime(schema["datePublished"]),
"date_updated": text.parse_datetime(schema["dateModified"]),
- "description" : text.unescape(schema["text"]),
+ "description" : text.unescape(schema["text"]).strip(),
"section" : path[-2],
"author" : author["name"],
"author_url" : url_a,
@@ -123,7 +174,7 @@ class BellazonExtractor(Extractor):
class BellazonPostExtractor(BellazonExtractor):
subcategory = "post"
pattern = (rf"{BASE_PATTERN}(/topic/\d+-[\w-]+(?:/page/\d+)?)"
- rf"/?#findComment-(\d+)")
+ rf"/?#(?:findC|c)omment-(\d+)")
example = "https://www.bellazon.com/main/topic/123-SLUG/#findComment-12345"
def posts(self):
@@ -145,10 +196,22 @@ class BellazonThreadExtractor(BellazonExtractor):
example = "https://www.bellazon.com/main/topic/123-SLUG/"
def posts(self):
- for page in self._pagination(*self.groups):
+ if (order := self.config("order-posts")) and \
+ order[0] not in ("d", "r"):
+ pages = self._pagination(*self.groups)
+ reverse = False
+ else:
+ pages = self._pagination_reverse(*self.groups)
+ reverse = True
+
+ for page in pages:
if "thread" not in self.kwdict:
self.kwdict["thread"] = self._parse_thread(page)
- for html in text.extract_iter(page, "<article ", "</article>"):
+ posts = text.extract_iter(page, "<article ", "</article>")
+ if reverse:
+ posts = list(posts)
+ posts.reverse()
+ for html in posts:
yield self._parse_post(html)
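
The new _pagination_reverse probes for a thread's last page by requesting
page 9999 and reading the page number the forum redirects to. A hedged
standalone sketch of that probe (hypothetical helper; gallery-dl routes
the request through Extractor.request and text.parse_int):

    import requests

    def last_page_number(base):
        # e.g. base = "https://www.bellazon.com/main/topic/123-slug"
        resp = requests.get(f"{base}/page/9999/", allow_redirects=True)
        parts = resp.url.rsplit("/", 3)   # [..., "page", "<num>", ""]
        return int(parts[2]) if parts[1] == "page" else 1
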
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index cf5bce1..14ebc48 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -162,7 +162,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
file["name"] = util.json_loads(text.extr(
item, 'original:', ',\n').replace("\\'", "'"))
file["slug"] = util.json_loads(text.extr(
- item, 'slug: ', ',\n'))
+ item, 'slug: ', ',\n').replace("\\'", "'"))
file["uuid"] = text.extr(
item, 'name: "', ".")
file["size"] = text.parse_int(text.extr(
diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py
index 6ba4d08..67fdb39 100644
--- a/gallery_dl/extractor/chevereto.py
+++ b/gallery_dl/extractor/chevereto.py
@@ -50,6 +50,10 @@ BASE_PATTERN = CheveretoExtractor.update({
"root": "https://imagepond.net",
"pattern": r"imagepond\.net",
},
+ "imglike": {
+ "root": "https://imglike.com",
+ "pattern": r"imglike\.com",
+ },
})
@@ -152,6 +156,18 @@ class CheveretoAlbumExtractor(CheveretoExtractor):
yield Message.Queue, image, data
+class CheveretoCategoryExtractor(CheveretoExtractor):
+ """Extractor for chevereto galleries"""
+ subcategory = "category"
+ pattern = BASE_PATTERN + r"(/category/[^/?#]+)"
+ example = "https://imglike.com/category/TITLE"
+
+ def items(self):
+ data = {"_extractor": CheveretoImageExtractor}
+ for image in self._pagination(self.root + self.path):
+ yield Message.Queue, image, data
+
+
class CheveretoUserExtractor(CheveretoExtractor):
"""Extractor for chevereto users"""
subcategory = "user"
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index f8ad07a..29c7763 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -278,6 +278,23 @@ class DanbooruTagExtractor(DanbooruExtractor):
return self._pagination("/posts.json", {"tags": self.tags}, prefix)
+class DanbooruRandomExtractor(DanbooruTagExtractor):
+ """Extractor for a random danbooru post"""
+ subcategory = "random"
+ pattern = BASE_PATTERN + r"/posts/random(?:\?(?:[^&#]*&)*tags=([^&#]*))?"
+ example = "https://danbooru.donmai.us/posts/random?tags=TAG"
+
+ def metadata(self):
+ tags = self.groups[-1] or ""
+ self.tags = text.unquote(tags.replace("+", " "))
+ return {"search_tags": self.tags}
+
+ def posts(self):
+ posts = self.request_json(self.root + "/posts/random.json",
+ params={"tags": self.tags or None})
+ return (posts,) if isinstance(posts, dict) else posts
+
+
class DanbooruPoolExtractor(DanbooruExtractor):
"""Extractor for Danbooru pools"""
subcategory = "pool"
diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py
index bf24941..6061737 100644
--- a/gallery_dl/extractor/facebook.py
+++ b/gallery_dl/extractor/facebook.py
@@ -369,6 +369,16 @@ class FacebookExtractor(Extractor):
for edge in (user["profile_tabs"]["profile_user"]
["timeline_nav_app_sections"]["edges"])
]
+
+ if bio := text.extr(page, '"best_description":{"text":"', '"'):
+ user["biography"] = self.decode_all(bio)
+ elif (pos := page.find(
+ '"__module_operation_ProfileCometTileView_profileT')) >= 0:
+ user["biography"] = self.decode_all(text.rextr(
+ page, '"text":"', '"', pos))
+ else:
+ user["biography"] = text.unescape(text.remove_html(text.extr(
+ page, "</span></span></h2>", "<ul>")))
except Exception:
if user is None:
self.log.debug("Failed to extract user data: %s", data)
diff --git a/gallery_dl/extractor/hdoujin.py b/gallery_dl/extractor/hdoujin.py
new file mode 100644
index 0000000..080b899
--- /dev/null
+++ b/gallery_dl/extractor/hdoujin.py
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hdoujin.org/"""
+
+from . import schalenetwork
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?(hdoujin\.(?:org|net))"
+
+
+class HdoujinBase():
+ """Base class for hdoujin extractors"""
+ category = "hdoujin"
+ root = "https://hdoujin.org"
+ root_api = "https://api.hdoujin.org"
+ root_auth = "https://auth.hdoujin.org"
+
+
+class HdoujinGalleryExtractor(
+ HdoujinBase, schalenetwork.SchalenetworkGalleryExtractor):
+ pattern = rf"{BASE_PATTERN}/(?:g|reader)/(\d+)/(\w+)"
+ example = "https://hdoujin.org/g/12345/67890abcdef/"
+
+
+class HdoujinSearchExtractor(
+ HdoujinBase, schalenetwork.SchalenetworkSearchExtractor):
+ pattern = rf"{BASE_PATTERN}/(?:tag/([^/?#]+)|browse)?(?:/?\?([^#]*))?$"
+ example = "https://hdoujin.org/browse?s=QUERY"
+
+
+class HdoujinFavoriteExtractor(
+ HdoujinBase, schalenetwork.SchalenetworkFavoriteExtractor):
+ pattern = rf"{BASE_PATTERN}/favorites(?:\?([^#]*))?"
+ example = "https://hdoujin.org/favorites"
+
+
+HdoujinBase.extr_class = HdoujinGalleryExtractor
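
The whole module is assembled from schalenetwork via multiple inheritance:
HdoujinBase only swaps out the category and the three root URLs, and
extr_class is bound after the class definitions so paginated results queue
URLs for the hdoujin gallery extractor rather than the schalenetwork one.
The pattern in miniature (hypothetical names):

    class SharedGallery:
        extr_class = None           # bound later by each site module

        def queue_entry(self, entry):
            entry["_extractor"] = self.extr_class
            return entry

    class MirrorBase:
        root = "https://example.org"

    class MirrorGallery(MirrorBase, SharedGallery):
        pass

    MirrorBase.extr_class = MirrorGallery   # late binding, as above
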
diff --git a/gallery_dl/extractor/imgpile.py b/gallery_dl/extractor/imgpile.py
new file mode 100644
index 0000000..9fc3a9c
--- /dev/null
+++ b/gallery_dl/extractor/imgpile.py
@@ -0,0 +1,119 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://imgpile.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?imgpile\.com"
+
+
+class ImgpileExtractor(Extractor):
+ """Base class for imgpile extractors"""
+ category = "imgpile"
+ root = "https://imgpile.com"
+ directory_fmt = ("{category}", "{post[author]}",
+ "{post[title]} ({post[id_slug]})")
+ archive_fmt = "{post[id_slug]}_{id}"
+
+ def items(self):
+ pass
+
+
+class ImgpilePostExtractor(ImgpileExtractor):
+ subcategory = "post"
+ pattern = rf"{BASE_PATTERN}/p/(\w+)"
+ example = "https://imgpile.com/p/AbCdEfG"
+
+ def items(self):
+ post_id = self.groups[0]
+ url = f"{self.root}/p/{post_id}"
+ page = self.request(url).text
+ extr = text.extract_from(page)
+
+ post = {
+ "id_slug": post_id,
+ "title" : text.unescape(extr("<title>", " - imgpile<")),
+ "id" : text.parse_int(extr('data-post-id="', '"')),
+ "author" : extr('/u/', '"'),
+ "score" : text.parse_int(text.remove_html(extr(
+ 'class="post-score">', "</"))),
+ "views" : text.parse_int(extr(
+ 'class="meta-value">', "<").replace(",", "")),
+ "tags" : text.split_html(extr(
+ " <!-- Tags -->", '<!-- "')),
+ }
+
+ files = self._extract_files(extr)
+ data = {"post": post}
+ data["count"] = post["count"] = len(files)
+
+ yield Message.Directory, data
+ for data["num"], file in enumerate(files, 1):
+ data.update(file)
+ url = file["url"]
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+ def _extract_files(self, extr):
+ files = []
+
+ while True:
+ media = extr('lass="post-media', '</div>')
+ if not media:
+ break
+ files.append({
+ "id_slug": text.extr(media, 'data-id="', '"'),
+ "id" : text.parse_int(text.extr(
+ media, 'data-media-id="', '"')),
+ "url": f"""http{text.extr(media, '<a href="http', '"')}""",
+ })
+ return files
+
+
+class ImgpileUserExtractor(ImgpileExtractor):
+ subcategory = "user"
+ pattern = rf"{BASE_PATTERN}/u/([^/?#]+)"
+ example = "https://imgpile.com/u/USER"
+
+ def items(self):
+ url = f"{self.root}/api/v1/posts"
+ params = {
+ "limit" : "100",
+ "sort" : "latest",
+ "period" : "all",
+ "visibility": "public",
+ # "moderation_status": "approved",
+ "username" : self.groups[0],
+ }
+ headers = {
+ "Accept" : "application/json",
+ # "Referer" : "https://imgpile.com/u/USER",
+ "Content-Type" : "application/json",
+ # "X-CSRF-TOKEN": "",
+ "Sec-Fetch-Dest": "empty",
+ "Sec-Fetch-Mode": "cors",
+ "Sec-Fetch-Site": "same-origin",
+ }
+
+ base = f"{self.root}/p/"
+ while True:
+ data = self.request_json(url, params=params, headers=headers)
+
+ if params is not None:
+ params = None
+ self.kwdict["total"] = data["meta"]["total"]
+
+ for item in data["data"]:
+ item["_extractor"] = ImgpilePostExtractor
+ url = f"{base}{item['slug']}"
+ yield Message.Queue, url, item
+
+ url = data["links"].get("next")
+ if not url:
+ return
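
ImgpileUserExtractor walks the JSON API with cursor-style pagination: the
first request sends the query parameters, and every links.next URL already
embeds them, so params is dropped after page one. A standalone sketch under
that assumption:

    import requests

    def iterate_posts(username):
        url = "https://imgpile.com/api/v1/posts"
        params = {"limit": "100", "sort": "latest", "username": username}
        headers = {"Accept": "application/json"}
        while url:
            data = requests.get(url, params=params, headers=headers).json()
            params = None             # 'next' links carry the query already
            yield from data["data"]
            url = data["links"].get("next")
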
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 9b8f8c9..00e06b5 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -95,7 +95,7 @@ class InstagramExtractor(Extractor):
if videos:
file["_http_headers"] = videos_headers
text.nameext_from_url(url, file)
- if videos_dash:
+ if videos_dash and "_ytdl_manifest_data" in post:
file["_fallback"] = (url,)
file["_ytdl_manifest"] = "dash"
url = f"ytdl:{post['post_url']}{file['num']}.mp4"
@@ -505,10 +505,12 @@ class InstagramTaggedExtractor(InstagramExtractor):
def metadata(self):
if self.item.startswith("id:"):
self.user_id = self.item[3:]
- return {"tagged_owner_id": self.user_id}
-
- self.user_id = self.api.user_id(self.item)
- user = self.api.user_by_name(self.item)
+ if not self.config("metadata"):
+ return {"tagged_owner_id": self.user_id}
+ user = self.api.user_by_id(self.user_id)
+ else:
+ self.user_id = self.api.user_id(self.item)
+ user = self.api.user_by_name(self.item)
return {
"tagged_owner_id" : user["id"],
diff --git a/gallery_dl/extractor/iwara.py b/gallery_dl/extractor/iwara.py
index 179909b..8af2f42 100644
--- a/gallery_dl/extractor/iwara.py
+++ b/gallery_dl/extractor/iwara.py
@@ -45,6 +45,7 @@ class IwaraExtractor(Extractor):
image["id"], exc.__class__.__name__, exc)
continue
+ group_info["type"] = "image"
group_info["count"] = len(files)
yield Message.Directory, group_info
for num, file in enumerate(files, 1):
@@ -102,34 +103,37 @@ class IwaraExtractor(Extractor):
raise exception.AbortExtraction(f"Unsupported result type '{type}'")
def extract_media_info(self, item, key, include_file_info=True):
- title = t.strip() if (t := item.get("title")) else ""
+ info = {
+ "id" : item["id"],
+ "slug" : item.get("slug"),
+ "rating" : item.get("rating"),
+ "likes" : item.get("numLikes"),
+ "views" : item.get("numViews"),
+ "comments": item.get("numComments"),
+ "tags" : [t["id"] for t in item.get("tags") or ()],
+ "title" : t.strip() if (t := item.get("title")) else "",
+ "description": t.strip() if (t := item.get("body")) else "",
+ }
if include_file_info:
file_info = item if key is None else item.get(key) or {}
filename, _, extension = file_info.get("name", "").rpartition(".")
- return {
- "id" : item["id"],
- "file_id" : file_info.get("id"),
- "title" : title,
- "filename" : filename,
- "extension": extension,
- "date" : text.parse_datetime(
- file_info.get("createdAt"), "%Y-%m-%dT%H:%M:%S.%fZ"),
- "date_updated": text.parse_datetime(
- file_info.get("updatedAt"), "%Y-%m-%dT%H:%M:%S.%fZ"),
- "mime" : file_info.get("mime"),
- "size" : file_info.get("size"),
- "width" : file_info.get("width"),
- "height" : file_info.get("height"),
- "duration" : file_info.get("duration"),
- "type" : file_info.get("type"),
- }
- else:
- return {
- "id" : item["id"],
- "title": title,
- }
+ info["file_id"] = file_info.get("id")
+ info["filename"] = filename
+ info["extension"] = extension
+ info["date"] = text.parse_datetime(
+ file_info.get("createdAt"), "%Y-%m-%dT%H:%M:%S.%fZ")
+ info["date_updated"] = text.parse_datetime(
+ file_info.get("updatedAt"), "%Y-%m-%dT%H:%M:%S.%fZ")
+ info["mime"] = file_info.get("mime")
+ info["size"] = file_info.get("size")
+ info["width"] = file_info.get("width")
+ info["height"] = file_info.get("height")
+ info["duration"] = file_info.get("duration")
+ info["type"] = file_info.get("type")
+
+ return info
def extract_user_info(self, profile):
user = profile.get("user") or {}
diff --git a/gallery_dl/extractor/kemono.py b/gallery_dl/extractor/kemono.py
index fc5972c..1f70031 100644
--- a/gallery_dl/extractor/kemono.py
+++ b/gallery_dl/extractor/kemono.py
@@ -407,7 +407,11 @@ class KemonoDiscordExtractor(KemonoExtractor):
r"(/[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]+)").findall
find_hash = util.re(HASH_PATTERN).match
- posts = self.api.discord_channel(channel_id)
+ if (order := self.config("order-posts")) and order[0] in ("r", "d"):
+ posts = self.api.discord_channel(channel_id, channel["post_count"])
+ else:
+ posts = self.api.discord_channel(channel_id)
+
if max_posts := self.config("max-posts"):
posts = itertools.islice(posts, max_posts)
@@ -627,9 +631,12 @@ class KemonoAPI():
endpoint = f"/{service}/user/{creator_id}/tags"
return self._call(endpoint)
- def discord_channel(self, channel_id):
+ def discord_channel(self, channel_id, post_count=None):
endpoint = f"/discord/channel/{channel_id}"
- return self._pagination(endpoint, {}, 150)
+ if post_count is None:
+ return self._pagination(endpoint, {}, 150)
+ else:
+ return self._pagination_reverse(endpoint, {}, 150, post_count)
def discord_channel_lookup(self, server_id):
endpoint = f"/discord/channel/lookup/{server_id}"
@@ -670,3 +677,18 @@ class KemonoAPI():
if len(data) < batch:
return
params["o"] += batch
+
+ def _pagination_reverse(self, endpoint, params, batch, count):
+ params["o"] = count // batch * batch
+
+ while True:
+ data = self._call(endpoint, params)
+
+ if not data:
+ return
+ data.reverse()
+ yield from data
+
+ if not params["o"]:
+ return
+ params["o"] -= batch
diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py
index c700a29..b0198d5 100644
--- a/gallery_dl/extractor/lensdump.py
+++ b/gallery_dl/extractor/lensdump.py
@@ -100,7 +100,8 @@ class LensdumpImageExtractor(LensdumpBase, Extractor):
filename_fmt = "{category}_{id}{title:?_//}.{extension}"
directory_fmt = ("{category}",)
archive_fmt = "{id}"
- pattern = r"(?:https?://)?(?:(?:i\d?\.)?lensdump\.com|\w\.l3n\.co)/i/(\w+)"
+ pattern = (r"(?:https?://)?(?:(?:i\d?\.)?lensdump\.com|\w\.l3n\.co)"
+ r"/(?:i/)?(\w+)")
example = "https://lensdump.com/i/ID"
def items(self):
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index 225560d..fbed328 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -96,6 +96,57 @@ class MangadexExtractor(Extractor):
return data
+class MangadexCoversExtractor(MangadexExtractor):
+ """Extractor for mangadex manga covers"""
+ subcategory = "covers"
+ directory_fmt = ("{category}", "{manga}", "Covers")
+ filename_fmt = "{volume:>02}_{lang}.{extension}"
+ archive_fmt = "c_{cover_id}"
+ pattern = (rf"{BASE_PATTERN}/(?:title|manga)/(?!follows|feed$)([0-9a-f-]+)"
+ r"(?:/[^/?#]+)?\?tab=art")
+ example = ("https://mangadex.org/title"
+ "/01234567-89ab-cdef-0123-456789abcdef?tab=art")
+
+ def items(self):
+ base = f"{self.root}/covers/{self.uuid}/"
+ for cover in self.api.covers_manga(self.uuid):
+ data = self._transform_cover(cover)
+ name = data["cover"]
+ text.nameext_from_url(name, data)
+ data["cover_id"] = data["filename"]
+ yield Message.Directory, data
+ yield Message.Url, f"{base}{name}", data
+
+ def _transform_cover(self, cover):
+ relationships = defaultdict(list)
+ for item in cover["relationships"]:
+ relationships[item["type"]].append(item)
+ manga = self.api.manga(relationships["manga"][0]["id"])
+ for item in manga["relationships"]:
+ relationships[item["type"]].append(item)
+
+ cattributes = cover["attributes"]
+ mattributes = manga["attributes"]
+
+ return {
+ "manga" : (mattributes["title"].get("en") or
+ next(iter(mattributes["title"].values()))),
+ "manga_id": manga["id"],
+ "status" : mattributes["status"],
+ "author" : [author["attributes"]["name"]
+ for author in relationships["author"]],
+ "artist" : [artist["attributes"]["name"]
+ for artist in relationships["artist"]],
+ "tags" : [tag["attributes"]["name"]["en"]
+ for tag in mattributes["tags"]],
+ "cover" : cattributes["fileName"],
+ "lang" : cattributes.get("locale"),
+ "volume" : text.parse_int(cattributes["volume"]),
+ "date" : text.parse_datetime(cattributes["createdAt"]),
+ "date_updated": text.parse_datetime(cattributes["updatedAt"]),
+ }
+
+
class MangadexChapterExtractor(MangadexExtractor):
"""Extractor for manga-chapters from mangadex.org"""
subcategory = "chapter"
@@ -239,6 +290,10 @@ class MangadexAPI():
params = {"includes[]": ("scanlation_group",)}
return self._call("/chapter/" + uuid, params)["data"]
+ def covers_manga(self, uuid):
+ params = {"manga[]": uuid}
+ return self._pagination_covers("/cover", params)
+
def list(self, uuid):
return self._call("/list/" + uuid, None, True)["data"]
@@ -374,6 +429,20 @@ class MangadexAPI():
return self._pagination(endpoint, params, auth)
+ def _pagination_covers(self, endpoint, params=None, auth=False):
+ if params is None:
+ params = {}
+
+ lang = self.extractor.config("lang")
+ if isinstance(lang, str) and "," in lang:
+ lang = lang.split(",")
+ params["locales"] = lang
+ params["contentRating"] = None
+ params["order[volume]"] = \
+ "desc" if self.extractor.config("chapter-reverse") else "asc"
+
+ return self._pagination(endpoint, params, auth)
+
def _pagination(self, endpoint, params, auth=False):
config = self.extractor.config
diff --git a/gallery_dl/extractor/mangataro.py b/gallery_dl/extractor/mangataro.py
new file mode 100644
index 0000000..f4cc058
--- /dev/null
+++ b/gallery_dl/extractor/mangataro.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://mangataro.org/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text
+from ..cache import memcache
+
+BASE_PATTERN = r"(?:https?://)?mangataro\.org"
+
+
+class MangataroBase():
+ """Base class for mangataro extractors"""
+ category = "mangataro"
+ root = "https://mangataro.org"
+
+
+class MangataroChapterExtractor(MangataroBase, ChapterExtractor):
+ """Extractor for mangataro manga chapters"""
+ pattern = rf"{BASE_PATTERN}(/read/([^/?#]+)/(?:[^/?#]*-)?(\d+))"
+ example = "https://mangataro.org/read/MANGA/ch123-12345"
+
+ def metadata(self, page):
+ _, slug, chapter_id = self.groups
+ comic = self._extract_jsonld(page)["@graph"][0]
+ chapter = comic["position"]
+ minor = chapter - int(chapter)
+ desc = comic["description"].split(" - ", 3)
+
+ return {
+ **_manga_info(self, slug),
+ "title" : desc[1] if len(desc) > 3 else "",
+ "chapter" : int(chapter),
+ "chapter_minor": str(round(minor, 5))[1:] if minor else "",
+ "chapter_id" : text.parse_int(chapter_id),
+ "chapter_url" : comic["url"],
+ "date" : text.parse_datetime(
+ comic["datePublished"], "%Y-%m-%dT%H:%M:%S%z"),
+ "date_updated" : text.parse_datetime(
+ comic["dateModified"], "%Y-%m-%dT%H:%M:%S%z"),
+ }
+
+ def images(self, page):
+ pos = page.find('class="comic-image-container')
+ img, pos = text.extract(page, ' src="', '"', pos)
+
+ images = [(img, None)]
+ images.extend(
+ (url, None)
+ for url in text.extract_iter(page, 'data-src="', '"', pos)
+ )
+ return images
+
+
+class MangataroMangaExtractor(MangataroBase, MangaExtractor):
+ """Extractor for mangataro manga"""
+ chapterclass = MangataroChapterExtractor
+ pattern = rf"{BASE_PATTERN}(/manga/([^/?#]+))"
+ example = "https://mangataro.org/manga/MANGA"
+
+ def chapters(self, page):
+ slug = self.groups[1]
+ manga = _manga_info(self, slug)
+
+ results = []
+ for url in text.extract_iter(text.extr(
+ page, '<div class="chapter-list', '<div id="tab-gallery"'),
+ '<a href="', '"'):
+ chapter, _, chapter_id = url[url.rfind("/")+3:].rpartition("-")
+ chapter, sep, minor = chapter.partition("-")
+ results.append((url, {
+ **manga,
+ "chapter" : text.parse_int(chapter),
+ "chapter_minor": f".{minor}" if sep else "",
+ "chapter_id" : text.parse_int(chapter_id),
+ }))
+ return results
+
+
+@memcache(keyarg=1)
+def _manga_info(self, slug):
+ url = f"{self.root}/manga/{slug}"
+ page = self.request(url).text
+ manga = self._extract_jsonld(page)
+
+ return {
+ "manga" : manga["name"].rpartition(" | ")[0].rpartition(" ")[0],
+ "manga_url" : manga["url"],
+ "cover" : manga["image"],
+ "author" : manga["author"]["name"].split(", "),
+ "genre" : manga["genre"],
+ "status" : manga["status"],
+ "description": text.unescape(text.extr(
+ page, 'id="description-content-tab">', "</div></div>")),
+ "tags" : text.split_html(text.extr(
+ page, ">Genres</h4>", "</div>")),
+ "publisher" : text.remove_html(text.extr(
+ page, '>Serialization</h4>', "</div>")),
+ }
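
_manga_info is wrapped in @memcache(keyarg=1), so the manga page is fetched
and parsed once per slug and shared between the chapter and manga
extractors. The chapter/minor split in metadata() rounds away float noise
before stripping the leading "0"; for example, position 12.5 yields
chapter 12 and chapter_minor ".5":

    def split_chapter(position):
        minor = position - int(position)
        # round(minor, 5) cleans up float artifacts such as 0.09999999999
        return int(position), str(round(minor, 5))[1:] if minor else ""

    assert split_chapter(12.5) == (12, ".5")
    assert split_chapter(7.0) == (7, "")
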
diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py
index 9c335ad..ff771fb 100644
--- a/gallery_dl/extractor/pinterest.py
+++ b/gallery_dl/extractor/pinterest.py
@@ -204,58 +204,6 @@ class PinterestExtractor(Extractor):
return media
-class PinterestPinExtractor(PinterestExtractor):
- """Extractor for images from a single pin from pinterest.com"""
- subcategory = "pin"
- pattern = BASE_PATTERN + r"/pin/([^/?#]+)(?!.*#related$)"
- example = "https://www.pinterest.com/pin/12345/"
-
- def __init__(self, match):
- PinterestExtractor.__init__(self, match)
- self.pin_id = match[1]
- self.pin = None
-
- def metadata(self):
- self.pin = self.api.pin(self.pin_id)
- return self.pin
-
- def pins(self):
- return (self.pin,)
-
-
-class PinterestBoardExtractor(PinterestExtractor):
- """Extractor for images from a board from pinterest.com"""
- subcategory = "board"
- directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}")
- archive_fmt = "{board[id]}_{id}"
- pattern = (BASE_PATTERN + r"/(?!pin/)([^/?#]+)"
- r"/(?!_saved|_created|pins/)([^/?#]+)/?(?:$|\?|#)")
- example = "https://www.pinterest.com/USER/BOARD/"
-
- def __init__(self, match):
- PinterestExtractor.__init__(self, match)
- self.user = text.unquote(match[1])
- self.board_name = text.unquote(match[2])
- self.board = None
-
- def metadata(self):
- self.board = self.api.board(self.user, self.board_name)
- return {"board": self.board}
-
- def pins(self):
- board = self.board
- pins = self.api.board_pins(board["id"])
-
- if board["section_count"] and self.config("sections", True):
- base = f"{self.root}{board['url']}id:"
- data = {"_extractor": PinterestSectionExtractor}
- sections = [(base + section["id"], data)
- for section in self.api.board_sections(board["id"])]
- pins = itertools.chain(pins, sections)
-
- return pins
-
-
class PinterestUserExtractor(PinterestExtractor):
"""Extractor for a user's boards"""
subcategory = "user"
@@ -357,6 +305,58 @@ class PinterestSearchExtractor(PinterestExtractor):
return self.api.search(self.search)
+class PinterestPinExtractor(PinterestExtractor):
+ """Extractor for images from a single pin from pinterest.com"""
+ subcategory = "pin"
+ pattern = BASE_PATTERN + r"/pin/([^/?#]+)(?!.*#related$)"
+ example = "https://www.pinterest.com/pin/12345/"
+
+ def __init__(self, match):
+ PinterestExtractor.__init__(self, match)
+ self.pin_id = match[1]
+ self.pin = None
+
+ def metadata(self):
+ self.pin = self.api.pin(self.pin_id)
+ return self.pin
+
+ def pins(self):
+ return (self.pin,)
+
+
+class PinterestBoardExtractor(PinterestExtractor):
+ """Extractor for images from a board from pinterest.com"""
+ subcategory = "board"
+ directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}")
+ archive_fmt = "{board[id]}_{id}"
+ pattern = (BASE_PATTERN + r"/(?!pin/)([^/?#]+)"
+ r"/([^/?#]+)/?(?!.*#related$)")
+ example = "https://www.pinterest.com/USER/BOARD/"
+
+ def __init__(self, match):
+ PinterestExtractor.__init__(self, match)
+ self.user = text.unquote(match[1])
+ self.board_name = text.unquote(match[2])
+ self.board = None
+
+ def metadata(self):
+ self.board = self.api.board(self.user, self.board_name)
+ return {"board": self.board}
+
+ def pins(self):
+ board = self.board
+ pins = self.api.board_pins(board["id"])
+
+ if board["section_count"] and self.config("sections", True):
+ base = f"{self.root}{board['url']}id:"
+ data = {"_extractor": PinterestSectionExtractor}
+ sections = [(base + section["id"], data)
+ for section in self.api.board_sections(board["id"])]
+ pins = itertools.chain(pins, sections)
+
+ return pins
+
+
class PinterestRelatedPinExtractor(PinterestPinExtractor):
"""Extractor for related pins of another pin from pinterest.com"""
subcategory = "related-pin"
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 9febda9..e20d80e 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -56,6 +56,7 @@ class RedditExtractor(Extractor):
urls = []
if submission:
+ submission["comment"] = None
submission["date"] = text.parse_timestamp(
submission["created_utc"])
yield Message.Directory, submission
@@ -99,14 +100,13 @@ class RedditExtractor(Extractor):
elif not submission["is_self"]:
urls.append((url, submission))
+ if selftext and (txt := submission["selftext_html"]):
+ for url in text.extract_iter(txt, ' href="', '"'):
+ urls.append((url, submission))
+
elif parentdir:
yield Message.Directory, comments[0]
- if selftext and submission:
- for url in text.extract_iter(
- submission["selftext_html"] or "", ' href="', '"'):
- urls.append((url, submission))
-
if self.api.comments:
if comments and not submission:
submission = comments[0]
@@ -115,24 +115,24 @@ class RedditExtractor(Extractor):
yield Message.Directory, submission
for comment in comments:
+ media = (embeds and "media_metadata" in comment)
html = comment["body_html"] or ""
href = (' href="' in html)
- media = (embeds and "media_metadata" in comment)
- if media or href:
- comment["date"] = text.parse_timestamp(
- comment["created_utc"])
- if submission:
- data = submission.copy()
- data["comment"] = comment
- else:
- data = comment
+ if not media and not href:
+ continue
+
+ data = submission.copy()
+ data["comment"] = comment
+ comment["date"] = text.parse_timestamp(
+ comment["created_utc"])
if media:
- for embed in self._extract_embed(comment):
- submission["num"] += 1
- text.nameext_from_url(embed, submission)
- yield Message.Url, embed, submission
+ for url in self._extract_embed(comment):
+ data["num"] += 1
+ text.nameext_from_url(url, data)
+ yield Message.Url, url, data
+ submission["num"] = data["num"]
if href:
for url in text.extract_iter(html, ' href="', '"'):
diff --git a/gallery_dl/extractor/schalenetwork.py b/gallery_dl/extractor/schalenetwork.py
index d517287..dc42417 100644
--- a/gallery_dl/extractor/schalenetwork.py
+++ b/gallery_dl/extractor/schalenetwork.py
@@ -10,7 +10,6 @@
from .common import GalleryExtractor, Extractor, Message
from .. import text, exception
-from ..cache import cache
import collections
BASE_PATTERN = (
@@ -27,6 +26,8 @@ class SchalenetworkExtractor(Extractor):
category = "schalenetwork"
root = "https://niyaniya.moe"
root_api = "https://api.schale.network"
+ root_auth = "https://auth.schale.network"
+ extr_class = None
request_interval = (0.5, 1.5)
def _init(self):
@@ -38,6 +39,7 @@ class SchalenetworkExtractor(Extractor):
def _pagination(self, endpoint, params):
url_api = self.root_api + endpoint
+ cls = self.extr_class
while True:
data = self.request_json(
@@ -49,8 +51,8 @@ class SchalenetworkExtractor(Extractor):
return
for entry in entries:
- url = f"{self.root}/g/{entry['id']}/{entry['public_key']}"
- entry["_extractor"] = SchalenetworkGalleryExtractor
+ url = f"{self.root}/g/{entry['id']}/{entry['key']}"
+ entry["_extractor"] = cls
yield Message.Queue, url, entry
try:
@@ -60,6 +62,34 @@ class SchalenetworkExtractor(Extractor):
pass
params["page"] += 1
+ def _token(self):
+ if token := self.config("token"):
+ return f"Bearer {token.rpartition(' ')[2]}"
+ raise exception.AuthRequired("'token'", "your favorites")
+
+ def _crt(self):
+ crt = self.config("crt")
+ if not crt:
+ self._require_auth()
+
+ if not text.re(r"^[0-9a-f-]+$").match(crt):
+ path, _, qs = crt.partition("?")
+ if not qs:
+ qs = path
+ crt = text.parse_query(qs).get("crt")
+ if not crt:
+ self._require_auth()
+
+ return crt
+
+ def _require_auth(self, exc=None):
+ if exc is None:
+ msg = None
+ else:
+ msg = f"{exc.status} {exc.response.reason}"
+ raise exception.AuthRequired(
+ "'crt' query parameter & matching '--user-agent'", None, msg)
+
class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
"""Extractor for schale.network galleries"""
@@ -67,7 +97,7 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
directory_fmt = ("{category}", "{id} {title}")
archive_fmt = "{id}_{num}"
request_interval = 0.0
- pattern = BASE_PATTERN + r"/(?:g|reader)/(\d+)/(\w+)"
+ pattern = rf"{BASE_PATTERN}/(?:g|reader)/(\d+)/(\w+)"
example = "https://niyaniya.moe/g/12345/67890abcde/"
TAG_TYPES = {
@@ -86,27 +116,10 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
12: "other",
}
- def __init__(self, match):
- GalleryExtractor.__init__(self, match)
- self.page_url = None
-
- def _init(self):
- self.headers = {
- "Accept" : "*/*",
- "Referer": self.root + "/",
- "Origin" : self.root,
- }
-
- self.fmt = self.config("format")
- self.cbz = self.config("cbz", True)
-
- if self.cbz:
- self.filename_fmt = "{id} {title}.{extension}"
- self.directory_fmt = ("{category}",)
-
def metadata(self, _):
- url = f"{self.root_api}/books/detail/{self.groups[1]}/{self.groups[2]}"
- self.data = data = self.request_json(url, headers=self.headers)
+ _, gid, gkey = self.groups
+ url = f"{self.root_api}/books/detail/{gid}/{gkey}"
+ data = self.request_json(url, headers=self.headers)
data["date"] = text.parse_timestamp(data["created_at"] // 1000)
tags = []
@@ -127,53 +140,42 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
data["tags_" + types[type]] = values
try:
- if self.cbz:
- data["count"] = len(data["thumbnails"]["entries"])
+ data["count"] = len(data["thumbnails"]["entries"])
del data["thumbnails"]
- del data["rels"]
except Exception:
pass
return data
def images(self, _):
- data = self.data
- fmt = self._select_format(data["data"])
+ crt = self._crt()
+ _, gid, gkey = self.groups
+ url = f"{self.root_api}/books/detail/{gid}/{gkey}?crt={crt}"
+ try:
+ data = self.request_json(url, method="POST", headers=self.headers)
+ except exception.HttpError as exc:
+ self._require_auth(exc)
- url = (f"{self.root_api}/books/data/{data['id']}/"
- f"{data['public_key']}/{fmt['id']}/{fmt['public_key']}")
- params = {
- "v": data["updated_at"],
- "w": fmt["w"],
- }
+ fmt = self._select_format(data["data"])
- if self.cbz:
- params["action"] = "dl"
- base = self.request_json(
- url, method="POST", params=params, headers=self.headers,
- )["base"]
- url = f"{base}?v={data['updated_at']}&w={fmt['w']}"
- info = text.nameext_from_url(base)
- if not info["extension"]:
- info["extension"] = "cbz"
- return ((url, info),)
-
- data = self.request_json(url, params=params, headers=self.headers)
+ url = (f"{self.root_api}/books/data/{gid}/{gkey}"
+ f"/{fmt['id']}/{fmt['key']}/{fmt['w']}?crt={crt}")
+ data = self.request_json(url, headers=self.headers)
base = data["base"]
results = []
for entry in data["entries"]:
dimensions = entry["dimensions"]
info = {
- "w": dimensions[0],
- "h": dimensions[1],
+ "width" : dimensions[0],
+ "height": dimensions[1],
"_http_headers": self.headers,
}
results.append((base + entry["path"], info))
return results
def _select_format(self, formats):
- fmt = self.fmt
+ fmt = self.config("format")
if not fmt or fmt == "best":
fmtids = ("0", "1600", "1280", "980", "780")
@@ -182,7 +184,7 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
elif isinstance(fmt, list):
fmtids = fmt
else:
- fmtids = (str(self.fmt),)
+ fmtids = (str(fmt),)
for fmtid in fmtids:
try:
@@ -203,44 +205,39 @@ class SchalenetworkGalleryExtractor(SchalenetworkExtractor, GalleryExtractor):
class SchalenetworkSearchExtractor(SchalenetworkExtractor):
"""Extractor for schale.network search results"""
subcategory = "search"
- pattern = BASE_PATTERN + r"/\?([^#]*)"
- example = "https://niyaniya.moe/?s=QUERY"
+ pattern = rf"{BASE_PATTERN}/(?:tag/([^/?#]+)|browse)?(?:/?\?([^#]*))?$"
+ example = "https://niyaniya.moe/browse?s=QUERY"
def items(self):
- params = text.parse_query(self.groups[1])
+ _, tag, qs = self.groups
+
+ params = text.parse_query(qs)
params["page"] = text.parse_int(params.get("page"), 1)
+
+ if tag is not None:
+ ns, sep, tag = text.unquote(tag).partition(":")
+ if "+" in tag:
+ tag = tag.replace("+", " ")
+ q = '"'
+ else:
+ q = ""
+ q = '"' if " " in tag else ""
+ params["s"] = f"{ns}{sep}{q}^{tag}${q}"
+
return self._pagination("/books", params)
class SchalenetworkFavoriteExtractor(SchalenetworkExtractor):
"""Extractor for schale.network favorites"""
subcategory = "favorite"
- pattern = BASE_PATTERN + r"/favorites(?:\?([^#]*))?"
+ pattern = rf"{BASE_PATTERN}/favorites(?:\?([^#]*))?"
example = "https://niyaniya.moe/favorites"
def items(self):
- self.login()
-
params = text.parse_query(self.groups[1])
params["page"] = text.parse_int(params.get("page"), 1)
- return self._pagination("/favorites", params)
-
- def login(self):
- username, password = self._get_auth_info()
- if username:
- self.headers["Authorization"] = \
- "Bearer " + self._login_impl(username, password)
- return
-
- raise exception.AuthenticationError("Username and password required")
-
- @cache(maxage=86400, keyarg=1)
- def _login_impl(self, username, password):
- self.log.info("Logging in as %s", username)
+ self.headers["Authorization"] = self._token()
+ return self._pagination(f"/books/favorites?crt={self._crt()}", params)
- url = "https://auth.schale.network/login"
- data = {"uname": username, "passwd": password}
- response = self.request(
- url, method="POST", headers=self.headers, data=data)
- return response.json()["session"]
+SchalenetworkExtractor.extr_class = SchalenetworkGalleryExtractor
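
Username/password login is removed: favorites now require a "token" config
value (the Bearer token of a logged-in session), and gallery downloads
require the "crt" query parameter plus a matching --user-agent. Since
_crt() also accepts a full URL or query string and parses the crt value out
of it, a hypothetical config entry (placeholder values) could look like:

    {
        "extractor": {
            "schalenetwork": {
                "token": "Bearer eyJhbGciOi...",
                "crt"  : "0123abcd-4567-89ab-cdef-0123456789ab"
            }
        }
    }
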
diff --git a/gallery_dl/extractor/simpcity.py b/gallery_dl/extractor/simpcity.py
index 8cc7e38..3354289 100644
--- a/gallery_dl/extractor/simpcity.py
+++ b/gallery_dl/extractor/simpcity.py
@@ -20,18 +20,20 @@ class SimpcityExtractor(Extractor):
root = "https://simpcity.cr"
def items(self):
- extract_urls = text.re(r' href="([^"]+)').findall
+ extract_urls = text.re(
+ r'<(?:a [^>]*?href|iframe [^>]*?src)="([^"]+)').findall
for post in self.posts():
urls = extract_urls(post["content"])
data = {"post": post}
post["count"] = data["count"] = len(urls)
+ yield Message.Directory, data
for data["num"], url in enumerate(urls, 1):
yield Message.Queue, url, data
def request_page(self, url):
try:
- return self.request(url).text
+ return self.request(url)
except exception.HttpError as exc:
if exc.status == 403 and b">Log in<" in exc.response.content:
msg = text.extr(exc.response.text, "blockMessage--error", "</")
@@ -44,14 +46,14 @@ class SimpcityExtractor(Extractor):
base = f"{self.root}{base}"
if pnum is None:
- url = base
+ url = f"{base}/"
pnum = 1
else:
url = f"{base}/page-{pnum}"
pnum = None
while True:
- page = self.request_page(url)
+ page = self.request_page(url).text
yield page
@@ -60,6 +62,31 @@ class SimpcityExtractor(Extractor):
pnum += 1
url = f"{base}/page-{pnum}"
+ def _pagination_reverse(self, base, pnum=None):
+ base = f"{self.root}{base}"
+
+ url = f"{base}/page-9999" # force redirect to last page
+ with self.request_page(url) as response:
+ url = response.url
+ if url[-1] == "/":
+ pnum = 1
+ else:
+ pnum = text.parse_int(url[url.rfind("-")+1:], 1)
+ page = response.text
+
+ while True:
+ yield page
+
+ pnum -= 1
+ if pnum > 1:
+ url = f"{base}/page-{pnum}"
+ elif pnum == 1:
+ url = f"{base}/"
+ else:
+ return
+
+ page = self.request_page(url).text
+
def _parse_thread(self, page):
schema = self._extract_jsonld(page)["mainEntity"]
author = schema["author"]
@@ -92,7 +119,8 @@ class SimpcityExtractor(Extractor):
"id": extr('data-content="post-', '"'),
"author_url": extr('itemprop="url" content="', '"'),
"date": text.parse_datetime(extr('datetime="', '"')),
- "content": extr('<div itemprop="text">', "\t\t</div>").strip(),
+ "content": extr('<div itemprop="text">',
+ '<div class="js-selectToQuote').strip(),
}
url_a = post["author_url"]
@@ -109,7 +137,7 @@ class SimpcityPostExtractor(SimpcityExtractor):
def posts(self):
post_id = self.groups[0]
url = f"{self.root}/posts/{post_id}/"
- page = self.request_page(url)
+ page = self.request_page(url).text
pos = page.find(f'data-content="post-{post_id}"')
if pos < 0:
@@ -126,10 +154,22 @@ class SimpcityThreadExtractor(SimpcityExtractor):
example = "https://simpcity.cr/threads/TITLE.12345/"
def posts(self):
- for page in self._pagination(*self.groups):
+ if (order := self.config("order-posts")) and \
+ order[0] not in ("d", "r"):
+ pages = self._pagination(*self.groups)
+ reverse = False
+ else:
+ pages = self._pagination_reverse(*self.groups)
+ reverse = True
+
+ for page in pages:
if "thread" not in self.kwdict:
self.kwdict["thread"] = self._parse_thread(page)
- for html in text.extract_iter(page, "<article ", "</article>"):
+ posts = text.extract_iter(page, "<article ", "</article>")
+ if reverse:
+ posts = list(posts)
+ posts.reverse()
+ for html in posts:
yield self._parse_post(html)
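
Like bellazon above, thread extraction now honors an "order-posts" option:
only a value that does not start with "d" or "r" (e.g. "asc") keeps the old
first-to-last iteration; when the option is unset or set to
"desc"/"reverse", pages are fetched last-to-first and each page's <article>
list is reversed. Hypothetical config excerpt to restore forward iteration:

    {
        "extractor": {
            "simpcity": {
                "order-posts": "asc"
            }
        }
    }
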
diff --git a/gallery_dl/extractor/thehentaiworld.py b/gallery_dl/extractor/thehentaiworld.py
new file mode 100644
index 0000000..055d7d8
--- /dev/null
+++ b/gallery_dl/extractor/thehentaiworld.py
@@ -0,0 +1,139 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://thehentaiworld.com/"""
+
+from .common import Extractor, Message
+from .. import text, util
+import collections
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?thehentaiworld\.com"
+
+
+class ThehentaiworldExtractor(Extractor):
+ """Base class for thehentaiworld extractors"""
+ category = "thehentaiworld"
+ root = "https://thehentaiworld.com"
+ filename_fmt = "{title} ({id}{num:?-//}).{extension}"
+ archive_fmt = "{id}_{num}"
+ request_interval = (0.5, 1.5)
+
+ def items(self):
+ for url in self.posts():
+ try:
+ post = self._extract_post(url)
+ except Exception as exc:
+ self.status |= 1
+ self.log.warning("Failed to extract post %s (%s: %s)",
+ url, exc.__class__.__name__, exc)
+ continue
+
+ if "file_urls" in post:
+ urls = post["file_urls"]
+ post["count"] = len(urls)
+ yield Message.Directory, post
+ for post["num"], url in enumerate(urls, 1):
+ text.nameext_from_url(url, post)
+ yield Message.Url, url, post
+ else:
+ yield Message.Directory, post
+ url = post["file_url"]
+ text.nameext_from_url(url, post)
+ yield Message.Url, url, post
+
+ def _extract_post(self, url):
+ extr = text.extract_from(self.request(url).text)
+
+ post = {
+ "num" : 0,
+ "count" : 1,
+ "title" : text.unescape(extr("<title>", "<").strip()),
+ "id" : text.parse_int(extr(" postid-", " ")),
+ "slug" : extr(" post-", '"'),
+ "tags" : extr('id="tagsHead">', "</ul>"),
+ "date" : text.parse_datetime(extr(
+ "<li>Posted: ", "<"), "%Y-%m-%d"),
+ }
+
+ if "/videos/" in url:
+ post["type"] = "video"
+ post["width"] = post["height"] = 0
+ post["votes"] = text.parse_int(extr("(<strong>", "</strong>"))
+ post["score"] = text.parse_float(extr("<strong>", "<"))
+ post["file_url"] = extr('<source src="', '"')
+ else:
+ post["type"] = "image"
+ post["width"] = text.parse_int(extr("<li>Size: ", " "))
+ post["height"] = text.parse_int(extr("x ", "<"))
+ post["file_url"] = extr('a href="', '"')
+ post["votes"] = text.parse_int(extr("(<strong>", "</strong>"))
+ post["score"] = text.parse_float(extr("<strong>", "<"))
+
+ if doujin := extr('<a id="prev-page"', "</div></div><"):
+ repl = text.re(r"-220x\d+\.").sub
+ post["file_urls"] = [
+ repl(".", url)
+ for url in text.extract_iter(
+ doujin, 'class="border" src="', '"')
+ ]
+
+ tags = collections.defaultdict(list)
+ pattern = text.re(r'<li><a class="([^"]*)" href="[^"]*">([^<]+)')
+ for tag_type, tag_name in pattern.findall(post["tags"]):
+ tags[tag_type].append(tag_name)
+ post["tags"] = tags_list = []
+ for key, value in tags.items():
+ tags_list.extend(value)
+ post[f"tags_{key}" if key else "tags_general"] = value
+
+ return post
+
+ def _pagination(self, endpoint):
+ base = f"{self.root}{endpoint}"
+ pnum = self.page_start
+
+ while True:
+ url = base if pnum < 2 else f"{base}page/{pnum}/"
+ page = self.request(url).text
+
+ yield from text.extract_iter(text.extr(
+ page, 'id="thumbContainer"', "<script"), ' href="', '"')
+
+ if 'class="next"' not in page:
+ return
+ pnum += 1
+
+
+class ThehentaiworldPostExtractor(ThehentaiworldExtractor):
+ subcategory = "post"
+ pattern = (rf"{BASE_PATTERN}"
+ rf"(/(?:(?:3d-cgi-)?hentai-image|video)s/([^/?#]+))")
+ example = "https://thehentaiworld.com/hentai-images/SLUG/"
+
+ def posts(self):
+ return (f"{self.root}{self.groups[0]}/",)
+
+
+class ThehentaiworldTagExtractor(ThehentaiworldExtractor):
+ subcategory = "tag"
+ per_page = 24
+ page_start = 1
+ post_start = 0
+ directory_fmt = ("{category}", "{search_tags}")
+ pattern = rf"{BASE_PATTERN}/tag/([^/?#]+)"
+ example = "https://thehentaiworld.com/tag/TAG/"
+
+ def posts(self):
+ self.kwdict["search_tags"] = tag = self.groups[0]
+ return util.advance(self._pagination(f"/tag/{tag}/"), self.post_start)
+
+ def skip(self, num):
+ pages, posts = divmod(num, self.per_page)
+ self.page_start += pages
+ self.post_start += posts
+ return num
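
skip() converts an absolute post offset into whole pages plus an in-page
remainder with divmod; with 24 posts per page, skipping 50 posts advances
page_start by 2 and post_start by 2:

    pages, posts = divmod(50, 24)
    assert (pages, posts) == (2, 2)   # skip 2 full pages, then 2 more posts
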
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index ed3cfae..e6c84d1 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -2070,7 +2070,7 @@ class TwitterAPI():
quoted = tweet["quoted_status_result"]["result"]
quoted["legacy"]["quoted_by"] = (
tweet["core"]["user_results"]["result"]
- ["legacy"]["screen_name"])
+ ["core"]["screen_name"])
quoted["legacy"]["quoted_by_id_str"] = tweet["rest_id"]
quoted["sortIndex"] = entry.get("sortIndex")
diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py
index e53ecf4..294fc57 100644
--- a/gallery_dl/extractor/vipergirls.py
+++ b/gallery_dl/extractor/vipergirls.py
@@ -51,8 +51,16 @@ class VipergirlsExtractor(Extractor):
like = False
posts = root.iter("post")
- if self.page:
- util.advance(posts, (text.parse_int(self.page[5:]) - 1) * 15)
+ if (order := self.config("order-posts")) and \
+ order[0] not in ("d", "r"):
+ if self.page:
+ util.advance(posts, (text.parse_int(self.page[5:]) - 1) * 15)
+ else:
+ posts = list(posts)
+ if self.page:
+ offset = text.parse_int(self.page[5:]) * 15
+ posts = posts[:offset]
+ posts.reverse()
for post in posts:
images = list(post)
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index 9d98e68..9369e5d 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -489,9 +489,6 @@ class DownloadJob(Job):
self.extractor.cookies_store()
- if "finalize" in hooks:
- for callback in hooks["finalize"]:
- callback(pathfmt)
if self.status:
if "finalize-error" in hooks:
for callback in hooks["finalize-error"]:
@@ -500,6 +497,9 @@ class DownloadJob(Job):
if "finalize-success" in hooks:
for callback in hooks["finalize-success"]:
callback(pathfmt)
+ if "finalize" in hooks:
+ for callback in hooks["finalize"]:
+ callback(pathfmt)
def handle_skip(self):
pathfmt = self.pathfmt
diff --git a/gallery_dl/postprocessor/common.py b/gallery_dl/postprocessor/common.py
index 8da8417..9992c56 100644
--- a/gallery_dl/postprocessor/common.py
+++ b/gallery_dl/postprocessor/common.py
@@ -54,7 +54,11 @@ class PostProcessor():
else:
self.log.debug(
"Using %s archive '%s'", self.name, archive_path)
+ job.register_hooks({"finalize": self._close_archive})
return True
self.archive = None
return False
+
+ def _close_archive(self, _):
+ self.archive.close()
diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py
index c74f92f..a6d2b7f 100644
--- a/gallery_dl/postprocessor/metadata.py
+++ b/gallery_dl/postprocessor/metadata.py
@@ -45,6 +45,15 @@ class MetadataPP(PostProcessor):
cfmt = "\n".join(cfmt) + "\n"
self._content_fmt = formatter.parse(cfmt).format_map
ext = "txt"
+ elif mode == "print":
+ nl = "\n"
+ if isinstance(cfmt, list):
+ cfmt = f"{nl.join(cfmt)}{nl}"
+ if cfmt[-1] != nl and (cfmt[0] != "\f" or cfmt[1] == "F"):
+ cfmt = f"{cfmt}{nl}"
+ self.write = self._write_custom
+ self._content_fmt = formatter.parse(cfmt).format_map
+ filename = "-"
elif mode == "jsonl":
self.write = self._write_json
self._json_encode = self._make_encoder(options).encode
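
The new "print" mode reuses the custom-format writer but targets filename
"-" (stdout) and guarantees a trailing newline unless the format string is
a "\f"-prefixed directive other than \fF. A hedged config sketch, assuming
the format string is supplied via "content-format" as in the existing
"custom" mode:

    {
        "name"          : "metadata",
        "mode"          : "print",
        "content-format": "{title} ({id})"
    }
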
diff --git a/gallery_dl/postprocessor/python.py b/gallery_dl/postprocessor/python.py
index db71da2..66d9343 100644
--- a/gallery_dl/postprocessor/python.py
+++ b/gallery_dl/postprocessor/python.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2023 Mike Fährmann
+# Copyright 2023-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -17,13 +17,14 @@ class PythonPP(PostProcessor):
def __init__(self, job, options):
PostProcessor.__init__(self, job)
- spec = options["function"]
- module_name, _, function_name = spec.rpartition(":")
- module = util.import_file(module_name)
- self.function = getattr(module, function_name)
-
- if self._init_archive(job, options):
- self.run = self.run_archive
+ mode = options.get("mode")
+ if mode == "eval" or not mode and options.get("expression"):
+ self.function = util.compile_expression(options["expression"])
+ else:
+ spec = options["function"]
+ module_name, _, function_name = spec.rpartition(":")
+ module = util.import_file(module_name)
+ self.function = getattr(module, function_name)
events = options.get("event")
if events is None:
@@ -32,6 +33,9 @@ class PythonPP(PostProcessor):
events = events.split(",")
job.register_hooks({event: self.run for event in events}, options)
+ if self._init_archive(job, options):
+ self.run = self.run_archive
+
def run(self, pathfmt):
self.function(pathfmt.kwdict)
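
PythonPP can now run an inline expression instead of an imported function:
with mode "eval" (or when an "expression" option is present and no mode is
given), the expression is compiled once and evaluated with each file's
metadata dict as its namespace. Hypothetical postprocessor config:

    {
        "name"      : "python",
        "mode"      : "eval",
        "expression": "print(title, extension)"
    }
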
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 7b9ce99..49c1ba8 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -542,6 +542,7 @@ def language_to_code(lang, default=None):
CODES = {
"ar": "Arabic",
"bg": "Bulgarian",
+ "bn": "Bengali",
"ca": "Catalan",
"cs": "Czech",
"da": "Danish",
@@ -549,9 +550,11 @@ CODES = {
"el": "Greek",
"en": "English",
"es": "Spanish",
+ "fa": "Persian",
"fi": "Finnish",
"fr": "French",
"he": "Hebrew",
+ "hi": "Hindi",
"hu": "Hungarian",
"id": "Indonesian",
"it": "Italian",
@@ -564,9 +567,13 @@ CODES = {
"pt": "Portuguese",
"ro": "Romanian",
"ru": "Russian",
+ "sk": "Slovak",
+ "sl": "Slovenian",
+ "sr": "Serbian",
"sv": "Swedish",
"th": "Thai",
"tr": "Turkish",
+ "uk": "Ukrainian",
"vi": "Vietnamese",
"zh": "Chinese",
}
@@ -634,6 +641,12 @@ class NullResponse():
self.url = url
self.reason = str(reason)
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ pass
+
def __str__(self):
return "900 " + self.reason
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 277d679..4861a9d 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,5 +6,5 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.30.7"
+__version__ = "1.30.8"
__variant__ = None
diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py
index cfc6b50..0296498 100644
--- a/gallery_dl/ytdl.py
+++ b/gallery_dl/ytdl.py
@@ -80,7 +80,10 @@ def parse_command_line(module, argv):
parser, opts, args = module.parseOpts(argv)
ytdlp = hasattr(module, "cookies")
- std_headers = module.std_headers
+ try:
+ std_headers = module.utils.networking.std_headers
+ except AttributeError:
+ std_headers = module.std_headers
try:
parse_bytes = module.parse_bytes
@@ -345,7 +348,7 @@ def parse_command_line(module, argv):
"nopart": opts.nopart,
"updatetime": opts.updatetime,
"writedescription": opts.writedescription,
- "writeannotations": opts.writeannotations,
+ "writeannotations": getattr(opts, "writeannotations", None),
"writeinfojson": opts.writeinfojson,
"allow_playlist_files": opts.allow_playlist_files,
"clean_infojson": opts.clean_infojson,
@@ -378,7 +381,8 @@ def parse_command_line(module, argv):
"max_views": opts.max_views,
"daterange": date,
"cachedir": opts.cachedir,
- "youtube_print_sig_code": opts.youtube_print_sig_code,
+ "youtube_print_sig_code": getattr(
+ opts, "youtube_print_sig_code", None),
"age_limit": opts.age_limit,
"download_archive": download_archive_fn,
"break_on_existing": getattr(opts, "break_on_existing", None),
@@ -394,8 +398,8 @@ def parse_command_line(module, argv):
"socket_timeout": opts.socket_timeout,
"bidi_workaround": opts.bidi_workaround,
"debug_printtraffic": opts.debug_printtraffic,
- "prefer_ffmpeg": opts.prefer_ffmpeg,
- "include_ads": opts.include_ads,
+ "prefer_ffmpeg": getattr(opts, "prefer_ffmpeg", None),
+ "include_ads": getattr(opts, "include_ads", None),
"default_search": opts.default_search,
"dynamic_mpd": getattr(opts, "dynamic_mpd", None),
"extractor_args": getattr(opts, "extractor_args", None),
@@ -420,7 +424,7 @@ def parse_command_line(module, argv):
opts, "sleep_interval_subtitles", None),
"external_downloader": opts.external_downloader,
"playlist_items": opts.playlist_items,
- "xattr_set_filesize": opts.xattr_set_filesize,
+ "xattr_set_filesize": getattr(opts, "xattr_set_filesize", None),
"match_filter": match_filter,
"no_color": getattr(opts, "no_color", None),
"ffmpeg_location": opts.ffmpeg_location,
@@ -430,7 +434,7 @@ def parse_command_line(module, argv):
opts, "hls_split_discontinuity", None),
"external_downloader_args": opts.external_downloader_args,
"postprocessor_args": opts.postprocessor_args,
- "cn_verification_proxy": opts.cn_verification_proxy,
+ "cn_verification_proxy": getattr(opts, "cn_verification_proxy", None),
"geo_verification_proxy": opts.geo_verification_proxy,
"geo_bypass": getattr(
opts, "geo_bypass", "default"),