about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Unit 193 <unit193@unit193.net> 2024-12-22 05:45:18 -0500
committer: Unit 193 <unit193@unit193.net> 2024-12-22 05:45:18 -0500
commit: bb8260277ab7483652c6c1526a15d62da92acc96 (patch)
tree: 02959c9d5aceb66f4429e0be1bc927921e01bbdc
parent: f6877087773089220d68288d055276fca6c556d4 (diff)
New upstream version 1.28.2. [upstream/1.28.2]
-rw-r--r-- CHANGELOG.md 47
-rw-r--r-- PKG-INFO 6
-rw-r--r-- README.rst 4
-rw-r--r-- data/man/gallery-dl.1 2
-rw-r--r-- data/man/gallery-dl.conf.5 32
-rw-r--r-- docs/gallery-dl.conf 2
-rw-r--r-- gallery_dl.egg-info/PKG-INFO 6
-rw-r--r-- gallery_dl.egg-info/SOURCES.txt 2
-rw-r--r-- gallery_dl/downloader/http.py 11
-rw-r--r-- gallery_dl/extractor/__init__.py 2
-rw-r--r-- gallery_dl/extractor/bilibili.py 10
-rw-r--r-- gallery_dl/extractor/bluesky.py 4
-rw-r--r-- gallery_dl/extractor/cohost.py 30
-rw-r--r-- gallery_dl/extractor/common.py 25
-rw-r--r-- gallery_dl/extractor/cyberdrop.py 24
-rw-r--r-- gallery_dl/extractor/deviantart.py 26
-rw-r--r-- gallery_dl/extractor/facebook.py 3
-rw-r--r-- gallery_dl/extractor/instagram.py 10
-rw-r--r-- gallery_dl/extractor/itaku.py 46
-rw-r--r-- gallery_dl/extractor/kemonoparty.py 4
-rw-r--r-- gallery_dl/extractor/lofter.py 147
-rw-r--r-- gallery_dl/extractor/recursive.py 3
-rw-r--r-- gallery_dl/extractor/saint.py 2
-rw-r--r-- gallery_dl/extractor/tapas.py 124
-rw-r--r-- gallery_dl/extractor/yiffverse.py 157
-rw-r--r-- gallery_dl/extractor/zerochan.py 37
-rw-r--r-- gallery_dl/job.py 1
-rw-r--r-- gallery_dl/util.py 8
-rw-r--r-- gallery_dl/version.py 2
29 files changed, 645 insertions, 132 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b831cd4..2df827d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,26 +1,31 @@
-## 1.28.1 - 2024-12-07
+## 1.28.2 - 2024-12-20
### Extractors
#### Additions
-- [bluesky] add `info` extractor
+- [cyberdrop] add extractor for media URLs ([#2496](https://github.com/mikf/gallery-dl/issues/2496))
+- [itaku] add `search` extractor ([#6613](https://github.com/mikf/gallery-dl/issues/6613))
+- [lofter] add initial support ([#650](https://github.com/mikf/gallery-dl/issues/650), [#2294](https://github.com/mikf/gallery-dl/issues/2294), [#4095](https://github.com/mikf/gallery-dl/issues/4095), [#4728](https://github.com/mikf/gallery-dl/issues/4728), [#5656](https://github.com/mikf/gallery-dl/issues/5656), [#6607](https://github.com/mikf/gallery-dl/issues/6607))
+- [yiffverse] add support ([#6611](https://github.com/mikf/gallery-dl/issues/6611))
#### Fixes
-- [bluesky] fix exception when encountering non-quote embeds ([#6577](https://github.com/mikf/gallery-dl/issues/6577))
-- [bluesky] unescape search queries ([#6579](https://github.com/mikf/gallery-dl/issues/6579))
-- [common] restore using environment proxy settings by default ([#6553](https://github.com/mikf/gallery-dl/issues/6553), [#6609](https://github.com/mikf/gallery-dl/issues/6609))
-- [common] improve handling of `user-agent` settings ([#6594](https://github.com/mikf/gallery-dl/issues/6594))
-- [e621] fix `TypeError` when `metadata` is enabled ([#6587](https://github.com/mikf/gallery-dl/issues/6587))
-- [gofile] fix website token extraction ([#6596](https://github.com/mikf/gallery-dl/issues/6596))
-- [inkbunny] fix re-login loop ([#6618](https://github.com/mikf/gallery-dl/issues/6618))
-- [instagram] handle empty `carousel_media` entries ([#6595](https://github.com/mikf/gallery-dl/issues/6595))
-- [kemonoparty] fix `o` query parameter handling ([#6597](https://github.com/mikf/gallery-dl/issues/6597))
-- [nhentai] fix download URLs ([#6620](https://github.com/mikf/gallery-dl/issues/6620))
-- [readcomiconline] fix `chapter` extraction ([#6070](https://github.com/mikf/gallery-dl/issues/6070), [#6335](https://github.com/mikf/gallery-dl/issues/6335))
-- [realbooru] fix extraction ([#6543](https://github.com/mikf/gallery-dl/issues/6543))
-- [rule34] fix `favorite` extraction ([#6573](https://github.com/mikf/gallery-dl/issues/6573))
-- [zerochan] download `.webp` and `.gif` files ([#6576](https://github.com/mikf/gallery-dl/issues/6576))
+- [facebook] decode Unicode surrogate pairs in metadata values ([#6599](https://github.com/mikf/gallery-dl/issues/6599))
+- [zerochan] parse API responses manually when receiving invalid JSON ([#6632](https://github.com/mikf/gallery-dl/issues/6632))
+- [zerochan] fix `source` metadata extraction when not logged in
#### Improvements
-- [hentaicosplays] update domains ([#6578](https://github.com/mikf/gallery-dl/issues/6578))
-- [pixiv:ranking] implement filtering results by `content` ([#6574](https://github.com/mikf/gallery-dl/issues/6574))
-- [pixiv] include user ID in failed AJAX request warnings ([#6581](https://github.com/mikf/gallery-dl/issues/6581))
+- [bilibili] extract files from `module_top` entries ([#6687](https://github.com/mikf/gallery-dl/issues/6687))
+- [bilibili] support `/upload/opus` URLs ([#6687](https://github.com/mikf/gallery-dl/issues/6687))
+- [bluesky] default to `posts` timeline when `reposts` or `quoted` is enabled ([#6583](https://github.com/mikf/gallery-dl/issues/6583))
+- [common] simplify HTTP error messages
+- [common] detect `DDoS-Guard` challenge pages
+- [deviantart] improve `tiptap` markup to HTML conversion ([#6686](https://github.com/mikf/gallery-dl/issues/6686))
+ - fix `KeyError: 'attrs'` for links without `href`
+ - support `heading` content blocks
+ - support `strike` text markers
+- [instagram] extract `date` metadata for stories ([#6677](https://github.com/mikf/gallery-dl/issues/6677))
+- [kemonoparty:favorite] support new URL format ([#6676](https://github.com/mikf/gallery-dl/issues/6676))
+- [saint] support `saint2.cr` URLs ([#6692](https://github.com/mikf/gallery-dl/issues/6692))
+- [tapas] improve extractor hierarchy ([#6680](https://github.com/mikf/gallery-dl/issues/6680))
#### Options
-- [patreon] add `format-images` option ([#6569](https://github.com/mikf/gallery-dl/issues/6569))
-- [zerochan] add `extensions` option ([#6576](https://github.com/mikf/gallery-dl/issues/6576))
+- [cohost] add `avatar` and `background` options ([#6656](https://github.com/mikf/gallery-dl/issues/6656))
+### Miscellaneous
+- support `*` wildcards for `parent>child` categories, for example `reddit>*` ([#6673](https://github.com/mikf/gallery-dl/issues/6673))
+- use latest Firefox UA as default `user-agent`
+- use random unused port for `"user-agent": "browser"` requests
diff --git a/PKG-INFO b/PKG-INFO
index f82026d..d5fce98 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery_dl
-Version: 1.28.1
+Version: 1.28.2
Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -117,9 +117,9 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.28.1/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.28.2/gallery-dl.exe>`__
(Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.28.1/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.28.2/gallery-dl.bin>`__
Nightly Builds
diff --git a/README.rst b/README.rst
index 63d400f..240dfe5 100644
--- a/README.rst
+++ b/README.rst
@@ -76,9 +76,9 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.28.1/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.28.2/gallery-dl.exe>`__
(Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.28.1/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.28.2/gallery-dl.bin>`__
Nightly Builds
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index 96c01a0..3d84f58 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2024-12-07" "1.28.1" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2024-12-20" "1.28.2" "gallery-dl Manual"
.\" disable hyphenation
.nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index e2c1e14..c27f632 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2024-12-07" "1.28.1" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2024-12-20" "1.28.2" "gallery-dl Manual"
.\" disable hyphenation
.nh
.\" disable justification (adjust text to left margin only)
@@ -1697,7 +1697,12 @@ Download embedded videos hosted on https://www.blogger.com/
* \f[I]list\f[] of \f[I]strings\f[]
.IP "Default:" 9
-\f[I]"media"\f[]
+.br
+* \f[I]"posts"\f[] if
+\f[I]reposts\f[] or
+\f[I]quoted\f[] is enabled
+.br
+* \f[I]"media"\f[] otherwise
.IP "Example:" 4
.br
@@ -1710,6 +1715,7 @@ A (comma-separated) list of subcategories to include
when processing a user profile.
Possible values are
+\f[I]"info"\f[],
\f[I]"avatar"\f[],
\f[I]"background"\f[],
\f[I]"posts"\f[],
@@ -2057,6 +2063,28 @@ to download images in JPEG format at their original resolution.
Extract \f[I]ask\f[] posts.
+.SS extractor.cohost.avatar
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Download \f[I]avatar\f[] images.
+
+
+.SS extractor.cohost.background
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Download \f[I]background\f[]/\f[I]banner\f[]/\f[I]header\f[] images.
+
+
.SS extractor.cohost.pinned
.IP "Type:" 6
\f[I]bool\f[]
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index 4dc2e14..3d73869 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -176,6 +176,8 @@
"cohost":
{
"asks" : true,
+ "avatar" : false,
+ "background": false,
"pinned" : false,
"replies": true,
"shares" : true
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index f82026d..d5fce98 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery_dl
-Version: 1.28.1
+Version: 1.28.2
Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -117,9 +117,9 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.28.1/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.28.2/gallery-dl.exe>`__
(Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.28.1/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.28.2/gallery-dl.bin>`__
Nightly Builds
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index 398c9f7..42dd483 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -139,6 +139,7 @@ gallery_dl/extractor/lensdump.py
gallery_dl/extractor/lexica.py
gallery_dl/extractor/lightroom.py
gallery_dl/extractor/livedoor.py
+gallery_dl/extractor/lofter.py
gallery_dl/extractor/lolisafe.py
gallery_dl/extractor/luscious.py
gallery_dl/extractor/lynxchan.py
@@ -244,6 +245,7 @@ gallery_dl/extractor/wikifeet.py
gallery_dl/extractor/wikimedia.py
gallery_dl/extractor/xhamster.py
gallery_dl/extractor/xvideos.py
+gallery_dl/extractor/yiffverse.py
gallery_dl/extractor/ytdl.py
gallery_dl/extractor/zerochan.py
gallery_dl/extractor/zzup.py
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index 54750ac..c8aeef8 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -144,7 +144,16 @@ class HttpDownloader(DownloaderBase):
proxies=self.proxies,
verify=self.verify,
)
- except (ConnectionError, Timeout) as exc:
+ except ConnectionError as exc:
+ try:
+ reason = exc.args[0].reason
+ cls = reason.__class__.__name__
+ pre, _, err = str(reason.args[-1]).partition(":")
+ msg = "{}: {}".format(cls, (err or pre).lstrip())
+ except Exception:
+ msg = str(exc)
+ continue
+ except Timeout as exc:
msg = str(exc)
continue
except Exception as exc:
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 8d5f3d0..d003a61 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -98,6 +98,7 @@ modules = [
"lexica",
"lightroom",
"livedoor",
+ "lofter",
"luscious",
"lynxchan",
"mangadex",
@@ -195,6 +196,7 @@ modules = [
"wikimedia",
"xhamster",
"xvideos",
+ "yiffverse",
"zerochan",
"zzup",
"booru",
diff --git a/gallery_dl/extractor/bilibili.py b/gallery_dl/extractor/bilibili.py
index d5c419e..b9de165 100644
--- a/gallery_dl/extractor/bilibili.py
+++ b/gallery_dl/extractor/bilibili.py
@@ -23,7 +23,8 @@ class BilibiliExtractor(Extractor):
class BilibiliUserArticlesExtractor(BilibiliExtractor):
"""Extractor for a bilibili user's articles"""
subcategory = "user-articles"
- pattern = r"(?:https?://)?space\.bilibili\.com/(\d+)/article"
+ pattern = (r"(?:https?://)?space\.bilibili\.com/(\d+)"
+ r"/(?:article|upload/opus)")
example = "https://space.bilibili.com/12345/article"
def items(self):
@@ -56,6 +57,13 @@ class BilibiliArticleExtractor(BilibiliExtractor):
article["username"] = modules["module_author"]["name"]
pics = []
+
+ if "module_top" in modules:
+ try:
+ pics.extend(modules["module_top"]["display"]["album"]["pics"])
+ except Exception:
+ pass
+
for paragraph in modules['module_content']['paragraphs']:
if "pic" not in paragraph:
continue
diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py
index f60ea15..f8fef93 100644
--- a/gallery_dl/extractor/bluesky.py
+++ b/gallery_dl/extractor/bluesky.py
@@ -204,6 +204,8 @@ class BlueskyUserExtractor(BlueskyExtractor):
def items(self):
base = "{}/profile/{}/".format(self.root, self.user)
+ default = ("posts" if self.config("quoted", False) or
+ self.config("reposts", False) else "media")
return self._dispatch_extractors((
(BlueskyInfoExtractor , base + "info"),
(BlueskyAvatarExtractor , base + "avatar"),
@@ -212,7 +214,7 @@ class BlueskyUserExtractor(BlueskyExtractor):
(BlueskyRepliesExtractor , base + "replies"),
(BlueskyMediaExtractor , base + "media"),
(BlueskyLikesExtractor , base + "likes"),
- ), ("media",))
+ ), (default,))
class BlueskyPostsExtractor(BlueskyExtractor):
diff --git a/gallery_dl/extractor/cohost.py b/gallery_dl/extractor/cohost.py
index 0524239..6a43224 100644
--- a/gallery_dl/extractor/cohost.py
+++ b/gallery_dl/extractor/cohost.py
@@ -19,7 +19,7 @@ class CohostExtractor(Extractor):
category = "cohost"
root = "https://cohost.org"
directory_fmt = ("{category}", "{postingProject[handle]}")
- filename_fmt = ("{postId}_{headline:?/_/[b:200]}{num}.{extension}")
+ filename_fmt = ("{postId}{headline:?_//[b:200]}{num:?_//}.{extension}")
archive_fmt = "{postId}_{num}"
def _init(self):
@@ -28,6 +28,14 @@ class CohostExtractor(Extractor):
self.shares = self.config("shares", False)
self.asks = self.config("asks", True)
+ self.avatar = self.config("avatar", False)
+ if self.avatar:
+ self._urls_avatar = {None, ""}
+
+ self.background = self.config("background", False)
+ if self.background:
+ self._urls_background = {None, ""}
+
def items(self):
for post in self.posts():
reason = post.get("limitedVisibilityReason")
@@ -43,6 +51,26 @@ class CohostExtractor(Extractor):
post["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
yield Message.Directory, post
+
+ project = post["postingProject"]
+ if self.avatar:
+ url = project.get("avatarURL")
+ if url not in self._urls_avatar:
+ self._urls_avatar.add(url)
+ p = post.copy()
+ p["postId"] = p["kind"] = "avatar"
+ p["headline"] = p["num"] = ""
+ yield Message.Url, url, text.nameext_from_url(url, p)
+
+ if self.background:
+ url = project.get("headerURL")
+ if url not in self._urls_background:
+ self._urls_background.add(url)
+ p = post.copy()
+ p["postId"] = p["kind"] = "background"
+ p["headline"] = p["num"] = ""
+ yield Message.Url, url, text.nameext_from_url(url, p)
+
for post["num"], file in enumerate(files, 1):
url = file["fileURL"]
post.update(file)
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 5f9d355..5ada030 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -42,8 +42,7 @@ class Extractor():
ciphers = None
tls12 = True
browser = None
- useragent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; "
- "rv:128.0) Gecko/20100101 Firefox/128.0")
+ useragent = util.USERAGENT_FIREFOX
request_interval = 0.0
request_interval_min = 0.0
request_interval_429 = 60.0
@@ -172,8 +171,16 @@ class Extractor():
while True:
try:
response = session.request(method, url, **kwargs)
- except (requests.exceptions.ConnectionError,
- requests.exceptions.Timeout,
+ except requests.exceptions.ConnectionError as exc:
+ code = 0
+ try:
+ reason = exc.args[0].reason
+ cls = reason.__class__.__name__
+ pre, _, err = str(reason.args[-1]).partition(":")
+ msg = " {}: {}".format(cls, (err or pre).lstrip())
+ except Exception:
+ msg = exc
+ except (requests.exceptions.Timeout,
requests.exceptions.ChunkedEncodingError,
requests.exceptions.ContentDecodingError) as exc:
msg = exc
@@ -212,6 +219,11 @@ class Extractor():
if b'name="captcha-bypass"' in content:
self.log.warning("Cloudflare CAPTCHA")
break
+ elif server and server.startswith("ddos-guard") and \
+ code == 403:
+ if b"/ddos-guard/js-challenge/" in response.content:
+ self.log.warning("DDoS-Guard challenge")
+ break
if code == 429 and self._handle_429(response):
continue
@@ -909,10 +921,11 @@ def _browser_useragent():
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
- server.bind(("127.0.0.1", 6414))
+ server.bind(("127.0.0.1", 0))
server.listen(1)
- webbrowser.open("http://127.0.0.1:6414/user-agent")
+ host, port = server.getsockname()
+ webbrowser.open("http://{}:{}/user-agent".format(host, port))
client = server.accept()[0]
server.close()
diff --git a/gallery_dl/extractor/cyberdrop.py b/gallery_dl/extractor/cyberdrop.py
index a514696..e150829 100644
--- a/gallery_dl/extractor/cyberdrop.py
+++ b/gallery_dl/extractor/cyberdrop.py
@@ -10,12 +10,15 @@ from . import lolisafe
from .common import Message
from .. import text
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?cyberdrop\.(?:me|to)"
+
class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
+ """Extractor for cyberdrop albums"""
category = "cyberdrop"
root = "https://cyberdrop.me"
root_api = "https://api.cyberdrop.me"
- pattern = r"(?:https?://)?(?:www\.)?cyberdrop\.(?:me|to)/a/([^/?#]+)"
+ pattern = BASE_PATTERN + r"/a/([^/?#]+)"
example = "https://cyberdrop.me/a/ID"
def items(self):
@@ -40,7 +43,7 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
extr('id="title"', "")
album = {
- "album_id" : self.album_id,
+ "album_id" : album_id,
"album_name" : text.unescape(extr('title="', '"')),
"album_size" : text.parse_bytes(extr(
'<p class="title">', "B")),
@@ -67,3 +70,20 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
continue
yield file
+
+
+class CyberdropMediaExtractor(CyberdropAlbumExtractor):
+ """Extractor for cyberdrop media links"""
+ subcategory = "media"
+ directory_fmt = ("{category}",)
+ pattern = BASE_PATTERN + r"/f/([^/?#]+)"
+ example = "https://cyberdrop.me/f/ID"
+
+ def fetch_album(self, album_id):
+ return self._extract_files((album_id,)), {
+ "album_id" : "",
+ "album_name" : "",
+ "album_size" : -1,
+ "description": "",
+ "count" : 1,
+ }
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index ea3f13d..69934b4 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -451,6 +451,26 @@ class DeviantartExtractor(Extractor):
elif type == "text":
self._tiptap_process_text(html, content)
+ elif type == "heading":
+ attrs = content["attrs"]
+ level = str(attrs.get("level") or "3")
+
+ html.append("<h")
+ html.append(level)
+ html.append(' style="text-align:')
+ html.append(attrs.get("textAlign") or "left")
+ html.append('">')
+ html.append('<span style="margin-inline-start:0px">')
+
+ children = content.get("content")
+ if children:
+ for block in children:
+ self._tiptap_process_content(html, block)
+
+ html.append("</span></h")
+ html.append(level)
+ html.append(">")
+
elif type == "hardBreak":
html.append("<br/><br/>")
@@ -478,8 +498,9 @@ class DeviantartExtractor(Extractor):
for mark in marks:
type = mark["type"]
if type == "link":
+ attrs = mark.get("attrs") or {}
html.append('<a href="')
- html.append(text.escape(mark["attrs"]["href"]))
+ html.append(text.escape(attrs.get("href") or ""))
html.append('" rel="noopener noreferrer nofollow ugc">')
close.append("</a>")
elif type == "bold":
@@ -491,6 +512,9 @@ class DeviantartExtractor(Extractor):
elif type == "underline":
html.append("<u>")
close.append("</u>")
+ elif type == "strike":
+ html.append("<s>")
+ close.append("</s>")
elif type == "textStyle" and len(mark) <= 1:
pass
else:
diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py
index 04acfc5..2f3fdbf 100644
--- a/gallery_dl/extractor/facebook.py
+++ b/gallery_dl/extractor/facebook.py
@@ -40,7 +40,8 @@ class FacebookExtractor(Extractor):
@staticmethod
def decode_all(txt):
return text.unescape(
- txt.encode("utf-8").decode("unicode_escape")
+ txt.encode().decode("unicode_escape")
+ .encode("utf_16", "surrogatepass").decode("utf_16")
).replace("\\/", "/")
@staticmethod
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index e6b6b14..8c5b180 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -163,21 +163,14 @@ class InstagramExtractor(Extractor):
"post_id": reel_id,
"post_shortcode": shortcode_from_id(reel_id),
}
-
if "title" in post:
data["highlight_title"] = post["title"]
- if "created_at" in post:
- data["post_date"] = data["date"] = text.parse_timestamp(
- post.get("created_at"))
else: # regular image/video post
- date = text.parse_timestamp(post.get("taken_at"))
data = {
"post_id" : post["pk"],
"post_shortcode": post["code"],
"post_url": "{}/p/{}/".format(self.root, post["code"]),
- "post_date": date,
- "date": date,
"likes": post.get("like_count", 0),
"pinned": post.get("timeline_pinned_user_ids", ()),
"liked": post.get("has_liked", False),
@@ -218,7 +211,8 @@ class InstagramExtractor(Extractor):
data["owner_id"] = owner["pk"]
data["username"] = owner.get("username")
data["fullname"] = owner.get("full_name")
-
+ data["post_date"] = data["date"] = text.parse_timestamp(
+ post.get("taken_at") or post.get("created_at") or post.get("seen"))
data["_files"] = files = []
for num, item in enumerate(items, 1):
diff --git a/gallery_dl/extractor/itaku.py b/gallery_dl/extractor/itaku.py
index 1aef66e..7f941bb 100644
--- a/gallery_dl/extractor/itaku.py
+++ b/gallery_dl/extractor/itaku.py
@@ -78,6 +78,16 @@ class ItakuImageExtractor(ItakuExtractor):
return (self.api.image(self.item),)
+class ItakuSearchExtractor(ItakuExtractor):
+ subcategory = "search"
+ pattern = BASE_PATTERN + r"/home/images/?\?([^#]+)"
+ example = "https://itaku.ee/home/images?tags=SEARCH"
+
+ def posts(self):
+ params = text.parse_query_list(self.item)
+ return self.api.search_images(params)
+
+
class ItakuAPI():
def __init__(self, extractor):
@@ -87,6 +97,42 @@ class ItakuAPI():
"Accept": "application/json, text/plain, */*",
}
+ def search_images(self, params):
+ endpoint = "/galleries/images/"
+ required_tags = []
+ negative_tags = []
+ optional_tags = []
+
+ tags = params.pop("tags", None)
+ if not tags:
+ tags = ()
+ elif isinstance(tags, str):
+ tags = (tags,)
+
+ for tag in tags:
+ if not tag:
+ pass
+ elif tag[0] == "-":
+ negative_tags.append(tag[1:])
+ elif tag[0] == "~":
+ optional_tags.append(tag[1:])
+ else:
+ required_tags.append(tag)
+
+ api_params = {
+ "required_tags": required_tags,
+ "negative_tags": negative_tags,
+ "optional_tags": optional_tags,
+ "date_range": "",
+ "maturity_rating": ("SFW", "Questionable", "NSFW"),
+ "ordering" : "-date_added",
+ "page" : "1",
+ "page_size" : "30",
+ "visibility": ("PUBLIC", "PROFILE_ONLY"),
+ }
+ api_params.update(params)
+ return self._pagination(endpoint, api_params, self.image)
+
def galleries_images(self, username, section=None):
endpoint = "/galleries/images/"
params = {
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 16c5b99..a7caca9 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -433,8 +433,8 @@ class KemonopartyDiscordServerExtractor(KemonopartyExtractor):
class KemonopartyFavoriteExtractor(KemonopartyExtractor):
"""Extractor for kemono.su favorites"""
subcategory = "favorite"
- pattern = BASE_PATTERN + r"/favorites()()(?:/?\?([^#]+))?"
- example = "https://kemono.su/favorites"
+ pattern = BASE_PATTERN + r"/(?:account/)?favorites()()(?:/?\?([^#]+))?"
+ example = "https://kemono.su/account/favorites/artists"
def items(self):
self._prepare_ddosguard_cookies()
diff --git a/gallery_dl/extractor/lofter.py b/gallery_dl/extractor/lofter.py
new file mode 100644
index 0000000..412b6b9
--- /dev/null
+++ b/gallery_dl/extractor/lofter.py
@@ -0,0 +1,147 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.lofter.com/"""
+
+from .common import Extractor, Message
+from .. import text, util, exception
+
+
+class LofterExtractor(Extractor):
+ """Base class for lofter extractors"""
+ category = "lofter"
+ root = "https://www.lofter.com"
+ directory_fmt = ("{category}", "{blog_name}")
+ filename_fmt = "{id}_{num}.{extension}"
+ archive_fmt = "{id}_{num}"
+
+ def _init(self):
+ self.api = LofterAPI(self)
+
+ def items(self):
+ for post in self.posts():
+ if "post" in post:
+ post = post["post"]
+
+ post["blog_name"] = post["blogInfo"]["blogName"]
+ post["date"] = text.parse_timestamp(post["publishTime"] // 1000)
+ post_type = post["type"]
+
+ # Article
+ if post_type == 1:
+ content = post["content"]
+ image_urls = text.extract_iter(content, '<img src="', '"')
+ image_urls = [text.unescape(x) for x in image_urls]
+ image_urls = [x.partition("?")[0] for x in image_urls]
+
+ # Photo
+ elif post_type == 2:
+ photo_links = util.json_loads(post["photoLinks"])
+ image_urls = [x["orign"] for x in photo_links]
+ image_urls = [x.partition("?")[0] for x in image_urls]
+
+ # Video
+ elif post_type == 4:
+ embed = util.json_loads(post["embed"])
+ image_urls = [embed["originUrl"]]
+
+ # Answer
+ elif post_type == 5:
+ images = util.json_loads(post["images"])
+ image_urls = [x["orign"] for x in images]
+ image_urls = [x.partition("?")[0] for x in image_urls]
+
+ else:
+ image_urls = ()
+ self.log.warning(
+ "%s: Unsupported post type '%s'.",
+ post["id"], post_type)
+
+ post["count"] = len(image_urls)
+ yield Message.Directory, post
+ for post["num"], url in enumerate(image_urls, 1):
+ yield Message.Url, url, text.nameext_from_url(url, post)
+
+ def posts(self):
+ return ()
+
+
+class LofterPostExtractor(LofterExtractor):
+ """Extractor for a lofter post"""
+ subcategory = "post"
+ pattern = r"(?:https?://)?[\w-]+\.lofter\.com/post/([0-9a-f]+)_([0-9a-f]+)"
+ example = "https://BLOG.lofter.com/post/12345678_90abcdef"
+
+ def posts(self):
+ blog_id, post_id = self.groups
+ post = self.api.post(int(blog_id, 16), int(post_id, 16))
+ return (post,)
+
+
+class LofterBlogPostsExtractor(LofterExtractor):
+ """Extractor for a lofter blog's posts"""
+ subcategory = "blog-posts"
+ pattern = (r"(?:https?://)?(?:"
+ # https://www.lofter.com/front/blog/home-page/<blog_name>
+ r"www\.lofter\.com/front/blog/home-page/([\w-]+)|"
+ # https://<blog_name>.lofter.com/
+ r"([\w-]+)\.lofter\.com"
+ r")/?(?:$|\?|#)")
+ example = "https://BLOG.lofter.com/"
+
+ def posts(self):
+ blog_name = self.groups[0] or self.groups[1]
+ return self.api.blog_posts(blog_name)
+
+
+class LofterAPI():
+
+ def __init__(self, extractor):
+ self.extractor = extractor
+
+ def blog_posts(self, blog_name):
+ endpoint = "/v2.0/blogHomePage.api"
+ params = {
+ "method": "getPostLists",
+ "offset": 0,
+ "limit": 200,
+ "blogdomain": blog_name + ".lofter.com",
+ }
+ return self._pagination(endpoint, params)
+
+ def post(self, blog_id, post_id):
+ endpoint = "/oldapi/post/detail.api"
+ params = {
+ "targetblogid": blog_id,
+ "postid": post_id,
+ }
+ return self._call(endpoint, params)["posts"][0]
+
+ def _call(self, endpoint, data):
+ url = "https://api.lofter.com" + endpoint
+ params = {
+ 'product': 'lofter-android-7.9.10'
+ }
+ response = self.extractor.request(
+ url, method="POST", params=params, data=data)
+ info = response.json()
+
+ if info["meta"]["status"] != 200:
+ self.extractor.log.debug("Server response: %s", info)
+ raise exception.StopExtraction("API request failed")
+
+ return info["response"]
+
+ def _pagination(self, endpoint, params):
+ while True:
+ data = self._call(endpoint, params)
+ posts = data["posts"]
+
+ yield from posts
+
+ if params["offset"] + len(posts) < data["offset"]:
+ break
+ params["offset"] = data["offset"]
diff --git a/gallery_dl/extractor/recursive.py b/gallery_dl/extractor/recursive.py
index 4156484..1883bbc 100644
--- a/gallery_dl/extractor/recursive.py
+++ b/gallery_dl/extractor/recursive.py
@@ -9,6 +9,7 @@
"""Recursive extractor"""
from .common import Extractor, Message
+from .. import text
import re
@@ -25,7 +26,7 @@ class RecursiveExtractor(Extractor):
with open(url[7:]) as fp:
page = fp.read()
else:
- page = self.request(url).text
+ page = self.request(text.ensure_http_scheme(url)).text
for match in re.finditer(r"https?://[^\s\"']+", page):
yield Message.Queue, match.group(0), {}
diff --git a/gallery_dl/extractor/saint.py b/gallery_dl/extractor/saint.py
index 784cdc0..1c62d75 100644
--- a/gallery_dl/extractor/saint.py
+++ b/gallery_dl/extractor/saint.py
@@ -11,7 +11,7 @@
from .lolisafe import LolisafeAlbumExtractor
from .. import text
-BASE_PATTERN = r"(?:https?://)?saint\d*\.(?:su|pk|to)"
+BASE_PATTERN = r"(?:https?://)?saint\d*\.(?:su|pk|cr|to)"
class SaintAlbumExtractor(LolisafeAlbumExtractor):
diff --git a/gallery_dl/extractor/tapas.py b/gallery_dl/extractor/tapas.py
index 167953d..e756385 100644
--- a/gallery_dl/extractor/tapas.py
+++ b/gallery_dl/extractor/tapas.py
@@ -30,44 +30,6 @@ class TapasExtractor(Extractor):
if self._cache is None:
TapasExtractor._cache = {}
- def items(self):
- self.login()
- headers = {"Accept": "application/json, text/javascript, */*;"}
-
- for episode_id in self.episode_ids():
- url = "{}/episode/{}".format(self.root, episode_id)
- data = self.request(url, headers=headers).json()["data"]
-
- episode = data["episode"]
- if not episode.get("free") and not episode.get("unlocked"):
- raise exception.StopExtraction(
- "Episode '%s' not unlocked (ID %s) ",
- episode["title"], episode_id)
-
- html = data["html"]
- series_id = text.rextract(html, 'data-series-id="', '"')[0]
- try:
- episode["series"] = self._cache[series_id]
- except KeyError:
- url = "{}/series/{}".format(self.root, series_id)
- episode["series"] = self._cache[series_id] = self.request(
- url, headers=headers).json()["data"]
-
- episode["date"] = text.parse_datetime(episode["publish_date"])
- yield Message.Directory, episode
-
- if episode["book"]:
- content, _ = text.extract(
- html, '<div class="viewer">', '<div class="viewer-bottom')
- episode["num"] = 1
- episode["extension"] = "html"
- yield Message.Url, "text:" + content, episode
-
- else: # comic
- for episode["num"], url in enumerate(text.extract_iter(
- html, 'data-src="', '"'), 1):
- yield Message.Url, url, text.nameext_from_url(url, episode)
-
def login(self):
if self.cookies_check(self.cookies_names):
return
@@ -103,24 +65,70 @@ class TapasExtractor(Extractor):
return {"_cpc_": response.history[0].cookies.get("_cpc_")}
+ def request_api(self, url, params=None):
+ # Request `url` with an "Accept" header that asks for JSON, forwarding
+ # `params` unchanged to self.request(), and return the "data" member of
+ # the decoded JSON response.
+ headers = {"Accept": "application/json, text/javascript, */*;"}
+ return self.request(url, params=params, headers=headers).json()["data"]
+
+
+class TapasEpisodeExtractor(TapasExtractor):
+ # Extractor for single-episode URLs; the pattern captures the numeric
+ # episode ID that follows "/episode/" as its only group.
+ subcategory = "episode"
+ pattern = BASE_PATTERN + r"/episode/(\d+)"
+ example = "https://tapas.io/episode/12345"
+
+ def items(self):
+ # Generator: logs in, fetches the episode data and yields one
+ # Message.Directory followed by the Message.Url entries of the episode.
+ # Raises exception.AuthorizationError when the episode has neither a
+ # truthy "free" nor a truthy "unlocked" value.
+ self.login()
+
+ episode_id = self.groups[0]
+ url = "{}/episode/{}".format(self.root, episode_id)
+ data = self.request_api(url)
+
+ episode = data["episode"]
+ if not episode.get("free") and not episode.get("unlocked"):
+ raise exception.AuthorizationError(
+ "%s: Episode '%s' not unlocked",
+ episode_id, episode["title"])
+
+ # The episode dict doubles as the metadata object: series info and a
+ # parsed "date" are stored in it before it is yielded.
+ html = data["html"]
+ episode["series"] = self._extract_series(html)
+ episode["date"] = text.parse_datetime(episode["publish_date"])
+ yield Message.Directory, episode
+
+ # Truthy "book": the markup between the two viewer markers is emitted as
+ # a single "text:" URL with num 1 and extension "html".
+ if episode["book"]:
+ content = text.extr(
+ html, '<div class="viewer">', '<div class="viewer-bottom')
+ episode["num"] = 1
+ episode["extension"] = "html"
+ yield Message.Url, "text:" + content, episode
+
+ # Otherwise every data-src="..." value found in the HTML is yielded;
+ # enumerate() writes the 1-based position straight into episode["num"].
+ else: # comic
+ for episode["num"], url in enumerate(text.extract_iter(
+ html, 'data-src="', '"'), 1):
+ yield Message.Url, url, text.nameext_from_url(url, episode)
+
+ def _extract_series(self, html):
+ # Return the series data for the series ID found in `html` (the value
+ # after 'data-series-id="'). The result is looked up in self._cache
+ # first; on a KeyError it is fetched from "<root>/series/<id>" through
+ # request_api() and stored in the cache under that ID.
+ # NOTE(review): text.rextract presumably searches from the end of the
+ # string - confirm against the text module.
+ series_id = text.rextract(html, 'data-series-id="', '"')[0]
+ try:
+ return self._cache[series_id]
+ except KeyError:
+ url = "{}/series/{}".format(self.root, series_id)
+ series = self._cache[series_id] = self.request_api(url)
+ return series
+
class TapasSeriesExtractor(TapasExtractor):
subcategory = "series"
pattern = BASE_PATTERN + r"/series/([^/?#]+)"
example = "https://tapas.io/series/TITLE"
- def __init__(self, match):
- TapasExtractor.__init__(self, match)
- self.series_name = match.group(1)
+ def items(self):
+ self.login()
- def episode_ids(self):
- url = "{}/series/{}".format(self.root, self.series_name)
- series_id, _, episode_id = text.extract(
+ url = "{}/series/{}".format(self.root, self.groups[0])
+ series_id, _, episode_id = text.extr(
self.request(url).text, 'content="tapastic://series/', '"',
- )[0].partition("/episodes/")
+ ).partition("/episodes/")
url = "{}/series/{}/episodes".format(self.root, series_id)
- headers = {"Accept": "application/json, text/javascript, */*;"}
params = {
"eid" : episode_id,
"page" : 1,
@@ -129,36 +137,26 @@ class TapasSeriesExtractor(TapasExtractor):
"max_limit" : "20",
}
+ base = self.root + "/episode/"
while True:
- data = self.request(
- url, params=params, headers=headers).json()["data"]
- yield from text.extract_iter(
- data["body"], 'data-href="/episode/', '"')
+ data = self.request_api(url, params)
+ for episode in data["episodes"]:
+ episode["_extractor"] = TapasEpisodeExtractor
+ yield Message.Queue, base + str(episode["id"]), episode
if not data["pagination"]["has_next"]:
return
params["page"] += 1
-class TapasEpisodeExtractor(TapasExtractor):
- subcategory = "episode"
- pattern = BASE_PATTERN + r"/episode/(\d+)"
- example = "https://tapas.io/episode/12345"
-
- def __init__(self, match):
- TapasExtractor.__init__(self, match)
- self.episode_id = match.group(1)
-
- def episode_ids(self):
- return (self.episode_id,)
-
-
class TapasCreatorExtractor(TapasExtractor):
subcategory = "creator"
pattern = BASE_PATTERN + r"/(?!series|episode)([^/?#]+)"
example = "https://tapas.io/CREATOR"
def items(self):
+ self.login()
+
url = "{}/{}/series".format(self.root, self.groups[0])
page = self.request(url).text
page = text.extr(page, '<ul class="content-list-wrap', "</ul>")
diff --git a/gallery_dl/extractor/yiffverse.py b/gallery_dl/extractor/yiffverse.py
new file mode 100644
index 0000000..2b14341
--- /dev/null
+++ b/gallery_dl/extractor/yiffverse.py
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://yiffverse.com/"""
+
+from .booru import BooruExtractor
+from .. import text
+import collections
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?yiffverse\.com"
+
+
+class YiffverseExtractor(BooruExtractor):
+ # Shared base class of the yiffverse extractors in this file: holds the
+ # site roots, the tag/format lookup tables and the API helpers.
+ category = "yiffverse"
+ root = "https://yiffverse.com"
+ # Alternative root chosen by _file_url() when the first element of the
+ # selected post["files"] entry is truthy.
+ root_cdn = "https://furry34com.b-cdn.net"
+ filename_fmt = "{category}_{id}.{extension}"
+ # Sent as "take" by _pagination() and used as its end-of-results threshold.
+ per_page = 30
+
+ # Maps a tag's "type" value to the suffix of the "tags_<suffix>" key that
+ # _tags() writes into the post.
+ TAG_TYPES = {
+ None: "general",
+ 1 : "general",
+ 2 : "copyright",
+ 4 : "character",
+ 8 : "artist",
+ }
+ # (format id, file-name suffix) pairs, checked in this order by
+ # _file_url(); the suffix is appended after "<post id>." in the URL.
+ FORMATS = (
+ ("100", "mov.mp4"),
+ ("101", "mov720.mp4"),
+ ("102", "mov480.mp4"),
+ ("10" , "pic.jpg"),
+ )
+
+ def _file_url(self, post):
+ # Choose a format from post["files"], build the download URL, store it
+ # in `post` together with "format_id" and "format", and return the URL.
+ files = post["files"]
+ # Stop at the first FORMATS entry whose id is a key of `files`.
+ for fmt, extension in self.FORMATS:
+ if fmt in files:
+ break
+ # NOTE(review): indentation is not preserved in this view; presumably
+ # this `else` belongs to the loop (no FORMATS id matched), falling back
+ # to the first key of `files` while `extension` keeps the last FORMATS
+ # suffix - confirm against the real file.
+ else:
+ fmt = next(iter(files))
+
+ post_id = post["id"]
+ root = self.root_cdn if files[fmt][0] else self.root
+ # URL layout: <root>/posts/<post_id // 1000>/<post_id>/<post_id>.<suffix>
+ post["file_url"] = url = "{}/posts/{}/{}/{}.{}".format(
+ root, post_id // 1000, post_id, post_id, extension)
+ post["format_id"] = fmt
+ # "format" is the part of the suffix before its first ".".
+ post["format"] = extension.partition(".")[0]
+
+ return url
+
+ def _prepare(self, post):
+ # Normalize `post` in place: drop "files", parse "created" into "date",
+ # split "filename" at its last "." into "filename" and "format", and,
+ # when "tags" is present, replace each tag object by its "value".
+ post.pop("files", None)
+ post["date"] = text.parse_datetime(
+ post["created"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ post["filename"], _, post["format"] = post["filename"].rpartition(".")
+ if "tags" in post:
+ post["tags"] = [t["value"] for t in post["tags"]]
+
+ def _tags(self, post, _):
+ # Group the post's tag values by tag type and store each group under
+ # "tags_<name>", with <name> looked up in TAG_TYPES. The second
+ # parameter is unused here.
+ # A post without "tags" is first completed with the data returned by
+ # _fetch_post() for its ID.
+ if "tags" not in post:
+ post.update(self._fetch_post(post["id"]))
+
+ tags = collections.defaultdict(list)
+ for tag in post["tags"]:
+ tags[tag["type"]].append(tag["value"])
+ types = self.TAG_TYPES
+ for type, values in tags.items():
+ post["tags_" + types[type]] = values
+
+ def _fetch_post(self, post_id):
+ # Request "<root>/api/v2/post/<post_id>" and return the decoded JSON.
+ url = "{}/api/v2/post/{}".format(self.root, post_id)
+ return self.request(url).json()
+
+ def _pagination(self, endpoint, params=None):
+ # Generator over the "items" of every result page of
+ # "<root>/api<endpoint>"; `params` is sent via json= in a POST request.
+ url = "{}/api{}".format(self.root, endpoint)
+
+ # NOTE(review): indentation is not preserved in this view; presumably
+ # only `params = {}` is guarded by the None check and the assignments
+ # after it run on every call - confirm against the real file.
+ if params is None:
+ params = {}
+ params["sortOrder"] = 1
+ params["status"] = 2
+ params["take"] = self.per_page
+ threshold = self.per_page
+
+ while True:
+ data = self.request(url, method="POST", json=params).json()
+
+ yield from data["items"]
+
+ # A page with fewer than `per_page` items ends the iteration;
+ # otherwise the response's "cursor" is sent with the next request.
+ if len(data["items"]) < threshold:
+ return
+ params["cursor"] = data.get("cursor")
+
+class YiffversePostExtractor(YiffverseExtractor):
+ # Extractor for single-post URLs; the pattern captures the numeric post
+ # ID that follows "/post/".
+ subcategory = "post"
+ archive_fmt = "{id}"
+ pattern = BASE_PATTERN + r"/post/(\d+)"
+ example = "https://yiffverse.com/post/12345"
+
+ def posts(self):
+ # Return a one-element tuple holding the _fetch_post() result for the
+ # captured post ID.
+ return (self._fetch_post(self.groups[0]),)
+
+
+class YiffversePlaylistExtractor(YiffverseExtractor):
+ # Extractor for playlist URLs; the pattern captures the numeric playlist
+ # ID that follows "/playlist/".
+ subcategory = "playlist"
+ directory_fmt = ("{category}", "{playlist_id}")
+ archive_fmt = "p_{playlist_id}_{id}"
+ pattern = BASE_PATTERN + r"/playlist/(\d+)"
+ example = "https://yiffverse.com/playlist/12345"
+
+ def metadata(self):
+ # Provide the captured ID (still a string) as "playlist_id", the key
+ # used by directory_fmt and archive_fmt above.
+ return {"playlist_id": self.groups[0]}
+
+ def posts(self):
+ # Paginate over "/v2/post/search/playlist/<playlist id>".
+ endpoint = "/v2/post/search/playlist/" + self.groups[0]
+ return self._pagination(endpoint)
+
+
+class YiffverseTagExtractor(YiffverseExtractor):
+ # Extractor for tag/search URLs. Both pattern groups are optional:
+ # group 1 is the path segment after "/tag/", group 2 the query string
+ # after "?".
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "t_{search_tags}_{id}"
+ pattern = BASE_PATTERN + r"/(?:tag/([^/?#]+))?(?:/?\?([^#]+))?(?:$|#)"
+ example = "https://yiffverse.com/tag/TAG"
+
+ def _init(self):
+ # Derive self.tags and self.type from the two pattern groups.
+ tag, query = self.groups
+ params = text.parse_query(query)
+
+ # self.tags: the unquoted path tag (if any) followed by the "|"-separated
+ # values of a "tags" query parameter (if any).
+ self.tags = tags = []
+ if tag:
+ tags.append(text.unquote(tag))
+ if "tags" in params:
+ tags.extend(params["tags"].split("|"))
+
+ # self.type: 1 for type=video, 0 for type=image, None for anything else.
+ type = params.get("type")
+ if type == "video":
+ self.type = 1
+ elif type == "image":
+ self.type = 0
+ else:
+ self.type = None
+
+ def metadata(self):
+ # Provide the collected tags, joined by single spaces, as "search_tags".
+ return {"search_tags": " ".join(self.tags)}
+
+ def posts(self):
+ # Paginate over "/v2/post/search/root". "includeTags" carries the tags
+ # with "_" replaced by " "; "type" is only sent when it is not None.
+ endpoint = "/v2/post/search/root"
+ params = {"includeTags": [t.replace("_", " ") for t in self.tags]}
+ if self.type is not None:
+ params["type"] = self.type
+ return self._pagination(endpoint, params)
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index 4c4fb3a..bc135ad 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -78,8 +78,8 @@ class ZerochanExtractor(BooruExtractor):
'class="breadcrumbs', '</nav>'))[2:],
"uploader": extr('href="/user/', '"'),
"tags" : extr('<ul id="tags"', '</ul>'),
- "source" : text.unescape(text.extr(
- extr('id="source-url"', '</a>'), 'href="', '"')),
+ "source" : text.unescape(text.remove_html(extr(
+ 'id="source-url"', '</p>').rpartition("</s>")[2])),
}
html = data["tags"]
@@ -93,14 +93,12 @@ class ZerochanExtractor(BooruExtractor):
def _parse_entry_api(self, entry_id):
url = "{}/{}?json".format(self.root, entry_id)
- text = self.request(url).text
+ txt = self.request(url).text
try:
- item = util.json_loads(text)
- except ValueError as exc:
- if " control character " not in str(exc):
- raise
- text = re.sub(r"[\x00-\x1f\x7f]", "", text)
- item = util.json_loads(text)
+ item = util.json_loads(txt)
+ except ValueError:
+ item = self._parse_json(txt)
+ item["id"] = text.parse_int(entry_id)
data = {
"id" : item["id"],
@@ -118,6 +116,27 @@ class ZerochanExtractor(BooruExtractor):
return data
+ def _parse_json(self, txt):
+ # Hand-rolled fallback parser: _parse_entry_api() passes the response
+ # text here when util.json_loads() raises ValueError. Returns a dict of
+ # the key/value pairs found before 'tags": [' plus a "tags" list.
+ # Drop the control characters \x00-\x1f and \x7f before splitting.
+ txt = re.sub(r"[\x00-\x1f\x7f]", "", txt)
+ main, _, tags = txt.partition('tags": [')
+
+ item = {}
+ # Split on ', "' and skip the first piece; each remaining piece is split
+ # at its first '": ' into key and raw value.
+ for line in main.split(', "')[1:]:
+ key, _, value = line.partition('": ')
+ if value:
+ # A value starting with '"' loses its first and last character; any
+ # other value goes through text.parse_int().
+ if value[0] == '"':
+ value = value[1:-1]
+ else:
+ value = text.parse_int(value)
+ # NOTE(review): indentation is not preserved in this view, so whether
+ # this check is nested inside `if value:` cannot be told from here.
+ if key:
+ item[key] = value
+
+ # The tag section is cut by fixed offsets: the first 5 characters are
+ # dropped, the rest is split on '", "', and the last element loses its
+ # final 5 characters.
+ # NOTE(review): these offsets presumably match the whitespace, quote and
+ # closing brackets left once control characters are removed - confirm
+ # against a real response.
+ item["tags"] = tags = tags[5:].split('", "')
+ if tags:
+ tags[-1] = tags[-1][:-5]
+
+ return item
+
def _tags(self, post, page):
tags = collections.defaultdict(list)
for tag in post["tags"]:
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index c41f382..2914927 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -59,6 +59,7 @@ class Job():
for category in parents:
cat = "{}>{}".format(category, extr.category)
cfgpath.append((cat, extr.subcategory))
+ cfgpath.append((category + ">*", extr.subcategory))
cfgpath.append((extr.category, extr.subcategory))
self.parents = parents
else:
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 3cbe510..72ec98e 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -647,13 +647,19 @@ class CustomNone():
__repr__ = __str__
+# v128.0 release on 2024-07-09 has ordinal 739076
+# 735492 == 739076 - 128 * 28
+_ff_ver = (datetime.date.today().toordinal() - 735492) // 28
+
NONE = CustomNone()
EPOCH = datetime.datetime(1970, 1, 1)
SECOND = datetime.timedelta(0, 1)
WINDOWS = (os.name == "nt")
SENTINEL = object()
-USERAGENT = "gallery-dl/" + version.__version__
EXECUTABLE = getattr(sys, "frozen", False)
+USERAGENT = "gallery-dl/" + version.__version__
+USERAGENT_FIREFOX = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:{}.0) "
+ "Gecko/20100101 Firefox/{}.0").format(_ff_ver, _ff_ver)
SPECIAL_EXTRACTORS = {"oauth", "recursive", "generic"}
GLOBALS = {
"contains" : contains,
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 2dab0d6..651745a 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,5 +6,5 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.28.1"
+__version__ = "1.28.2"
__variant__ = None