From bb8260277ab7483652c6c1526a15d62da92acc96 Mon Sep 17 00:00:00 2001 From: Unit 193 Date: Sun, 22 Dec 2024 05:45:18 -0500 Subject: New upstream version 1.28.2. --- CHANGELOG.md | 47 ++++++----- PKG-INFO | 6 +- README.rst | 4 +- data/man/gallery-dl.1 | 2 +- data/man/gallery-dl.conf.5 | 32 +++++++- docs/gallery-dl.conf | 2 + gallery_dl.egg-info/PKG-INFO | 6 +- gallery_dl.egg-info/SOURCES.txt | 2 + gallery_dl/downloader/http.py | 11 ++- gallery_dl/extractor/__init__.py | 2 + gallery_dl/extractor/bilibili.py | 10 ++- gallery_dl/extractor/bluesky.py | 4 +- gallery_dl/extractor/cohost.py | 30 ++++++- gallery_dl/extractor/common.py | 25 ++++-- gallery_dl/extractor/cyberdrop.py | 24 +++++- gallery_dl/extractor/deviantart.py | 26 +++++- gallery_dl/extractor/facebook.py | 3 +- gallery_dl/extractor/instagram.py | 10 +-- gallery_dl/extractor/itaku.py | 46 +++++++++++ gallery_dl/extractor/kemonoparty.py | 4 +- gallery_dl/extractor/lofter.py | 147 +++++++++++++++++++++++++++++++++ gallery_dl/extractor/recursive.py | 3 +- gallery_dl/extractor/saint.py | 2 +- gallery_dl/extractor/tapas.py | 124 ++++++++++++++-------------- gallery_dl/extractor/yiffverse.py | 157 ++++++++++++++++++++++++++++++++++++ gallery_dl/extractor/zerochan.py | 37 ++++++--- gallery_dl/job.py | 1 + gallery_dl/util.py | 8 +- gallery_dl/version.py | 2 +- 29 files changed, 645 insertions(+), 132 deletions(-) create mode 100644 gallery_dl/extractor/lofter.py create mode 100644 gallery_dl/extractor/yiffverse.py diff --git a/CHANGELOG.md b/CHANGELOG.md index b831cd4..2df827d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,26 +1,31 @@ -## 1.28.1 - 2024-12-07 +## 1.28.2 - 2024-12-20 ### Extractors #### Additions -- [bluesky] add `info` extractor +- [cyberdrop] add extractor for media URLs ([#2496](https://github.com/mikf/gallery-dl/issues/2496)) +- [itaku] add `search` extractor ([#6613](https://github.com/mikf/gallery-dl/issues/6613)) +- [lofter] add initial support 
([#650](https://github.com/mikf/gallery-dl/issues/650), [#2294](https://github.com/mikf/gallery-dl/issues/2294), [#4095](https://github.com/mikf/gallery-dl/issues/4095), [#4728](https://github.com/mikf/gallery-dl/issues/4728), [#5656](https://github.com/mikf/gallery-dl/issues/5656), [#6607](https://github.com/mikf/gallery-dl/issues/6607)) +- [yiffverse] add support ([#6611](https://github.com/mikf/gallery-dl/issues/6611)) #### Fixes -- [bluesky] fix exception when encountering non-quote embeds ([#6577](https://github.com/mikf/gallery-dl/issues/6577)) -- [bluesky] unescape search queries ([#6579](https://github.com/mikf/gallery-dl/issues/6579)) -- [common] restore using environment proxy settings by default ([#6553](https://github.com/mikf/gallery-dl/issues/6553), [#6609](https://github.com/mikf/gallery-dl/issues/6609)) -- [common] improve handling of `user-agent` settings ([#6594](https://github.com/mikf/gallery-dl/issues/6594)) -- [e621] fix `TypeError` when `metadata` is enabled ([#6587](https://github.com/mikf/gallery-dl/issues/6587)) -- [gofile] fix website token extraction ([#6596](https://github.com/mikf/gallery-dl/issues/6596)) -- [inkbunny] fix re-login loop ([#6618](https://github.com/mikf/gallery-dl/issues/6618)) -- [instagram] handle empty `carousel_media` entries ([#6595](https://github.com/mikf/gallery-dl/issues/6595)) -- [kemonoparty] fix `o` query parameter handling ([#6597](https://github.com/mikf/gallery-dl/issues/6597)) -- [nhentai] fix download URLs ([#6620](https://github.com/mikf/gallery-dl/issues/6620)) -- [readcomiconline] fix `chapter` extraction ([#6070](https://github.com/mikf/gallery-dl/issues/6070), [#6335](https://github.com/mikf/gallery-dl/issues/6335)) -- [realbooru] fix extraction ([#6543](https://github.com/mikf/gallery-dl/issues/6543)) -- [rule34] fix `favorite` extraction ([#6573](https://github.com/mikf/gallery-dl/issues/6573)) -- [zerochan] download `.webp` and `.gif` files 
([#6576](https://github.com/mikf/gallery-dl/issues/6576)) +- [facebook] decode Unicode surrogate pairs in metadata values ([#6599](https://github.com/mikf/gallery-dl/issues/6599)) +- [zerochan] parse API responses manually when receiving invalid JSON ([#6632](https://github.com/mikf/gallery-dl/issues/6632)) +- [zerochan] fix `source` metadata extraction when not logged in #### Improvements -- [hentaicosplays] update domains ([#6578](https://github.com/mikf/gallery-dl/issues/6578)) -- [pixiv:ranking] implement filtering results by `content` ([#6574](https://github.com/mikf/gallery-dl/issues/6574)) -- [pixiv] include user ID in failed AJAX request warnings ([#6581](https://github.com/mikf/gallery-dl/issues/6581)) +- [bilibili] extract files from `module_top` entries ([#6687](https://github.com/mikf/gallery-dl/issues/6687)) +- [bilibili] support `/upload/opus` URLs ([#6687](https://github.com/mikf/gallery-dl/issues/6687)) +- [bluesky] default to `posts` timeline when `reposts` or `quoted` is enabled ([#6583](https://github.com/mikf/gallery-dl/issues/6583)) +- [common] simplify HTTP error messages +- [common] detect `DDoS-Guard` challenge pages +- [deviantart] improve `tiptap` markup to HTML conversion ([#6686](https://github.com/mikf/gallery-dl/issues/6686)) + - fix `KeyError: 'attrs'` for links without `href` + - support `heading` content blocks + - support `strike` text markers +- [instagram] extract `date` metadata for stories ([#6677](https://github.com/mikf/gallery-dl/issues/6677)) +- [kemonoparty:favorite] support new URL format ([#6676](https://github.com/mikf/gallery-dl/issues/6676)) +- [saint] support `saint2.cr` URLs ([#6692](https://github.com/mikf/gallery-dl/issues/6692)) +- [tapas] improve extractor hierarchy ([#6680](https://github.com/mikf/gallery-dl/issues/6680)) #### Options -- [patreon] add `format-images` option ([#6569](https://github.com/mikf/gallery-dl/issues/6569)) -- [zerochan] add `extensions` option 
([#6576](https://github.com/mikf/gallery-dl/issues/6576)) +- [cohost] add `avatar` and `background` options ([#6656](https://github.com/mikf/gallery-dl/issues/6656)) +### Miscellaneous +- support `*` wildcards for `parent>child` categories, for example `reddit>*` ([#6673](https://github.com/mikf/gallery-dl/issues/6673)) +- use latest Firefox UA as default `user-agent` +- use random unused port for `"user-agent": "browser"` requests diff --git a/PKG-INFO b/PKG-INFO index f82026d..d5fce98 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.28.1 +Version: 1.28.2 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -117,9 +117,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds diff --git a/README.rst b/README.rst index 63d400f..240dfe5 100644 --- a/README.rst +++ b/README.rst @@ -76,9 +76,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index 96c01a0..3d84f58 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2024-12-07" "1.28.1" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2024-12-20" "1.28.2" "gallery-dl Manual" .\" disable hyphenation .nh diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index e2c1e14..c27f632 100644 --- a/data/man/gallery-dl.conf.5 +++ 
b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2024-12-07" "1.28.1" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2024-12-20" "1.28.2" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -1697,7 +1697,12 @@ Download embedded videos hosted on https://www.blogger.com/ * \f[I]list\f[] of \f[I]strings\f[] .IP "Default:" 9 -\f[I]"media"\f[] +.br +* \f[I]"posts"\f[] if +\f[I]reposts\f[] or +\f[I]quoted\f[] is enabled +.br +* \f[I]"media"\f[] otherwise .IP "Example:" 4 .br @@ -1710,6 +1715,7 @@ A (comma-separated) list of subcategories to include when processing a user profile. Possible values are +\f[I]"info"\f[], \f[I]"avatar"\f[], \f[I]"background"\f[], \f[I]"posts"\f[], @@ -2057,6 +2063,28 @@ to download images in JPEG format at their original resolution. Extract \f[I]ask\f[] posts. +.SS extractor.cohost.avatar +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Download \f[I]avatar\f[] images. + + +.SS extractor.cohost.background +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Download \f[I]background\f[]/\f[I]banner\f[]/\f[I]header\f[] images. 
+ + .SS extractor.cohost.pinned .IP "Type:" 6 \f[I]bool\f[] diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 4dc2e14..3d73869 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -176,6 +176,8 @@ "cohost": { "asks" : true, + "avatar" : false, + "background": false, "pinned" : false, "replies": true, "shares" : true diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index f82026d..d5fce98 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.28.1 +Version: 1.28.2 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -117,9 +117,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index 398c9f7..42dd483 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -139,6 +139,7 @@ gallery_dl/extractor/lensdump.py gallery_dl/extractor/lexica.py gallery_dl/extractor/lightroom.py gallery_dl/extractor/livedoor.py +gallery_dl/extractor/lofter.py gallery_dl/extractor/lolisafe.py gallery_dl/extractor/luscious.py gallery_dl/extractor/lynxchan.py @@ -244,6 +245,7 @@ gallery_dl/extractor/wikifeet.py gallery_dl/extractor/wikimedia.py gallery_dl/extractor/xhamster.py gallery_dl/extractor/xvideos.py +gallery_dl/extractor/yiffverse.py gallery_dl/extractor/ytdl.py gallery_dl/extractor/zerochan.py gallery_dl/extractor/zzup.py diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 54750ac..c8aeef8 100644 --- a/gallery_dl/downloader/http.py 
+++ b/gallery_dl/downloader/http.py @@ -144,7 +144,16 @@ class HttpDownloader(DownloaderBase): proxies=self.proxies, verify=self.verify, ) - except (ConnectionError, Timeout) as exc: + except ConnectionError as exc: + try: + reason = exc.args[0].reason + cls = reason.__class__.__name__ + pre, _, err = str(reason.args[-1]).partition(":") + msg = "{}: {}".format(cls, (err or pre).lstrip()) + except Exception: + msg = str(exc) + continue + except Timeout as exc: msg = str(exc) continue except Exception as exc: diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 8d5f3d0..d003a61 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -98,6 +98,7 @@ modules = [ "lexica", "lightroom", "livedoor", + "lofter", "luscious", "lynxchan", "mangadex", @@ -195,6 +196,7 @@ modules = [ "wikimedia", "xhamster", "xvideos", + "yiffverse", "zerochan", "zzup", "booru", diff --git a/gallery_dl/extractor/bilibili.py b/gallery_dl/extractor/bilibili.py index d5c419e..b9de165 100644 --- a/gallery_dl/extractor/bilibili.py +++ b/gallery_dl/extractor/bilibili.py @@ -23,7 +23,8 @@ class BilibiliExtractor(Extractor): class BilibiliUserArticlesExtractor(BilibiliExtractor): """Extractor for a bilibili user's articles""" subcategory = "user-articles" - pattern = r"(?:https?://)?space\.bilibili\.com/(\d+)/article" + pattern = (r"(?:https?://)?space\.bilibili\.com/(\d+)" + r"/(?:article|upload/opus)") example = "https://space.bilibili.com/12345/article" def items(self): @@ -56,6 +57,13 @@ class BilibiliArticleExtractor(BilibiliExtractor): article["username"] = modules["module_author"]["name"] pics = [] + + if "module_top" in modules: + try: + pics.extend(modules["module_top"]["display"]["album"]["pics"]) + except Exception: + pass + for paragraph in modules['module_content']['paragraphs']: if "pic" not in paragraph: continue diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py index f60ea15..f8fef93 100644 --- 
a/gallery_dl/extractor/bluesky.py +++ b/gallery_dl/extractor/bluesky.py @@ -204,6 +204,8 @@ class BlueskyUserExtractor(BlueskyExtractor): def items(self): base = "{}/profile/{}/".format(self.root, self.user) + default = ("posts" if self.config("quoted", False) or + self.config("reposts", False) else "media") return self._dispatch_extractors(( (BlueskyInfoExtractor , base + "info"), (BlueskyAvatarExtractor , base + "avatar"), @@ -212,7 +214,7 @@ class BlueskyUserExtractor(BlueskyExtractor): (BlueskyRepliesExtractor , base + "replies"), (BlueskyMediaExtractor , base + "media"), (BlueskyLikesExtractor , base + "likes"), - ), ("media",)) + ), (default,)) class BlueskyPostsExtractor(BlueskyExtractor): diff --git a/gallery_dl/extractor/cohost.py b/gallery_dl/extractor/cohost.py index 0524239..6a43224 100644 --- a/gallery_dl/extractor/cohost.py +++ b/gallery_dl/extractor/cohost.py @@ -19,7 +19,7 @@ class CohostExtractor(Extractor): category = "cohost" root = "https://cohost.org" directory_fmt = ("{category}", "{postingProject[handle]}") - filename_fmt = ("{postId}_{headline:?/_/[b:200]}{num}.{extension}") + filename_fmt = ("{postId}{headline:?_//[b:200]}{num:?_//}.{extension}") archive_fmt = "{postId}_{num}" def _init(self): @@ -28,6 +28,14 @@ class CohostExtractor(Extractor): self.shares = self.config("shares", False) self.asks = self.config("asks", True) + self.avatar = self.config("avatar", False) + if self.avatar: + self._urls_avatar = {None, ""} + + self.background = self.config("background", False) + if self.background: + self._urls_background = {None, ""} + def items(self): for post in self.posts(): reason = post.get("limitedVisibilityReason") @@ -43,6 +51,26 @@ class CohostExtractor(Extractor): post["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ") yield Message.Directory, post + + project = post["postingProject"] + if self.avatar: + url = project.get("avatarURL") + if url not in self._urls_avatar: + self._urls_avatar.add(url) + p = post.copy() + p["postId"] = p["kind"] = 
"avatar" + p["headline"] = p["num"] = "" + yield Message.Url, url, text.nameext_from_url(url, p) + + if self.background: + url = project.get("headerURL") + if url not in self._urls_background: + self._urls_background.add(url) + p = post.copy() + p["postId"] = p["kind"] = "background" + p["headline"] = p["num"] = "" + yield Message.Url, url, text.nameext_from_url(url, p) + for post["num"], file in enumerate(files, 1): url = file["fileURL"] post.update(file) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 5f9d355..5ada030 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -42,8 +42,7 @@ class Extractor(): ciphers = None tls12 = True browser = None - useragent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; " - "rv:128.0) Gecko/20100101 Firefox/128.0") + useragent = util.USERAGENT_FIREFOX request_interval = 0.0 request_interval_min = 0.0 request_interval_429 = 60.0 @@ -172,8 +171,16 @@ class Extractor(): while True: try: response = session.request(method, url, **kwargs) - except (requests.exceptions.ConnectionError, - requests.exceptions.Timeout, + except requests.exceptions.ConnectionError as exc: + code = 0 + try: + reason = exc.args[0].reason + cls = reason.__class__.__name__ + pre, _, err = str(reason.args[-1]).partition(":") + msg = " {}: {}".format(cls, (err or pre).lstrip()) + except Exception: + msg = exc + except (requests.exceptions.Timeout, requests.exceptions.ChunkedEncodingError, requests.exceptions.ContentDecodingError) as exc: msg = exc @@ -212,6 +219,11 @@ class Extractor(): if b'name="captcha-bypass"' in content: self.log.warning("Cloudflare CAPTCHA") break + elif server and server.startswith("ddos-guard") and \ + code == 403: + if b"/ddos-guard/js-challenge/" in response.content: + self.log.warning("DDoS-Guard challenge") + break if code == 429 and self._handle_429(response): continue @@ -909,10 +921,11 @@ def _browser_useragent(): server = socket.socket(socket.AF_INET, 
socket.SOCK_STREAM) server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server.bind(("127.0.0.1", 6414)) + server.bind(("127.0.0.1", 0)) server.listen(1) - webbrowser.open("http://127.0.0.1:6414/user-agent") + host, port = server.getsockname() + webbrowser.open("http://{}:{}/user-agent".format(host, port)) client = server.accept()[0] server.close() diff --git a/gallery_dl/extractor/cyberdrop.py b/gallery_dl/extractor/cyberdrop.py index a514696..e150829 100644 --- a/gallery_dl/extractor/cyberdrop.py +++ b/gallery_dl/extractor/cyberdrop.py @@ -10,12 +10,15 @@ from . import lolisafe from .common import Message from .. import text +BASE_PATTERN = r"(?:https?://)?(?:www\.)?cyberdrop\.(?:me|to)" + class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor): + """Extractor for cyberdrop albums""" category = "cyberdrop" root = "https://cyberdrop.me" root_api = "https://api.cyberdrop.me" - pattern = r"(?:https?://)?(?:www\.)?cyberdrop\.(?:me|to)/a/([^/?#]+)" + pattern = BASE_PATTERN + r"/a/([^/?#]+)" example = "https://cyberdrop.me/a/ID" def items(self): @@ -40,7 +43,7 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor): extr('id="title"', "") album = { - "album_id" : self.album_id, + "album_id" : album_id, "album_name" : text.unescape(extr('title="', '"')), "album_size" : text.parse_bytes(extr( '

', "B")), @@ -67,3 +70,20 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor): continue yield file + + +class CyberdropMediaExtractor(CyberdropAlbumExtractor): + """Extractor for cyberdrop media links""" + subcategory = "media" + directory_fmt = ("{category}",) + pattern = BASE_PATTERN + r"/f/([^/?#]+)" + example = "https://cyberdrop.me/f/ID" + + def fetch_album(self, album_id): + return self._extract_files((album_id,)), { + "album_id" : "", + "album_name" : "", + "album_size" : -1, + "description": "", + "count" : 1, + } diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index ea3f13d..69934b4 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -451,6 +451,26 @@ class DeviantartExtractor(Extractor): elif type == "text": self._tiptap_process_text(html, content) + elif type == "heading": + attrs = content["attrs"] + level = str(attrs.get("level") or "3") + + html.append("') + html.append('') + + children = content.get("content") + if children: + for block in children: + self._tiptap_process_content(html, block) + + html.append("") + elif type == "hardBreak": html.append("

") @@ -478,8 +498,9 @@ class DeviantartExtractor(Extractor): for mark in marks: type = mark["type"] if type == "link": + attrs = mark.get("attrs") or {} html.append('') close.append("") elif type == "bold": @@ -491,6 +512,9 @@ class DeviantartExtractor(Extractor): elif type == "underline": html.append("") close.append("") + elif type == "strike": + html.append("") + close.append("") elif type == "textStyle" and len(mark) <= 1: pass else: diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py index 04acfc5..2f3fdbf 100644 --- a/gallery_dl/extractor/facebook.py +++ b/gallery_dl/extractor/facebook.py @@ -40,7 +40,8 @@ class FacebookExtractor(Extractor): @staticmethod def decode_all(txt): return text.unescape( - txt.encode("utf-8").decode("unicode_escape") + txt.encode().decode("unicode_escape") + .encode("utf_16", "surrogatepass").decode("utf_16") ).replace("\\/", "/") @staticmethod diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index e6b6b14..8c5b180 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -163,21 +163,14 @@ class InstagramExtractor(Extractor): "post_id": reel_id, "post_shortcode": shortcode_from_id(reel_id), } - if "title" in post: data["highlight_title"] = post["title"] - if "created_at" in post: - data["post_date"] = data["date"] = text.parse_timestamp( - post.get("created_at")) else: # regular image/video post - date = text.parse_timestamp(post.get("taken_at")) data = { "post_id" : post["pk"], "post_shortcode": post["code"], "post_url": "{}/p/{}/".format(self.root, post["code"]), - "post_date": date, - "date": date, "likes": post.get("like_count", 0), "pinned": post.get("timeline_pinned_user_ids", ()), "liked": post.get("has_liked", False), @@ -218,7 +211,8 @@ class InstagramExtractor(Extractor): data["owner_id"] = owner["pk"] data["username"] = owner.get("username") data["fullname"] = owner.get("full_name") - + data["post_date"] = data["date"] = 
text.parse_timestamp( + post.get("taken_at") or post.get("created_at") or post.get("seen")) data["_files"] = files = [] for num, item in enumerate(items, 1): diff --git a/gallery_dl/extractor/itaku.py b/gallery_dl/extractor/itaku.py index 1aef66e..7f941bb 100644 --- a/gallery_dl/extractor/itaku.py +++ b/gallery_dl/extractor/itaku.py @@ -78,6 +78,16 @@ class ItakuImageExtractor(ItakuExtractor): return (self.api.image(self.item),) +class ItakuSearchExtractor(ItakuExtractor): + subcategory = "search" + pattern = BASE_PATTERN + r"/home/images/?\?([^#]+)" + example = "https://itaku.ee/home/images?tags=SEARCH" + + def posts(self): + params = text.parse_query_list(self.item) + return self.api.search_images(params) + + class ItakuAPI(): def __init__(self, extractor): @@ -87,6 +97,42 @@ class ItakuAPI(): "Accept": "application/json, text/plain, */*", } + def search_images(self, params): + endpoint = "/galleries/images/" + required_tags = [] + negative_tags = [] + optional_tags = [] + + tags = params.pop("tags", None) + if not tags: + tags = () + elif isinstance(tags, str): + tags = (tags,) + + for tag in tags: + if not tag: + pass + elif tag[0] == "-": + negative_tags.append(tag[1:]) + elif tag[0] == "~": + optional_tags.append(tag[1:]) + else: + required_tags.append(tag) + + api_params = { + "required_tags": required_tags, + "negative_tags": negative_tags, + "optional_tags": optional_tags, + "date_range": "", + "maturity_rating": ("SFW", "Questionable", "NSFW"), + "ordering" : "-date_added", + "page" : "1", + "page_size" : "30", + "visibility": ("PUBLIC", "PROFILE_ONLY"), + } + api_params.update(params) + return self._pagination(endpoint, api_params, self.image) + def galleries_images(self, username, section=None): endpoint = "/galleries/images/" params = { diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 16c5b99..a7caca9 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -433,8 +433,8 
@@ class KemonopartyDiscordServerExtractor(KemonopartyExtractor): class KemonopartyFavoriteExtractor(KemonopartyExtractor): """Extractor for kemono.su favorites""" subcategory = "favorite" - pattern = BASE_PATTERN + r"/favorites()()(?:/?\?([^#]+))?" - example = "https://kemono.su/favorites" + pattern = BASE_PATTERN + r"/(?:account/)?favorites()()(?:/?\?([^#]+))?" + example = "https://kemono.su/account/favorites/artists" def items(self): self._prepare_ddosguard_cookies() diff --git a/gallery_dl/extractor/lofter.py b/gallery_dl/extractor/lofter.py new file mode 100644 index 0000000..412b6b9 --- /dev/null +++ b/gallery_dl/extractor/lofter.py @@ -0,0 +1,147 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.lofter.com/""" + +from .common import Extractor, Message +from .. import text, util, exception + + +class LofterExtractor(Extractor): + """Base class for lofter extractors""" + category = "lofter" + root = "https://www.lofter.com" + directory_fmt = ("{category}", "{blog_name}") + filename_fmt = "{id}_{num}.{extension}" + archive_fmt = "{id}_{num}" + + def _init(self): + self.api = LofterAPI(self) + + def items(self): + for post in self.posts(): + if "post" in post: + post = post["post"] + + post["blog_name"] = post["blogInfo"]["blogName"] + post["date"] = text.parse_timestamp(post["publishTime"] // 1000) + post_type = post["type"] + + # Article + if post_type == 1: + content = post["content"] + image_urls = text.extract_iter(content, ' + r"www\.lofter\.com/front/blog/home-page/([\w-]+)|" + # https://.lofter.com/ + r"([\w-]+)\.lofter\.com" + r")/?(?:$|\?|#)") + example = "https://BLOG.lofter.com/" + + def posts(self): + blog_name = self.groups[0] or self.groups[1] + return self.api.blog_posts(blog_name) + + +class LofterAPI(): + + def __init__(self, extractor): + 
self.extractor = extractor + + def blog_posts(self, blog_name): + endpoint = "/v2.0/blogHomePage.api" + params = { + "method": "getPostLists", + "offset": 0, + "limit": 200, + "blogdomain": blog_name + ".lofter.com", + } + return self._pagination(endpoint, params) + + def post(self, blog_id, post_id): + endpoint = "/oldapi/post/detail.api" + params = { + "targetblogid": blog_id, + "postid": post_id, + } + return self._call(endpoint, params)["posts"][0] + + def _call(self, endpoint, data): + url = "https://api.lofter.com" + endpoint + params = { + 'product': 'lofter-android-7.9.10' + } + response = self.extractor.request( + url, method="POST", params=params, data=data) + info = response.json() + + if info["meta"]["status"] != 200: + self.extractor.log.debug("Server response: %s", info) + raise exception.StopExtraction("API request failed") + + return info["response"] + + def _pagination(self, endpoint, params): + while True: + data = self._call(endpoint, params) + posts = data["posts"] + + yield from posts + + if params["offset"] + len(posts) < data["offset"]: + break + params["offset"] = data["offset"] diff --git a/gallery_dl/extractor/recursive.py b/gallery_dl/extractor/recursive.py index 4156484..1883bbc 100644 --- a/gallery_dl/extractor/recursive.py +++ b/gallery_dl/extractor/recursive.py @@ -9,6 +9,7 @@ """Recursive extractor""" from .common import Extractor, Message +from .. import text import re @@ -25,7 +26,7 @@ class RecursiveExtractor(Extractor): with open(url[7:]) as fp: page = fp.read() else: - page = self.request(url).text + page = self.request(text.ensure_http_scheme(url)).text for match in re.finditer(r"https?://[^\s\"']+", page): yield Message.Queue, match.group(0), {} diff --git a/gallery_dl/extractor/saint.py b/gallery_dl/extractor/saint.py index 784cdc0..1c62d75 100644 --- a/gallery_dl/extractor/saint.py +++ b/gallery_dl/extractor/saint.py @@ -11,7 +11,7 @@ from .lolisafe import LolisafeAlbumExtractor from .. 
import text -BASE_PATTERN = r"(?:https?://)?saint\d*\.(?:su|pk|to)" +BASE_PATTERN = r"(?:https?://)?saint\d*\.(?:su|pk|cr|to)" class SaintAlbumExtractor(LolisafeAlbumExtractor): diff --git a/gallery_dl/extractor/tapas.py b/gallery_dl/extractor/tapas.py index 167953d..e756385 100644 --- a/gallery_dl/extractor/tapas.py +++ b/gallery_dl/extractor/tapas.py @@ -30,44 +30,6 @@ class TapasExtractor(Extractor): if self._cache is None: TapasExtractor._cache = {} - def items(self): - self.login() - headers = {"Accept": "application/json, text/javascript, */*;"} - - for episode_id in self.episode_ids(): - url = "{}/episode/{}".format(self.root, episode_id) - data = self.request(url, headers=headers).json()["data"] - - episode = data["episode"] - if not episode.get("free") and not episode.get("unlocked"): - raise exception.StopExtraction( - "Episode '%s' not unlocked (ID %s) ", - episode["title"], episode_id) - - html = data["html"] - series_id = text.rextract(html, 'data-series-id="', '"')[0] - try: - episode["series"] = self._cache[series_id] - except KeyError: - url = "{}/series/{}".format(self.root, series_id) - episode["series"] = self._cache[series_id] = self.request( - url, headers=headers).json()["data"] - - episode["date"] = text.parse_datetime(episode["publish_date"]) - yield Message.Directory, episode - - if episode["book"]: - content, _ = text.extract( - html, '

', '
', '
") diff --git a/gallery_dl/extractor/yiffverse.py b/gallery_dl/extractor/yiffverse.py new file mode 100644 index 0000000..2b14341 --- /dev/null +++ b/gallery_dl/extractor/yiffverse.py @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://yiffverse.com/""" + +from .booru import BooruExtractor +from .. import text +import collections + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?yiffverse\.com" + + +class YiffverseExtractor(BooruExtractor): + category = "yiffverse" + root = "https://yiffverse.com" + root_cdn = "https://furry34com.b-cdn.net" + filename_fmt = "{category}_{id}.{extension}" + per_page = 30 + + TAG_TYPES = { + None: "general", + 1 : "general", + 2 : "copyright", + 4 : "character", + 8 : "artist", + } + FORMATS = ( + ("100", "mov.mp4"), + ("101", "mov720.mp4"), + ("102", "mov480.mp4"), + ("10" , "pic.jpg"), + ) + + def _file_url(self, post): + files = post["files"] + for fmt, extension in self.FORMATS: + if fmt in files: + break + else: + fmt = next(iter(files)) + + post_id = post["id"] + root = self.root_cdn if files[fmt][0] else self.root + post["file_url"] = url = "{}/posts/{}/{}/{}.{}".format( + root, post_id // 1000, post_id, post_id, extension) + post["format_id"] = fmt + post["format"] = extension.partition(".")[0] + + return url + + def _prepare(self, post): + post.pop("files", None) + post["date"] = text.parse_datetime( + post["created"], "%Y-%m-%dT%H:%M:%S.%fZ") + post["filename"], _, post["format"] = post["filename"].rpartition(".") + if "tags" in post: + post["tags"] = [t["value"] for t in post["tags"]] + + def _tags(self, post, _): + if "tags" not in post: + post.update(self._fetch_post(post["id"])) + + tags = collections.defaultdict(list) + for tag in post["tags"]: + tags[tag["type"]].append(tag["value"]) + 
types = self.TAG_TYPES + for type, values in tags.items(): + post["tags_" + types[type]] = values + + def _fetch_post(self, post_id): + url = "{}/api/v2/post/{}".format(self.root, post_id) + return self.request(url).json() + + def _pagination(self, endpoint, params=None): + url = "{}/api{}".format(self.root, endpoint) + + if params is None: + params = {} + params["sortOrder"] = 1 + params["status"] = 2 + params["take"] = self.per_page + threshold = self.per_page + + while True: + data = self.request(url, method="POST", json=params).json() + + yield from data["items"] + + if len(data["items"]) < threshold: + return + params["cursor"] = data.get("cursor") + + +class YiffversePostExtractor(YiffverseExtractor): + subcategory = "post" + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"/post/(\d+)" + example = "https://yiffverse.com/post/12345" + + def posts(self): + return (self._fetch_post(self.groups[0]),) + + +class YiffversePlaylistExtractor(YiffverseExtractor): + subcategory = "playlist" + directory_fmt = ("{category}", "{playlist_id}") + archive_fmt = "p_{playlist_id}_{id}" + pattern = BASE_PATTERN + r"/playlist/(\d+)" + example = "https://yiffverse.com/playlist/12345" + + def metadata(self): + return {"playlist_id": self.groups[0]} + + def posts(self): + endpoint = "/v2/post/search/playlist/" + self.groups[0] + return self._pagination(endpoint) + + +class YiffverseTagExtractor(YiffverseExtractor): + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + archive_fmt = "t_{search_tags}_{id}" + pattern = BASE_PATTERN + r"/(?:tag/([^/?#]+))?(?:/?\?([^#]+))?(?:$|#)" + example = "https://yiffverse.com/tag/TAG" + + def _init(self): + tag, query = self.groups + params = text.parse_query(query) + + self.tags = tags = [] + if tag: + tags.append(text.unquote(tag)) + if "tags" in params: + tags.extend(params["tags"].split("|")) + + type = params.get("type") + if type == "video": + self.type = 1 + elif type == "image": + self.type = 0 + else: + self.type = 
None + + def metadata(self): + return {"search_tags": " ".join(self.tags)} + + def posts(self): + endpoint = "/v2/post/search/root" + params = {"includeTags": [t.replace("_", " ") for t in self.tags]} + if self.type is not None: + params["type"] = self.type + return self._pagination(endpoint, params) diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index 4c4fb3a..bc135ad 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -78,8 +78,8 @@ class ZerochanExtractor(BooruExtractor): 'class="breadcrumbs', ''))[2:], "uploader": extr('href="/user/', '"'), "tags" : extr('
<ul id="tags"', '</ul>'), -        "source"  : text.unescape(text.extr( -            extr('id="source-url"', '</p>'), 'href="', '"')), +        "source"  : text.unescape(text.remove_html(extr( +            'id="source-url"', '</p>
    ').rpartition("")[2])), } html = data["tags"] @@ -93,14 +93,12 @@ class ZerochanExtractor(BooruExtractor): def _parse_entry_api(self, entry_id): url = "{}/{}?json".format(self.root, entry_id) - text = self.request(url).text + txt = self.request(url).text try: - item = util.json_loads(text) - except ValueError as exc: - if " control character " not in str(exc): - raise - text = re.sub(r"[\x00-\x1f\x7f]", "", text) - item = util.json_loads(text) + item = util.json_loads(txt) + except ValueError: + item = self._parse_json(txt) + item["id"] = text.parse_int(entry_id) data = { "id" : item["id"], @@ -118,6 +116,27 @@ class ZerochanExtractor(BooruExtractor): return data + def _parse_json(self, txt): + txt = re.sub(r"[\x00-\x1f\x7f]", "", txt) + main, _, tags = txt.partition('tags": [') + + item = {} + for line in main.split(', "')[1:]: + key, _, value = line.partition('": ') + if value: + if value[0] == '"': + value = value[1:-1] + else: + value = text.parse_int(value) + if key: + item[key] = value + + item["tags"] = tags = tags[5:].split('", "') + if tags: + tags[-1] = tags[-1][:-5] + + return item + def _tags(self, post, page): tags = collections.defaultdict(list) for tag in post["tags"]: diff --git a/gallery_dl/job.py b/gallery_dl/job.py index c41f382..2914927 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -59,6 +59,7 @@ class Job(): for category in parents: cat = "{}>{}".format(category, extr.category) cfgpath.append((cat, extr.subcategory)) + cfgpath.append((category + ">*", extr.subcategory)) cfgpath.append((extr.category, extr.subcategory)) self.parents = parents else: diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 3cbe510..72ec98e 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -647,13 +647,19 @@ class CustomNone(): __repr__ = __str__ +# v128.0 release on 2024-07-09 has ordinal 739076 +# 735492 == 739076 - 128 * 28 +_ff_ver = (datetime.date.today().toordinal() - 735492) // 28 + NONE = CustomNone() EPOCH = 
datetime.datetime(1970, 1, 1) SECOND = datetime.timedelta(0, 1) WINDOWS = (os.name == "nt") SENTINEL = object() -USERAGENT = "gallery-dl/" + version.__version__ EXECUTABLE = getattr(sys, "frozen", False) +USERAGENT = "gallery-dl/" + version.__version__ +USERAGENT_FIREFOX = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:{}.0) " + "Gecko/20100101 Firefox/{}.0").format(_ff_ver, _ff_ver) SPECIAL_EXTRACTORS = {"oauth", "recursive", "generic"} GLOBALS = { "contains" : contains, diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 2dab0d6..651745a 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,5 +6,5 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.28.1" +__version__ = "1.28.2" __variant__ = None -- cgit v1.2.3