| author | 2025-10-14 00:23:10 -0400 |
|---|---|
| committer | 2025-10-14 00:23:10 -0400 |
| commit | 33f8a8a37a9cba738ef25fb99955f0730da9eb48 (patch) |
| tree | b51fb48b160f5e5e034e6b4542e6f00703bae7ec /gallery_dl |
| parent | bbe7fac03d881662a458e7fbf870c9d71f5257f4 (diff) |
New upstream version 1.30.10 (tag: upstream/1.30.10)
Diffstat (limited to 'gallery_dl')
27 files changed, 420 insertions(+), 349 deletions(-)
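One change that recurs through nearly every file below is an explicit `encoding="utf-8"` argument on `open()` calls. A self-contained sketch of the failure mode this avoids (the file content here is invented):

```python
# Why the recurring open(..., encoding="utf-8") change matters:
# Python's default text encoding is locale-dependent (e.g. cp1252 on
# many Windows setups), so reading a UTF-8 file back without an
# explicit encoding can raise UnicodeDecodeError or garble non-ASCII.
import locale
import tempfile

with tempfile.NamedTemporaryFile(
        "w", encoding="utf-8", suffix=".txt", delete=False) as fp:
    fp.write("# cookies for Mike Fährmann\n")
    path = fp.name

print("implicit default encoding:", locale.getpreferredencoding(False))

with open(path, encoding="utf-8") as fp:  # the 1.30.10 style
    print(fp.read())
```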
diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py
index 6c19e23..ba719ac 100644
--- a/gallery_dl/cookies.py
+++ b/gallery_dl/cookies.py
@@ -241,7 +241,7 @@ def _firefox_cookies_database(browser_name, profile=None, container=None):
             os.path.dirname(path), "containers.json")
 
         try:
-            with open(containers_path) as fp:
+            with open(containers_path, encoding="utf-8") as fp:
                 identities = util.json_loads(fp.read())["identities"]
         except OSError:
             _log_error("Unable to read Firefox container database at '%s'",
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index a3df634..c7e33c8 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -47,6 +47,7 @@ modules = [
     "cyberdrop",
     "cyberfile",
     "danbooru",
+    "dandadan",
     "dankefuerslesen",
     "desktopography",
     "deviantart",
diff --git a/gallery_dl/extractor/bellazon.py b/gallery_dl/extractor/bellazon.py
index 5dcb6a5..ce50a91 100644
--- a/gallery_dl/extractor/bellazon.py
+++ b/gallery_dl/extractor/bellazon.py
@@ -27,7 +27,7 @@ class BellazonExtractor(Extractor):
         native = (f"{self.root}/", f"{self.root[6:]}/")
         extract_urls = text.re(
             r'(?s)<('
-            r'(?:video .*?<source src|a [^>]*?href)="([^"]+).*?</a>'
+            r'(?:video .*?<source [^>]*?src|a [^>]*?href)="([^"]+).*?</a>'
             r'|img [^>]*?src="([^"]+)"[^>]*>'
             r')'
         ).findall
@@ -52,7 +52,11 @@ class BellazonExtractor(Extractor):
             url = text.unescape(url or url_img)
 
             if url.startswith(native):
-                if "/uploads/emoticons/" in url or "/profile/" in url:
+                if (
+                    "/uploads/emoticons/" in url or
+                    "/profile/" in url or
+                    "/topic/" in url
+                ):
                     continue
                 data["num"] += 1
                 if not (alt := text.extr(info, ' alt="', '"')) or (
diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py
index e2c5334..e8c5707 100644
--- a/gallery_dl/extractor/bluesky.py
+++ b/gallery_dl/extractor/bluesky.py
@@ -44,6 +44,8 @@ class BlueskyExtractor(Extractor):
         for post in self.posts():
             if "post" in post:
                 post = post["post"]
+            elif "item" in post:
+                post = post["item"]
             if self._user_did and post["author"]["did"] != self._user_did:
                 self.log.debug("Skipping %s (repost)", self._pid(post))
                 continue
@@ -148,9 +150,15 @@ class BlueskyExtractor(Extractor):
 
         if "images" in media:
             for image in media["images"]:
-                files.append(self._extract_media(image, "image"))
+                try:
+                    files.append(self._extract_media(image, "image"))
+                except Exception:
+                    pass
         if "video" in media and self.videos:
-            files.append(self._extract_media(media, "video"))
+            try:
+                files.append(self._extract_media(media, "video"))
+            except Exception:
+                pass
 
         post["count"] = len(files)
         return files
@@ -372,6 +380,15 @@ class BlueskyHashtagExtractor(BlueskyExtractor):
         return self.api.search_posts("#"+hashtag, order)
 
 
+class BlueskyBookmarkExtractor(BlueskyExtractor):
+    subcategory = "bookmark"
+    pattern = BASE_PATTERN + r"/saved"
+    example = "https://bsky.app/saved"
+
+    def posts(self):
+        return self.api.get_bookmarks()
+
+
 class BlueskyAPI():
     """Interface for the Bluesky API
 
@@ -407,6 +424,10 @@ class BlueskyAPI():
         }
         return self._pagination(endpoint, params)
 
+    def get_bookmarks(self):
+        endpoint = "app.bsky.bookmark.getBookmarks"
+        return self._pagination(endpoint, {}, "bookmarks", check_empty=True)
+
     def get_feed(self, actor, feed):
         endpoint = "app.bsky.feed.getFeed"
         uri = (f"at://{self._did_from_actor(actor)}"
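The new bookmark extractor drives a `get_bookmarks()` endpoint through the shared `_pagination()` helper with `check_empty=True`. The sketch below shows what cursor pagination with an empty-batch guard typically looks like; the `fetch` callable, parameter names, and stop conditions are illustrative assumptions, not gallery-dl's actual helper:

```python
from typing import Callable, Iterator

def paginate(fetch: Callable[[dict], dict], params: dict,
             key: str = "feed", check_empty: bool = False) -> Iterator[dict]:
    # Repeatedly call the endpoint, following the opaque 'cursor'
    # value; check_empty stops on an empty batch, for endpoints that
    # keep echoing a cursor even when nothing is left.
    while True:
        data = fetch(params)
        batch = data.get(key) or []
        if check_empty and not batch:
            return
        yield from batch
        cursor = data.get("cursor")
        if not cursor:
            return
        params["cursor"] = cursor

# Usage with a stubbed endpoint:
pages = iter([{"bookmarks": [1, 2], "cursor": "c1"}, {"bookmarks": []}])
print(list(paginate(lambda p: next(pages), {}, "bookmarks", True)))  # [1, 2]
```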
diff --git a/gallery_dl/extractor/civitai.py b/gallery_dl/extractor/civitai.py
index d5cf996..26ee3fd 100644
--- a/gallery_dl/extractor/civitai.py
+++ b/gallery_dl/extractor/civitai.py
@@ -807,7 +807,8 @@ class CivitaiTrpcAPI():
         })
 
         params = self._type_params(params)
-        return self._pagination(endpoint, params, meta)
+        return self._pagination(endpoint, params, meta,
+                                user=("username" in params))
 
     def collection(self, collection_id):
         endpoint = "collection.getById"
@@ -854,13 +855,17 @@ class CivitaiTrpcAPI():
         return self.extractor.request_json(
             url, params=params, headers=headers)["result"]["data"]["json"]
 
-    def _pagination(self, endpoint, params, meta=None):
+    def _pagination(self, endpoint, params, meta=None, user=False):
         if "cursor" not in params:
             params["cursor"] = None
             meta_ = {"cursor": ("undefined",)}
 
+        data = self._call(endpoint, params, meta_)
+        if user and data["items"] and \
+                data["items"][0]["user"]["username"] != params["username"]:
+            return ()
+
         while True:
-            data = self._call(endpoint, params, meta_)
             yield from data["items"]
 
             try:
@@ -871,6 +876,7 @@ class CivitaiTrpcAPI():
 
             params["cursor"] = data["nextCursor"]
             meta_ = meta
+            data = self._call(endpoint, params, meta_)
 
     def _merge_params(self, params_user, params_default):
         """Combine 'params_user' with 'params_default'"""
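The civitai hunk moves the first `_call()` out of the loop so that, when filtering by `username`, the first page can be inspected and the generator abandoned if the API silently ignored the filter. The same pattern reduced to a runnable stub (the `call` callable and data shapes are assumptions):

```python
def paginated_items(call, params, username=None):
    # The first call happens before the loop, so a username mismatch
    # can bail out before any item is yielded.
    data = call(params)
    if username and data["items"] \
            and data["items"][0]["user"]["username"] != username:
        return  # the API fell back to unfiltered results
    while True:
        yield from data["items"]
        cursor = data.get("nextCursor")
        if not cursor:
            return
        params["cursor"] = cursor
        data = call(params)

# Stubbed usage:
pages = iter([
    {"items": [{"user": {"username": "alice"}, "id": 1}], "nextCursor": None},
])
print(list(paginated_items(lambda p: next(pages), {}, "alice")))
```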
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 01965f3..34e65c5 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -228,7 +228,8 @@ class Extractor():
                     break
             finally:
-                Extractor.request_timestamp = time.time()
+                if interval:
+                    Extractor.request_timestamp = time.time()
 
             self.log.debug("%s (%s/%s)", msg, tries, retries+1)
             if tries > retries:
@@ -262,6 +263,7 @@ class Extractor():
     def request_location(self, url, **kwargs):
         kwargs.setdefault("method", "HEAD")
         kwargs.setdefault("allow_redirects", False)
+        kwargs.setdefault("interval", False)
         return self.request(url, **kwargs).headers.get("location", "")
 
     def request_json(self, url, **kwargs):
@@ -539,7 +541,7 @@ class Extractor():
         elif isinstance(cookies_source, str):
             path = util.expand_path(cookies_source)
             try:
-                with open(path) as fp:
+                with open(path, encoding="utf-8") as fp:
                     cookies = util.cookiestxt_load(fp)
             except ValueError as exc:
                 self.log.warning("cookies: Invalid Netscape cookies.txt file "
@@ -597,7 +599,7 @@ class Extractor():
         path_tmp = path + ".tmp"
         try:
-            with open(path_tmp, "w") as fp:
+            with open(path_tmp, "w", encoding="utf-8") as fp:
                 util.cookiestxt_store(fp, self.cookies)
             os.replace(path_tmp, path)
         except OSError as exc:
diff --git a/gallery_dl/extractor/dandadan.py b/gallery_dl/extractor/dandadan.py
new file mode 100644
index 0000000..48dc0b7
--- /dev/null
+++ b/gallery_dl/extractor/dandadan.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://dandadan.net/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?dandadan\.net"
+
+
+class DandadanBase():
+    """Base class for dandadan extractors"""
+    category = "dandadan"
+    root = "https://dandadan.net"
+
+
+class DandadanChapterExtractor(DandadanBase, ChapterExtractor):
+    """Extractor for dandadan manga chapters"""
+    pattern = rf"{BASE_PATTERN}(/manga/dandadan-chapter-([^/?#]+)/?)"
+    example = "https://dandadan.net/manga/dandadan-chapter-123/"
+
+    def metadata(self, page):
+        chapter, sep, minor = text.extr(
+            page, "hapter ", " - ").partition(".")
+        return {
+            "manga"        : "Dandadan",
+            "chapter"      : text.parse_int(chapter),
+            "chapter_minor": f"{sep}{minor}",
+            "lang"         : "en",
+        }
+
+    def images(self, page):
+        images = [
+            (text.extr(figure, 'src="', '"'), None)
+            for figure in text.extract_iter(page, "<figure", "</figure>")
+        ]
+
+        if images:
+            return images
+
+        return [
+            (src, None)
+            for src in text.extract_iter(
+                page, '<img decoding="async" class="aligncenter" src="', '"')
+        ]
+
+
+class DandadanMangaExtractor(DandadanBase, MangaExtractor):
+    """Extractor for dandadan manga"""
+    chapterclass = DandadanChapterExtractor
+    pattern = rf"{BASE_PATTERN}(/)"
+    example = "https://dandadan.net/"
+
+    def chapters(self, page):
+        data = {}
+        return [
+            (text.extr(post, 'href="', '"'), data)
+            for post in text.extract_iter(page, '<li id="su-post', "</li>")
+        ]
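The new dandadan extractor derives `chapter` and `chapter_minor` by partitioning the text found after "hapter " on its first dot. In isolation, with invented inputs:

```python
def split_chapter(number_text):
    # "123.5" -> chapter 123, minor ".5"; "123" -> chapter 123, minor ""
    chapter, sep, minor = number_text.partition(".")
    return int(chapter), f"{sep}{minor}"

print(split_chapter("123"))    # (123, '')
print(split_chapter("123.5"))  # (123, '.5')
```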
diff --git a/gallery_dl/extractor/fansly.py b/gallery_dl/extractor/fansly.py
index 8a6dbef..7138599 100644
--- a/gallery_dl/extractor/fansly.py
+++ b/gallery_dl/extractor/fansly.py
@@ -9,7 +9,7 @@
 """Extractors for https://fansly.com/"""
 
 from .common import Extractor, Message
-from .. import text, util
+from .. import text, util, exception
 import time
 
 BASE_PATTERN = r"(?:https?://)?(?:www\.)?fansly\.com"
@@ -43,6 +43,23 @@ class FanslyExtractor(Extractor):
             url = file["url"]
             yield Message.Url, url, text.nameext_from_url(url, post)
 
+    def posts(self):
+        creator, wall_id = self.groups
+        account = self.api.account(creator)
+        walls = account["walls"]
+
+        if wall_id:
+            for wall in walls:
+                if wall["id"] == wall_id:
+                    break
+            else:
+                raise exception.NotFoundError("wall")
+            walls = (wall,)
+
+        for wall in walls:
+            self.kwdict["wall"] = wall
+            yield from self.posts_wall(account, wall)
+
     def _extract_files(self, post):
         files = []
 
@@ -77,7 +94,7 @@ class FanslyExtractor(Extractor):
                 variants.append(media)
 
         formats = [
-            (type > 256, variant["width"], type, variant)
+            (variant["width"], (type-500 if type > 256 else type), variant)
            for variant in variants
             if variant.get("locations") and
             (type := variant["type"]) in self.formats
@@ -190,11 +207,8 @@ class FanslyCreatorPostsExtractor(FanslyExtractor):
     pattern = rf"{BASE_PATTERN}/([^/?#]+)/posts(?:/wall/(\d+))?"
     example = "https://fansly.com/CREATOR/posts"
 
-    def posts(self):
-        creator, wall_id = self.groups
-        account = self.api.account(creator)
-        return self.api.timeline_new(
-            account["id"], wall_id or account["walls"][0]["id"])
+    def posts_wall(self, account, wall):
+        return self.api.timeline_new(account["id"], wall["id"])
 
 
 class FanslyCreatorMediaExtractor(FanslyExtractor):
@@ -202,11 +216,8 @@ class FanslyCreatorMediaExtractor(FanslyExtractor):
     pattern = rf"{BASE_PATTERN}/([^/?#]+)/media(?:/wall/(\d+))?"
     example = "https://fansly.com/CREATOR/media"
 
-    def posts(self):
-        creator, wall_id = self.groups
-        account = self.api.account(creator)
-        return self.api.mediaoffers_location(
-            account["id"], wall_id or account["walls"][0]["id"])
+    def posts_wall(self, account, wall):
+        return self.api.mediaoffers_location(account["id"], wall["id"])
 
 
 class FanslyAPI():
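The `formats` ranking tuple in fansly.py changes so width sorts first and type codes above 256 (presumably preview or alternate variants) are mapped down by 500 rather than preferred outright. A toy demonstration of how the two tuple keys order made-up variants:

```python
# Made-up (width, type) variants; in fansly.py the tuple's last
# element is the variant dict itself and the best entry is taken.
variants = [(1080, 302), (1080, 1), (720, 2)]

def old_key(v):
    width, type = v
    return (type > 256, width, type)  # type codes > 256 always won

def new_key(v):
    width, type = v
    return (width, type - 500 if type > 256 else type)  # width first

print(max(variants, key=old_key))  # (1080, 302)
print(max(variants, key=new_key))  # (1080, 1)
```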
+ r"|(?:[^/?#]+/)?(?:p|tv|reels?()))" + r"/([^/?#]+)") example = "https://www.instagram.com/p/abcdefg/" def __init__(self, match): - if match[2] is not None: + if match[2] is not None or match[3] is not None: self.subcategory = "reel" InstagramExtractor.__init__(self, match) def posts(self): - share, reel, shortcode = self.groups + share, _, _, shortcode = self.groups if share is not None: url = text.ensure_http_scheme(self.url) headers = { diff --git a/gallery_dl/extractor/kemono.py b/gallery_dl/extractor/kemono.py index 1f70031..b4a8abc 100644 --- a/gallery_dl/extractor/kemono.py +++ b/gallery_dl/extractor/kemono.py @@ -93,8 +93,13 @@ class KemonoExtractor(Extractor): if creator_info is not None: key = f"{service}_{creator_id}" if key not in creator_info: - creator = creator_info[key] = self.api.creator_profile( - service, creator_id) + try: + creator = creator_info[key] = self.api.creator_profile( + service, creator_id) + except exception.HttpError: + self.log.warning("%s/%s/%s: 'Creator not found'", + service, creator_id, post["id"]) + creator = creator_info[key] = util.NONE else: creator = creator_info[key] diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index 30d6848..16eb650 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -450,13 +450,13 @@ def _manga_info(self, uuid): return { "manga" : (mattr["title"].get("en") or - next(iter(mattr["title"].values()))), + next(iter(mattr["title"].values()), "")), "manga_id": manga["id"], "manga_titles": [t.popitem()[1] for t in mattr.get("altTitles") or ()], "manga_date" : text.parse_datetime(mattr.get("createdAt")), "description" : (mattr["description"].get("en") or - next(iter(mattr["description"].values()))), + next(iter(mattr["description"].values()), "")), "demographic": mattr.get("publicationDemographic"), "origin": mattr.get("originalLanguage"), "status": mattr.get("status"), diff --git a/gallery_dl/extractor/naverblog.py b/gallery_dl/extractor/naverblog.py index 302cb63..b55e001 100644 --- a/gallery_dl/extractor/naverblog.py +++ b/gallery_dl/extractor/naverblog.py @@ -88,11 +88,11 @@ class NaverBlogPostExtractor(NaverBlogBase, GalleryExtractor): files.append((url, None)) def _extract_videos(self, files, page): - for module in text.extract_iter(page, " data-module='", "'></"): + for module in text.extract_iter(page, " data-module='", "'"): if '"v2_video"' not in module: continue - media = util.json_loads(module)["data"] try: + media = util.json_loads(module)["data"] self._extract_media(files, media) except Exception as exc: self.log.warning("%s: Failed to extract video '%s' (%s: %s)", diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py index 957e316..32ca528 100644 --- a/gallery_dl/extractor/poipiku.py +++ b/gallery_dl/extractor/poipiku.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2022-2023 Mike Fährmann +# Copyright 2022-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -21,102 +21,172 @@ class PoipikuExtractor(Extractor): directory_fmt = ("{category}", "{user_id} {user_name}") filename_fmt = "{post_id}_{num}.{extension}" archive_fmt = "{post_id}_{num}" + cookies_domain = "poipiku.com" + cookies_warning = True request_interval = (0.5, 1.5) def _init(self): self.cookies.set( - "LANG", "en", domain="poipiku.com") + "LANG", "en", domain=self.cookies_domain) self.cookies.set( - "POIPIKU_CONTENTS_VIEW_MODE", "1", 
domain="poipiku.com") + "POIPIKU_CONTENTS_VIEW_MODE", "1", domain=self.cookies_domain) + self.headers = { + "Accept" : "application/json, text/javascript, */*; q=0.01", + "X-Requested-With": "XMLHttpRequest", + "Origin" : self.root, + "Referer": None, + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "cors", + "Sec-Fetch-Site": "same-origin", + } + self.password = self.config("password", "") def items(self): - password = self.config("password", "") + if self.cookies_check(("POIPIKU_LK",)): + extract_files = self._extract_files_auth + logged_in = True + else: + extract_files = self._extract_files_noauth + logged_in = False + if self.cookies_warning: + self.log.warning("no 'POIPIKU_LK' cookie set") + PoipikuExtractor.cookies_warning = False for post_url in self.posts(): - parts = post_url.split("/") if post_url[0] == "/": - post_url = self.root + post_url + post_url = f"{self.root}{post_url}" page = self.request(post_url).text extr = text.extract_from(page) - + parts = post_url.rsplit("/", 2) post = { "post_category": extr("<title>[", "]"), "count" : text.parse_int(extr("(", " ")), - "post_id" : parts[-1].partition(".")[0], - "user_id" : parts[-2], + "post_id" : parts[2].partition(".")[0], + "user_id" : parts[1], "user_name" : text.unescape(extr( '<h2 class="UserInfoUserName">', '</').rpartition(">")[2]), "description": text.unescape(extr( 'class="IllustItemDesc" >', '</h1>')), + "warning" : False, + "password" : False, + "requires" : None, + "original" : logged_in, "_http_headers": {"Referer": post_url}, } + thumb = self._extract_thumb(post, extr) + self.headers["Referer"] = post_url + + if post["requires"] and not post["password"] and extr( + "PasswordIcon", ">"): + post["password"] = True + yield Message.Directory, post - post["num"] = warning = 0 - - while True: - thumb = extr('class="IllustItemThumbImg" src="', '"') - if not thumb: - break - elif thumb.startswith(("//img.poipiku.com/img/", "/img/")): - if "/warning" in thumb: - warning = True - self.log.debug("%s: %s", post["post_id"], thumb) - continue - post["num"] += 1 - url = text.ensure_http_scheme(thumb[:-8]).replace( - "//img.", "//img-org.", 1) + for post["num"], url in enumerate(extract_files( + post, thumb, extr), 1): yield Message.Url, url, text.nameext_from_url(url, post) - if not warning and not extr('ShowAppendFile', '<'): - continue + def _extract_thumb(self, post, extr): + thumb = "" - url = self.root + "/f/ShowAppendFileF.jsp" - headers = { - "Accept" : "application/json, text/javascript, */*; q=0.01", - "X-Requested-With": "XMLHttpRequest", - "Origin" : self.root, - "Referer": post_url, - } - data = { - "UID": post["user_id"], - "IID": post["post_id"], - "PAS": password, - "MD" : "0", - "TWF": "-1", - } - resp = self.request_json( - url, method="POST", headers=headers, data=data) - - page = resp["html"] - if (resp.get("result_num") or 0) < 0: - self.log.warning("%s: '%s'", - post["post_id"], page.replace("<br/>", " ")) - - for thumb in text.extract_iter( - page, 'class="IllustItemThumbImg" src="', '"'): - post["num"] += 1 - url = text.ensure_http_scheme(thumb[:-8]).replace( - "//img.", "//img-org.", 1) - yield Message.Url, url, text.nameext_from_url(url, post) + while True: + img = extr('class="IllustItemThumbImg" src="', '"') + + if not img: + return thumb + elif img.startswith("https://cdn.poipiku.com/img/"): + self.log.debug("%s: %s", post["post_id"], img) + type = text.rextr(img, "/", ".") + if type == "warning": + post["warning"] = True + elif type == "publish_pass": + post["password"] = True + elif type == 
"publish_login": + post["requires"] = "login" + elif type == "publish_follower": + post["requires"] = "follow" + elif type == "publish_t_rt": + post["requires"] = "retweet" + elif img.startswith(( + "https://img.poipiku.com/img/", + "//img.poipiku.com/img/", + "/img/", + )): + self.log.debug("%s: %s", post["post_id"], img) + if "/warning" in img: + post["warning"] = True + else: + thumb = img + + def _extract_files_auth(self, post, thumb, extr): + data = self._show_illust_detail(post) + + if data.get("error_code"): + data = self._show_append_file(post) + html = data["html"] + self.log.warning("%s: '%s'", + post["post_id"], html.replace("<br/>", " ")) + return () + return text.extract_iter(data["html"], 'src="', '"') + + def _extract_files_noauth(self, post, thumb, extr): + if thumb: + if not extr('ShowAppendFile', '<'): + return (thumb,) + files = [thumb] + else: + files = [] + + data = self._show_append_file(post) + html = data["html"] + if (data.get("result_num") or 0) < 0: + self.log.warning("%s: '%s'", + post["post_id"], html.replace("<br/>", " ")) + + files.extend(text.extract_iter( + html, 'class="IllustItemThumbImg" src="', '"')) + return files + + def _show_illust_detail(self, post): + url = f"{self.root}/f/ShowIllustDetailF.jsp" + data = { + "ID" : post["user_id"], + "TD" : post["post_id"], + "AD" : "-1", + "PAS": self.password, + } + return self.request_json( + url, method="POST", headers=self.headers, data=data, + interval=False) + + def _show_append_file(self, post): + url = f"{self.root}/f/ShowAppendFileF.jsp" + data = { + "UID": post["user_id"], + "IID": post["post_id"], + "PAS": self.password, + "MD" : "0", + "TWF": "-1", + } + return self.request_json( + url, method="POST", headers=self.headers, data=data, + interval=False) class PoipikuUserExtractor(PoipikuExtractor): """Extractor for posts from a poipiku user""" subcategory = "user" - pattern = (BASE_PATTERN + r"/(?:IllustListPcV\.jsp\?PG=(\d+)&ID=)?" - r"(\d+)/?(?:$|[?&#])") + pattern = (rf"{BASE_PATTERN}/(?:IllustListPcV\.jsp\?PG=(\d+)&ID=)?" 
+ rf"(\d+)/?(?:$|[?&#])") example = "https://poipiku.com/12345/" - def __init__(self, match): - PoipikuExtractor.__init__(self, match) - self._page, self.user_id = match.groups() - def posts(self): - url = self.root + "/IllustListPcV.jsp" + pnum, user_id = self.groups + + url = f"{self.root}/IllustListPcV.jsp" params = { - "PG" : text.parse_int(self._page, 0), - "ID" : self.user_id, + "PG" : text.parse_int(pnum, 0), + "ID" : user_id, "KWD": "", } @@ -137,12 +207,9 @@ class PoipikuUserExtractor(PoipikuExtractor): class PoipikuPostExtractor(PoipikuExtractor): """Extractor for a poipiku post""" subcategory = "post" - pattern = BASE_PATTERN + r"/(\d+)/(\d+)" + pattern = rf"{BASE_PATTERN}/(\d+)/(\d+)" example = "https://poipiku.com/12345/12345.html" - def __init__(self, match): - PoipikuExtractor.__init__(self, match) - self.user_id, self.post_id = match.groups() - def posts(self): - return (f"/{self.user_id}/{self.post_id}.html",) + user_id, post_id = self.groups + return (f"/{user_id}/{post_id}.html",) diff --git a/gallery_dl/extractor/recursive.py b/gallery_dl/extractor/recursive.py index 4762fa5..c553fec 100644 --- a/gallery_dl/extractor/recursive.py +++ b/gallery_dl/extractor/recursive.py @@ -22,7 +22,7 @@ class RecursiveExtractor(Extractor): url = self.url.partition(":")[2] if url.startswith("file://"): - with open(url[7:]) as fp: + with open(url[7:], encoding="utf-8") as fp: page = fp.read() else: page = self.request(text.ensure_http_scheme(url)).text diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index e7df4a3..bf125a6 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -1745,18 +1745,7 @@ class TwitterAPI(): retry = True elif "Could not authenticate you" in msg: - if not self.extractor.config("relogin", True): - continue - - username, password = self.extractor._get_auth_info() - if not username: - continue - - _login_impl.invalidate(username) - self.extractor.cookies_update( - _login_impl(self.extractor, username, password)) - self.__init__(self.extractor) - retry = True + raise exception.AbortExtraction(f"'{msg}'") elif msg.lower().startswith("timeout"): retry = True @@ -2195,179 +2184,6 @@ class TwitterAPI(): @cache(maxage=365*86400, keyarg=1) def _login_impl(extr, username, password): - - def process(data, params=None): - response = extr.request( - url, params=params, headers=headers, json=data, - method="POST", fatal=None) - - # update 'x-csrf-token' header (#5945) - if csrf_token := response.cookies.get("ct0"): - headers["x-csrf-token"] = csrf_token - - try: - data = response.json() - except ValueError: - data = {"errors": ({"message": "Invalid response"},)} - else: - if response.status_code < 400: - try: - return (data["flow_token"], - data["subtasks"][0]["subtask_id"]) - except LookupError: - pass - - errors = [] - for error in data.get("errors") or (): - msg = error.get("message") - errors.append(f'"{msg}"' if msg else "Unknown error") - extr.log.debug(response.text) - raise exception.AuthenticationError(", ".join(errors)) - - cookies = extr.cookies - cookies.clear() - api = TwitterAPI(extr) - api._authenticate_guest() - - url = "https://api.x.com/1.1/onboarding/task.json" - params = {"flow_name": "login"} - headers = api.headers - - extr.log.info("Logging in as %s", username) - - # init - data = { - "input_flow_data": { - "flow_context": { - "debug_overrides": {}, - "start_location": {"location": "unknown"}, - }, - }, - "subtask_versions": { - "action_list": 2, - "alert_dialog": 1, - "app_download_cta": 1, - 
"check_logged_in_account": 1, - "choice_selection": 3, - "contacts_live_sync_permission_prompt": 0, - "cta": 7, - "email_verification": 2, - "end_flow": 1, - "enter_date": 1, - "enter_email": 2, - "enter_password": 5, - "enter_phone": 2, - "enter_recaptcha": 1, - "enter_text": 5, - "enter_username": 2, - "generic_urt": 3, - "in_app_notification": 1, - "interest_picker": 3, - "js_instrumentation": 1, - "menu_dialog": 1, - "notifications_permission_prompt": 2, - "open_account": 2, - "open_home_timeline": 1, - "open_link": 1, - "phone_verification": 4, - "privacy_options": 1, - "security_key": 3, - "select_avatar": 4, - "select_banner": 2, - "settings_list": 7, - "show_code": 1, - "sign_up": 2, - "sign_up_review": 4, - "tweet_selection_urt": 1, - "update_users": 1, - "upload_media": 1, - "user_recommendations_list": 4, - "user_recommendations_urt": 1, - "wait_spinner": 3, - "web_modal": 1, - }, - } - - flow_token, subtask = process(data, params) - while not cookies.get("auth_token"): - if subtask == "LoginJsInstrumentationSubtask": - data = { - "js_instrumentation": { - "response": "{}", - "link": "next_link", - }, - } - elif subtask == "LoginEnterUserIdentifierSSO": - data = { - "settings_list": { - "setting_responses": [ - { - "key": "user_identifier", - "response_data": { - "text_data": {"result": username}, - }, - }, - ], - "link": "next_link", - }, - } - elif subtask == "LoginEnterPassword": - data = { - "enter_password": { - "password": password, - "link": "next_link", - }, - } - elif subtask == "LoginEnterAlternateIdentifierSubtask": - alt = extr.config("username-alt") or extr.input( - "Alternate Identifier (username, email, phone number): ") - data = { - "enter_text": { - "text": alt, - "link": "next_link", - }, - } - elif subtask == "LoginTwoFactorAuthChallenge": - data = { - "enter_text": { - "text": extr.input("2FA Token: "), - "link": "next_link", - }, - } - elif subtask == "LoginAcid": - data = { - "enter_text": { - "text": extr.input("Email Verification Code: "), - "link": "next_link", - }, - } - elif subtask == "AccountDuplicationCheck": - data = { - "check_logged_in_account": { - "link": "AccountDuplicationCheck_false", - }, - } - elif subtask == "ArkoseLogin": - raise exception.AuthenticationError("Login requires CAPTCHA") - elif subtask == "DenyLoginSubtask": - raise exception.AuthenticationError("Login rejected as suspicious") - elif subtask == "LoginSuccessSubtask": - raise exception.AuthenticationError( - "No 'auth_token' cookie received") - else: - raise exception.AbortExtraction(f"Unrecognized subtask {subtask}") - - inputs = {"subtask_id": subtask} - inputs.update(data) - data = { - "flow_token": flow_token, - "subtask_inputs": [inputs], - } - - extr.sleep(random.uniform(1.0, 3.0), f"login ({subtask})") - flow_token, subtask = process(data) - - return { - cookie.name: cookie.value - for cookie in extr.cookies - } + extr.log.error("Login with username & password is no longer supported. 
" + "Use browser cookies instead.") + return {} diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py index 49a94b5..79120c1 100644 --- a/gallery_dl/extractor/webtoons.py +++ b/gallery_dl/extractor/webtoons.py @@ -73,7 +73,7 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor): else: comic_name = episode_name = "" - if extr('<span class="tx _btnOpenEpisodeList ', '"'): + if extr('<span class="tx _btnOpenEpisodeLis', '"'): episode = extr(">#", "<") else: episode = "" diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 07bed79..3c0f077 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -14,7 +14,7 @@ from ..cache import cache import random BASE_PATTERN = r"(?:https?://)?(?:www\.|m\.)?weibo\.c(?:om|n)" -USER_PATTERN = BASE_PATTERN + r"/(?:(u|n|p(?:rofile)?)/)?([^/?#]+)(?:/home)?" +USER_PATTERN = rf"{BASE_PATTERN}/(?:(u|n|p(?:rofile)?)/)?([^/?#]+)(?:/home)?" class WeiboExtractor(Extractor): @@ -22,6 +22,8 @@ class WeiboExtractor(Extractor): directory_fmt = ("{category}", "{user[screen_name]}") filename_fmt = "{status[id]}_{num:>02}.{extension}" archive_fmt = "{status[id]}_{num}" + cookies_domain = ".weibo.com" + cookies_names = ("SUB", "SUBP") root = "https://weibo.com" request_interval = (1.0, 2.0) @@ -38,8 +40,23 @@ class WeiboExtractor(Extractor): self.gifs_video = (self.gifs == "video") cookies = _cookie_cache() - if cookies is not None: - self.cookies.update(cookies) + if cookies is None: + self.logged_in = self.cookies_check( + self.cookies_names, self.cookies_domain) + return + + domain = self.cookies_domain + cookies = {c.name: c for c in cookies if c.domain == domain} + for cookie in self.cookies: + if cookie.domain == domain and cookie.name in cookies: + del cookies[cookie.name] + if not cookies: + self.logged_in = True + return + + self.logged_in = False + for cookie in cookies.values(): + self.cookies.set_cookie(cookie) def request(self, url, **kwargs): response = Extractor.request(self, url, **kwargs) @@ -148,6 +165,10 @@ class WeiboExtractor(Extractor): self.log.debug("%s: Ignoring 'movie' video", status["id"]) def _extract_video(self, info): + if info.get("live_status") == 1: + self.log.debug("Skipping ongoing live stream") + return {"url": ""} + try: media = max(info["playback_list"], key=lambda m: m["meta"]["quality_index"]) @@ -182,7 +203,7 @@ class WeiboExtractor(Extractor): return self.request_json(url)["data"]["user"]["idstr"] def _pagination(self, endpoint, params): - url = self.root + "/ajax" + endpoint + url = f"{self.root}/ajax{endpoint}" headers = { "X-Requested-With": "XMLHttpRequest", "X-XSRF-TOKEN": None, @@ -201,8 +222,12 @@ class WeiboExtractor(Extractor): raise exception.AbortExtraction( f'"{data.get("msg") or "unknown error"}"') - data = data["data"] - statuses = data["list"] + try: + data = data["data"] + statuses = data["list"] + except KeyError: + return + yield from statuses # videos, newvideo @@ -215,6 +240,8 @@ class WeiboExtractor(Extractor): # album if since_id := data.get("since_id"): params["sinceid"] = since_id + if "page" in params: + params["page"] += 1 continue # home, article @@ -263,7 +290,7 @@ class WeiboExtractor(Extractor): class WeiboUserExtractor(WeiboExtractor): """Extractor for weibo user profiles""" subcategory = "user" - pattern = USER_PATTERN + r"(?:$|#)" + pattern = rf"{USER_PATTERN}(?:$|#)" example = "https://weibo.com/USER" # do NOT override 'initialize()' @@ -274,18 +301,18 @@ class WeiboUserExtractor(WeiboExtractor): def 
diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py
index cc9af11..5246f66 100644
--- a/gallery_dl/formatter.py
+++ b/gallery_dl/formatter.py
@@ -40,7 +40,15 @@ def parse(format_string, default=NONE, fmt=format):
     else:
         cls = StringFormatter
 
-    formatter = _CACHE[key] = cls(format_string, default, fmt)
+    try:
+        formatter = _CACHE[key] = cls(format_string, default, fmt)
+    except Exception as exc:
+        import logging
+        logging.getLogger("formatter").error(
+            "Invalid format string '%s' (%s: %s)",
+            format_string, exc.__class__.__name__, exc)
+        raise
+
     return formatter
 
 
@@ -259,7 +267,7 @@ class TemplateFormatter(StringFormatter):
     """Read format_string from file"""
 
     def __init__(self, path, default=NONE, fmt=format):
-        with open(util.expand_path(path)) as fp:
+        with open(util.expand_path(path), encoding="utf-8") as fp:
             format_string = fp.read()
         StringFormatter.__init__(self, format_string, default, fmt)
 
@@ -268,7 +276,7 @@ class TemplateFStringFormatter(FStringFormatter):
     """Read f-string from file"""
 
     def __init__(self, path, default=NONE, fmt=None):
-        with open(util.expand_path(path)) as fp:
+        with open(util.expand_path(path), encoding="utf-8") as fp:
             fstring = fp.read()
         FStringFormatter.__init__(self, fstring, default, fmt)
 
@@ -277,7 +285,7 @@ class TemplateJinjaFormatter(JinjaFormatter):
     """Generate text by evaluating a Jinja template"""
 
     def __init__(self, path, default=NONE, fmt=None):
-        with open(util.expand_path(path)) as fp:
+        with open(util.expand_path(path), encoding="utf-8") as fp:
             source = fp.read()
         JinjaFormatter.__init__(self, source, default, fmt)
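`formatter.parse()` now reports which format string failed to compile before re-raising, instead of surfacing a bare exception from inside the cache update. The shape of that error path, with a stand-in "compile" step based on `string.Formatter` (the real code constructs a `StringFormatter`):

```python
import logging
import string

_CACHE = {}

def parse(format_string):
    # Pre-parse the replacement fields so malformed strings fail
    # here, loudly, rather than at render time.
    if format_string in _CACHE:
        return _CACHE[format_string]
    try:
        fields = _CACHE[format_string] = list(
            string.Formatter().parse(format_string))
    except Exception as exc:
        logging.getLogger("formatter").error(
            "Invalid format string '%s' (%s: %s)",
            format_string, exc.__class__.__name__, exc)
        raise
    return fields

logging.basicConfig()
parse("{id}_{num}")   # fine
try:
    parse("{id")      # ValueError: logged, then re-raised
except ValueError:
    pass
```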
image["timestamp"], "%Y-%m-%dT%H:%M:%SZ") @@ -92,7 +90,16 @@ class WikimediaExtractor(BaseExtractor): self.prepare_info(info) yield Message.Directory, info - for info["num"], image in enumerate(images, 1): + num = 0 + for image in images: + # https://www.mediawiki.org/wiki/Release_notes/1.34 + if "filemissing" in image: + self.log.warning( + "File %s (or its revision) is missing", + image["canonicaltitle"].partition(":")[2]) + continue + num += 1 + image["num"] = num self.prepare_image(image) image.update(info) yield Message.Url, image["url"], image diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index 98c9331..7bff83b 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -23,7 +23,7 @@ class ZerochanExtractor(BooruExtractor): filename_fmt = "{id}.{extension}" archive_fmt = "{id}" page_start = 1 - per_page = 250 + per_page = 200 cookies_domain = ".zerochan.net" cookies_names = ("z_id", "z_hash") useragent = util.USERAGENT @@ -188,10 +188,11 @@ class ZerochanTagExtractor(ZerochanExtractor): def posts_html(self): url = self.root + "/" + self.search_tag - params = text.parse_query(self.query) - params["p"] = text.parse_int(params.get("p"), self.page_start) metadata = self.config("metadata") + params = text.parse_query(self.query, empty=True) + params["p"] = text.parse_int(params.get("p"), self.page_start) + while True: try: page = self.request( @@ -231,11 +232,11 @@ class ZerochanTagExtractor(ZerochanExtractor): def posts_api(self): url = self.root + "/" + self.search_tag metadata = self.config("metadata") - params = { - "json": "1", - "l" : self.per_page, - "p" : self.page_start, - } + + params = text.parse_query(self.query, empty=True) + params["p"] = text.parse_int(params.get("p"), self.page_start) + params.setdefault("l", self.per_page) + params["json"] = "1" while True: try: diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py index cc9af11..5246f66 100644 --- a/gallery_dl/formatter.py +++ b/gallery_dl/formatter.py @@ -40,7 +40,15 @@ def parse(format_string, default=NONE, fmt=format): else: cls = StringFormatter - formatter = _CACHE[key] = cls(format_string, default, fmt) + try: + formatter = _CACHE[key] = cls(format_string, default, fmt) + except Exception as exc: + import logging + logging.getLogger("formatter").error( + "Invalid format string '%s' (%s: %s)", + format_string, exc.__class__.__name__, exc) + raise + return formatter @@ -259,7 +267,7 @@ class TemplateFormatter(StringFormatter): """Read format_string from file""" def __init__(self, path, default=NONE, fmt=format): - with open(util.expand_path(path)) as fp: + with open(util.expand_path(path), encoding="utf-8") as fp: format_string = fp.read() StringFormatter.__init__(self, format_string, default, fmt) @@ -268,7 +276,7 @@ class TemplateFStringFormatter(FStringFormatter): """Read f-string from file""" def __init__(self, path, default=NONE, fmt=None): - with open(util.expand_path(path)) as fp: + with open(util.expand_path(path), encoding="utf-8") as fp: fstring = fp.read() FStringFormatter.__init__(self, fstring, default, fmt) @@ -277,7 +285,7 @@ class TemplateJinjaFormatter(JinjaFormatter): """Generate text by evaluating a Jinja template""" def __init__(self, path, default=NONE, fmt=None): - with open(util.expand_path(path)) as fp: + with open(util.expand_path(path), encoding="utf-8") as fp: source = fp.read() JinjaFormatter.__init__(self, source, default, fmt) diff --git a/gallery_dl/path.py b/gallery_dl/path.py index 6381622..763fb55 100644 --- 
diff --git a/gallery_dl/postprocessor/exec.py b/gallery_dl/postprocessor/exec.py
index ef11bff..3b0ab22 100644
--- a/gallery_dl/postprocessor/exec.py
+++ b/gallery_dl/postprocessor/exec.py
@@ -55,7 +55,8 @@ class ExecPP(PostProcessor):
 
     def _prepare_cmd(self, cmd):
         if isinstance(cmd, str):
-            self._sub = util.re(r"\{(_directory|_filename|_path|)\}").sub
+            self._sub = util.re(
+                r"\{(_directory|_filename|_(?:temp)?path|)\}").sub
             return self.exec_string, cmd
         else:
             return self.exec_list, [formatter.parse(arg) for arg in cmd]
@@ -69,6 +70,7 @@ class ExecPP(PostProcessor):
 
         kwdict["_directory"] = pathfmt.realdirectory
         kwdict["_filename"] = pathfmt.filename
+        kwdict["_temppath"] = pathfmt.temppath
         kwdict["_path"] = pathfmt.realpath
 
         args = [arg.format_map(kwdict) for arg in self.args]
@@ -131,6 +133,8 @@ class ExecPP(PostProcessor):
             return quote(self.pathfmt.realdirectory)
         if name == "_filename":
             return quote(self.pathfmt.filename)
+        if name == "_temppath":
+            return quote(self.pathfmt.temppath)
         return quote(self.pathfmt.realpath)
diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py
index 33ebb75..1a55e22 100644
--- a/gallery_dl/postprocessor/ugoira.py
+++ b/gallery_dl/postprocessor/ugoira.py
@@ -386,7 +386,7 @@ class UgoiraPP(PostProcessor):
             content.append("")
 
         ffconcat = tempdir + "/ffconcat.txt"
-        with open(ffconcat, "w") as fp:
+        with open(ffconcat, "w", encoding="utf-8") as fp:
             fp.write("\n".join(content))
         return ffconcat
 
@@ -401,7 +401,7 @@ class UgoiraPP(PostProcessor):
             content.append("")
 
         timecodes = tempdir + "/timecodes.tc"
-        with open(timecodes, "w") as fp:
+        with open(timecodes, "w", encoding="utf-8") as fp:
             fp.write("\n".join(content))
         return timecodes
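The exec postprocessor gains a `{_temppath}` placeholder next to `{_directory}`, `{_filename}`, and `{_path}`, expanding to `pathfmt.temppath`, i.e. the downloaded file before it is renamed into place. A configuration sketch, shown as the Python dict the JSON config parses to (the command itself is a made-up example):

```python
# Hypothetical entry for a postprocessors list in gallery-dl's config;
# {_temppath} is substituted with pathfmt.temppath at run time.
postprocessor = {
    "name"   : "exec",
    "command": "exiftool -q {_temppath}",  # made-up command
}
```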
"/timecodes.tc" - with open(timecodes, "w") as fp: + with open(timecodes, "w", encoding="utf-8") as fp: fp.write("\n".join(content)) return timecodes diff --git a/gallery_dl/text.py b/gallery_dl/text.py index a7539ad..98bba48 100644 --- a/gallery_dl/text.py +++ b/gallery_dl/text.py @@ -267,7 +267,7 @@ def parse_float(value, default=0.0): return default -def parse_query(qs): +def parse_query(qs, empty=False): """Parse a query string into name-value pairs Ignore values whose name has been seen before @@ -279,7 +279,7 @@ def parse_query(qs): try: for name_value in qs.split("&"): name, eq, value = name_value.partition("=") - if eq: + if eq or empty: name = unquote(name.replace("+", " ")) if name not in result: result[name] = unquote(value.replace("+", " ")) diff --git a/gallery_dl/version.py b/gallery_dl/version.py index d3e0277..bc70f74 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,5 +6,5 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.30.9" +__version__ = "1.30.10" __variant__ = None diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py index 0296498..b7ee1ca 100644 --- a/gallery_dl/ytdl.py +++ b/gallery_dl/ytdl.py @@ -31,7 +31,7 @@ def construct_YoutubeDL(module, obj, user_opts, system_opts=None): module.YoutubeDL.deprecation_warning = util.false if cfg := config("config-file"): - with open(util.expand_path(cfg)) as fp: + with open(util.expand_path(cfg), encoding="utf-8") as fp: contents = fp.read() argv = shlex.split(contents, comments=True) |
