| author    | 2024-09-28 20:01:25 -0400 |
|---|---|
| committer | 2024-09-28 20:01:25 -0400 |
| commit    | 1a457ed68769880ab7760e0746f0cbbd9ca00487 (patch) |
| tree      | a5e2f36fa6537e24a7a8851dab900ea03efdbd00 /gallery_dl |
| parent    | 1f3ffe32342852fd9ea9e7704022488f3a1222bd (diff) |
New upstream version 1.27.5 (tag: upstream/1.27.5)
Diffstat (limited to 'gallery_dl')
27 files changed, 1438 insertions, 166 deletions
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
index 663fe99..7a9e0be 100644
--- a/gallery_dl/__init__.py
+++ b/gallery_dl/__init__.py
@@ -202,12 +202,18 @@ def main():
             extractor.modules.append("")
             sys.stdout.write("\n".join(extractor.modules))

-        elif args.list_extractors:
+        elif args.list_extractors is not None:
             write = sys.stdout.write
             fmt = ("{}{}\nCategory: {} - Subcategory: {}"
                    "\nExample : {}\n\n").format
-            for extr in extractor.extractors():
+
+            extractors = extractor.extractors()
+            if args.list_extractors:
+                fltr = util.build_extractor_filter(
+                    args.list_extractors, negate=False)
+                extractors = filter(fltr, extractors)
+
+            for extr in extractors:
                 write(fmt(
                     extr.__name__,
                     "\n" + extr.__doc__ if extr.__doc__ else "",
diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py
index deb7c7b..0ffd29a 100644
--- a/gallery_dl/cookies.py
+++ b/gallery_dl/cookies.py
@@ -50,21 +50,27 @@ def load_cookies_firefox(cookiejar, profile=None, container=None, domain=None):
         sql = ("SELECT name, value, host, path, isSecure, expiry "
                "FROM moz_cookies")
-        parameters = ()
+        conditions = []
+        parameters = []

         if container_id is False:
-            sql += " WHERE NOT INSTR(originAttributes,'userContextId=')"
+            conditions.append("NOT INSTR(originAttributes,'userContextId=')")
         elif container_id:
-            sql += " WHERE originAttributes LIKE ? OR originAttributes LIKE ?"
+            conditions.append(
+                "originAttributes LIKE ? OR originAttributes LIKE ?")
             uid = "%userContextId={}".format(container_id)
-            parameters = (uid, uid + "&%")
-        elif domain:
+            parameters += (uid, uid + "&%")
+
+        if domain:
             if domain[0] == ".":
-                sql += " WHERE host == ? OR host LIKE ?"
-                parameters = (domain[1:], "%" + domain)
+                conditions.append("host == ? OR host LIKE ?")
+                parameters += (domain[1:], "%" + domain)
             else:
-                sql += " WHERE host == ? OR host == ?"
-                parameters = (domain, "." + domain)
+                conditions.append("host == ? OR host == ?")
+                parameters += (domain, "." + domain)
+
+        if conditions:
+            sql = "{} WHERE ( {} )".format(sql, " ) AND ( ".join(conditions))

         set_cookie = cookiejar.set_cookie
         for name, value, domain, path, secure, expires in db.execute(
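The rewritten query builder above composes each filter as its own parenthesized clause, so the container and domain filters can now apply together instead of being mutually exclusive (`elif domain` became `if domain`). A standalone sketch of the same composition logic, with the condition strings taken from the hunk:

```python
# Sketch of the WHERE-clause composition used above: each filter
# contributes one condition string, and all of them are ANDed together.
def build_firefox_cookie_query(container_id=None, domain=None):
    sql = ("SELECT name, value, host, path, isSecure, expiry "
           "FROM moz_cookies")
    conditions = []
    parameters = []

    if container_id is False:
        # only cookies outside of any Firefox container
        conditions.append("NOT INSTR(originAttributes,'userContextId=')")
    elif container_id:
        conditions.append("originAttributes LIKE ? OR originAttributes LIKE ?")
        uid = "%userContextId={}".format(container_id)
        parameters += (uid, uid + "&%")

    if domain:  # no longer 'elif': may combine with a container filter
        if domain[0] == ".":
            conditions.append("host == ? OR host LIKE ?")
            parameters += (domain[1:], "%" + domain)
        else:
            conditions.append("host == ? OR host == ?")
            parameters += (domain, "." + domain)

    if conditions:
        sql = "{} WHERE ( {} )".format(sql, " ) AND ( ".join(conditions))
    return sql, parameters

# Both filters now apply at once:
sql, params = build_firefox_cookie_query(container_id=3, domain=".example.org")
print(sql)
# ... WHERE ( originAttributes LIKE ? OR originAttributes LIKE ? )
#     AND ( host == ? OR host LIKE ? )
```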
diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py
index b3bec21..950a72f 100644
--- a/gallery_dl/downloader/ytdl.py
+++ b/gallery_dl/downloader/ytdl.py
@@ -45,7 +45,7 @@ class YoutubeDLDownloader(DownloaderBase):
             except (ImportError, SyntaxError) as exc:
                 self.log.error("Cannot import module '%s'",
                                getattr(exc, "name", ""))
-                self.log.debug("", exc_info=True)
+                self.log.debug("", exc_info=exc)
                 self.download = lambda u, p: False
                 return False
             self.ytdl_instance = ytdl_instance = ytdl.construct_YoutubeDL(
@@ -64,8 +64,8 @@ class YoutubeDLDownloader(DownloaderBase):
         if not info_dict:
             try:
                 info_dict = ytdl_instance.extract_info(url[5:], download=False)
-            except Exception:
-                pass
+            except Exception as exc:
+                self.log.debug("", exc_info=exc)

         if not info_dict:
             return False
@@ -120,8 +120,8 @@ class YoutubeDLDownloader(DownloaderBase):
         self.out.start(pathfmt.path)
         try:
             ytdl_instance.process_info(info_dict)
-        except Exception:
-            self.log.debug("Traceback", exc_info=True)
+        except Exception as exc:
+            self.log.debug("", exc_info=exc)
             return False
         return True
diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py
index a5e8b27..afa3a69 100644
--- a/gallery_dl/extractor/8chan.py
+++ b/gallery_dl/extractor/8chan.py
@@ -9,9 +9,9 @@
 """Extractors for https://8chan.moe/"""

 from .common import Extractor, Message
-from .. import text
+from .. import text, util
 from ..cache import memcache
-from datetime import datetime, timedelta
+from datetime import timedelta
 import itertools

 BASE_PATTERN = r"(?:https?://)?8chan\.(moe|se|cc)"
@@ -27,21 +27,23 @@ class _8chanExtractor(Extractor):
         Extractor.__init__(self, match)

     def _init(self):
-        self.cookies.set(
-            "TOS20240718", "1", domain=self.root.rpartition("/")[2])
+        now = util.datetime_utcnow()
+        domain = self.root.rpartition("/")[2]
+        self.cookies.set("TOS20240928", "1", domain=domain)
+        self.cookies.set(now.strftime("TOS%Y%m%d"), "1", domain=domain)

     @memcache()
     def cookies_prepare(self):
         # fetch captcha cookies
         # (necessary to download without getting interrupted)
-        now = datetime.utcnow()
+        now = util.datetime_utcnow()
         url = self.root + "/captcha.js"
         params = {"d": now.strftime("%a %b %d %Y %H:%M:%S GMT+0000 (UTC)")}
         self.request(url, params=params).content

         # adjust cookies
         # - remove 'expires' timestamp
-        # - move 'captchaexpiration' value forward by 1 month)
+        # - move 'captchaexpiration' value forward by 1 month
         domain = self.root.rpartition("/")[2]
         for cookie in self.cookies:
             if cookie.domain.endswith(domain):
@@ -79,7 +81,7 @@ class _8chanThreadExtractor(_8chanExtractor):
             self.cookies = self.cookies_prepare()
         except Exception as exc:
             self.log.debug("Failed to fetch captcha cookies: %s: %s",
-                           exc.__class__.__name__, exc, exc_info=True)
+                           exc.__class__.__name__, exc, exc_info=exc)

         # download files
         posts = thread.pop("posts", ())
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index e103cb1..826771c 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -23,6 +23,7 @@ modules = [
     "8muses",
     "adultempire",
     "agnph",
+    "ao3",
     "architizer",
     "artstation",
     "aryion",
@@ -35,6 +36,8 @@ modules = [
     "catbox",
     "chevereto",
     "cien",
+    "civitai",
+    "cohost",
     "comicvine",
     "cyberdrop",
     "danbooru",
+ r"a(?:rchiveofourown|o3)\.(?:org|com|net)") + + +class Ao3Extractor(Extractor): + """Base class for ao3 extractors""" + category = "ao3" + root = "https://archiveofourown.org" + categorytransfer = True + cookies_domain = ".archiveofourown.org" + cookies_names = ("remember_user_token",) + request_interval = (0.5, 1.5) + + def items(self): + self.login() + + base = self.root + "/works/" + data = {"_extractor": Ao3WorkExtractor} + + for work_id in self.works(): + yield Message.Queue, base + work_id, data + + def works(self): + return self._pagination(self.groups[0]) + + def login(self): + if self.cookies_check(self.cookies_names): + return + + username, password = self._get_auth_info() + if username: + return self.cookies_update(self._login_impl(username, password)) + + @cache(maxage=90*86400, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + + url = self.root + "/users/login" + page = self.request(url).text + + pos = page.find('id="loginform"') + token = text.extract( + page, ' name="authenticity_token" value="', '"', pos)[0] + if not token: + self.log.error("Unable to extract 'authenticity_token'") + + data = { + "authenticity_token": text.unescape(token), + "user[login]" : username, + "user[password]" : password, + "user[remember_me]" : "1", + "commit" : "Log In", + } + + response = self.request(url, method="POST", data=data) + if not response.history: + raise exception.AuthenticationError() + + remember = response.history[0].cookies.get("remember_user_token") + if not remember: + raise exception.AuthenticationError() + + return { + "remember_user_token": remember, + "user_credentials" : "1", + } + + def _pagination(self, path, needle='<li id="work_'): + while True: + page = self.request(self.root + path).text + yield from text.extract_iter(page, needle, '"') + path = text.extr(page, '<a rel="next" href="', '"') + if not path: + return + path = text.unescape(path) + + +class Ao3WorkExtractor(Ao3Extractor): + """Extractor for an AO3 work""" + subcategory = "work" + directory_fmt = ("{category}", "{author}") + filename_fmt = "{id} {title}.{extension}" + archive_fmt = "{id}.{extension}" + pattern = BASE_PATTERN + r"/works/(\d+)" + example = "https://archiveofourown.org/works/12345" + + def _init(self): + formats = self.config("formats") + if formats is None: + self.formats = ("pdf",) + elif not formats: + self.formats = () + elif isinstance(formats, str): + self.formats = formats.lower().replace(" ", "").split(",") + else: + self.formats = formats + + self.cookies.set("view_adult", "true", domain="archiveofourown.org") + + def items(self): + self.login() + + work_id = self.groups[0] + url = "{}/works/{}".format(self.root, work_id) + response = self.request(url, notfound="work") + + if response.url.endswith("/users/login?restricted=true"): + raise exception.AuthorizationError( + "Login required to access member-only works") + page = response.text + if len(page) < 20000 and \ + '<h2 class="landmark heading">Adult Content Warning</' in page: + raise exception.StopExtraction("Adult Content") + + extr = text.extract_from(page) + + chapters = {} + cindex = extr(' id="chapter_index"', "</ul>") + for ch in text.extract_iter(cindex, ' value="', "</option>"): + cid, _, cname = ch.partition('">') + chapters[cid] = text.unescape(cname) + + fmts = {} + path = "" + download = extr(' class="download"', "</ul>") + for dl in text.extract_iter(download, ' href="', "</"): + path, _, type = dl.rpartition('">') + fmts[type.lower()] = path + + data = { + "id" 
: text.parse_int(work_id), + "rating" : text.split_html( + extr('<dd class="rating tags">', "</dd>")), + "warnings" : text.split_html( + extr('<dd class="warning tags">', "</dd>")), + "categories" : text.split_html( + extr('<dd class="category tags">', "</dd>")), + "fandom" : text.split_html( + extr('<dd class="fandom tags">', "</dd>")), + "relationships": text.split_html( + extr('<dd class="relationship tags">', "</dd>")), + "characters" : text.split_html( + extr('<dd class="character tags">', "</dd>")), + "tags" : text.split_html( + extr('<dd class="freeform tags">', "</dd>")), + "lang" : extr('<dd class="language" lang="', '"'), + "series" : extr('<dd class="series">', "</dd>"), + "date" : text.parse_datetime( + extr('<dd class="published">', "<"), "%Y-%m-%d"), + "date_completed": text.parse_datetime( + extr('>Completed:</dt><dd class="status">', "<"), "%Y-%m-%d"), + "date_updated" : text.parse_timestamp( + path.rpartition("updated_at=")[2]), + "words" : text.parse_int( + extr('<dd class="words">', "<").replace(",", "")), + "chapters" : chapters, + "comments" : text.parse_int( + extr('<dd class="comments">', "<").replace(",", "")), + "likes" : text.parse_int( + extr('<dd class="kudos">', "<").replace(",", "")), + "bookmarks" : text.parse_int(text.remove_html( + extr('<dd class="bookmarks">', "</dd>")).replace(",", "")), + "views" : text.parse_int( + extr('<dd class="hits">', "<").replace(",", "")), + "title" : text.unescape(text.remove_html( + extr(' class="title heading">', "</h2>")).strip()), + "author" : text.unescape(text.remove_html( + extr(' class="byline heading">', "</h3>"))), + "summary" : text.split_html( + extr(' class="heading">Summary:</h3>', "</div>")), + } + data["language"] = util.code_to_language(data["lang"]) + + series = data["series"] + if series: + extr = text.extract_from(series) + data["series"] = { + "prev" : extr(' class="previous" href="/works/', '"'), + "index": extr(' class="position">Part ', " "), + "id" : extr(' href="/series/', '"'), + "name" : text.unescape(extr(">", "<")), + "next" : extr(' class="next" href="/works/', '"'), + } + else: + data["series"] = None + + yield Message.Directory, data + for fmt in self.formats: + try: + url = text.urljoin(self.root, fmts[fmt]) + except KeyError: + self.log.warning("%s: Format '%s' not available", work_id, fmt) + else: + yield Message.Url, url, text.nameext_from_url(url, data) + + +class Ao3SeriesExtractor(Ao3Extractor): + """Extractor for AO3 works of a series""" + subcategory = "series" + pattern = BASE_PATTERN + r"(/series/(\d+))" + example = "https://archiveofourown.org/series/12345" + + +class Ao3TagExtractor(Ao3Extractor): + """Extractor for AO3 works by tag""" + subcategory = "tag" + pattern = BASE_PATTERN + r"(/tags/([^/?#]+)/works(?:/?\?.+)?)" + example = "https://archiveofourown.org/tags/TAG/works" + + +class Ao3SearchExtractor(Ao3Extractor): + """Extractor for AO3 search results""" + subcategory = "search" + pattern = BASE_PATTERN + r"(/works/search/?\?.+)" + example = "https://archiveofourown.org/works/search?work_search[query]=air" + + +class Ao3UserExtractor(Ao3Extractor): + """Extractor for an AO3 user profile""" + subcategory = "user" + pattern = (BASE_PATTERN + r"/users/([^/?#]+(?:/pseuds/[^/?#]+)?)" + r"(?:/profile)?/?(?:$|\?|#)") + example = "https://archiveofourown.org/users/USER" + + def initialize(self): + pass + + def items(self): + base = "{}/users/{}/".format(self.root, self.groups[0]) + return self._dispatch_extractors(( + (Ao3UserWorksExtractor , base + "works"), + 
(Ao3UserSeriesExtractor , base + "series"), + (Ao3UserBookmarkExtractor, base + "bookmarks"), + ), ("user-works", "user-series")) + + +class Ao3UserWorksExtractor(Ao3Extractor): + """Extractor for works of an AO3 user""" + subcategory = "user-works" + pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?" + r"works(?:/?\?.+)?)") + example = "https://archiveofourown.org/users/USER/works" + + +class Ao3UserSeriesExtractor(Ao3Extractor): + """Extractor for series of an AO3 user""" + subcategory = "user-series" + pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?" + r"series(?:/?\?.+)?)") + example = "https://archiveofourown.org/users/USER/series" + + def items(self): + self.login() + + base = self.root + "/series/" + data = {"_extractor": Ao3SeriesExtractor} + + for series_id in self.series(): + yield Message.Queue, base + series_id, data + + def series(self): + return self._pagination(self.groups[0], '<li id="series_') + + +class Ao3UserBookmarkExtractor(Ao3Extractor): + """Extractor for bookmarked works of an AO3 user""" + subcategory = "user-bookmark" + pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?" + r"bookmarks(?:/?\?.+)?)") + example = "https://archiveofourown.org/users/USER/bookmarks" + + def items(self): + self.login() + + base = self.root + "/" + data_work = {"_extractor": Ao3WorkExtractor} + data_series = {"_extractor": Ao3SeriesExtractor} + + for item in self._pagination( + self.groups[0], '<span class="count"><a href="/'): + path = item.rpartition("/")[0] + url = base + path + if item.startswith("works/"): + yield Message.Queue, url, data_work + elif item.startswith("series/"): + yield Message.Queue, url, data_series + else: + self.log.warning("Unsupported bookmark type '%s'", path) diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py index c97bf65..39c5635 100644 --- a/gallery_dl/extractor/bluesky.py +++ b/gallery_dl/extractor/bluesky.py @@ -41,6 +41,7 @@ class BlueskyExtractor(Extractor): self.api = BlueskyAPI(self) self._user = self._user_did = None self.instance = self.root.partition("://")[2] + self.videos = self.config("videos", True) def items(self): for post in self.posts(): @@ -55,14 +56,6 @@ class BlueskyExtractor(Extractor): post.update(post["record"]) del post["record"] - images = () - if "embed" in post: - media = post["embed"] - if "media" in media: - media = media["media"] - if "images" in media: - images = media["images"] - if self._metadata_facets: if "facets" in post: post["hashtags"] = tags = [] @@ -82,45 +75,66 @@ class BlueskyExtractor(Extractor): if self._metadata_user: post["user"] = self._user or post["author"] + files = self._extract_files(post) post["instance"] = self.instance post["post_id"] = pid - post["count"] = len(images) + post["count"] = len(files) post["date"] = text.parse_datetime( post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S") yield Message.Directory, post - if not images: + if not files: continue base = ("https://bsky.social/xrpc/com.atproto.sync.getBlob" "?did={}&cid=".format(post["author"]["did"])) - post["num"] = 0 - - for file in images: - post["num"] += 1 - post["description"] = file["alt"] - - try: - aspect = file["aspectRatio"] - post["width"] = aspect["width"] - post["height"] = aspect["height"] - except KeyError: - post["width"] = post["height"] = 0 - - image = file["image"] - try: - cid = image["ref"]["$link"] - except KeyError: - cid = image["cid"] - post["filename"] = cid - post["extension"] = image["mimeType"].rpartition("/")[2] - - yield Message.Url, 
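All AO3 listing extractors above funnel through `_pagination`: scrape IDs out of `<li id="work_...">` markers, then follow the `rel="next"` link until it disappears. A self-contained sketch of that scraping loop, using a simplified stand-in for `text.extract_iter` and an inline HTML sample:

```python
def extract_iter(page, begin, end):
    """Yield substrings between 'begin' and 'end' (simplified text.extract_iter)."""
    pos = 0
    while True:
        start = page.find(begin, pos)
        if start < 0:
            return
        start += len(begin)
        stop = page.find(end, start)
        if stop < 0:
            return
        yield page[start:stop]
        pos = stop + len(end)

page = '''
<li id="work_111" class="work blurb">...</li>
<li id="work_222" class="work blurb">...</li>
<a rel="next" href="/tags/TAG/works?page=2">Next</a>
'''

work_ids = list(extract_iter(page, '<li id="work_', '"'))
next_path = next(extract_iter(page, '<a rel="next" href="', '"'), None)
print(work_ids)   # ['111', '222']
print(next_path)  # '/tags/TAG/works?page=2' -- loop ends when this is None
```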
diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py
index c97bf65..39c5635 100644
--- a/gallery_dl/extractor/bluesky.py
+++ b/gallery_dl/extractor/bluesky.py
@@ -41,6 +41,7 @@ class BlueskyExtractor(Extractor):
         self.api = BlueskyAPI(self)
         self._user = self._user_did = None
         self.instance = self.root.partition("://")[2]
+        self.videos = self.config("videos", True)

     def items(self):
         for post in self.posts():
@@ -55,14 +56,6 @@ class BlueskyExtractor(Extractor):
             post.update(post["record"])
             del post["record"]

-            images = ()
-            if "embed" in post:
-                media = post["embed"]
-                if "media" in media:
-                    media = media["media"]
-                if "images" in media:
-                    images = media["images"]
-
             if self._metadata_facets:
                 if "facets" in post:
                     post["hashtags"] = tags = []
@@ -82,45 +75,66 @@ class BlueskyExtractor(Extractor):
             if self._metadata_user:
                 post["user"] = self._user or post["author"]

+            files = self._extract_files(post)
             post["instance"] = self.instance
             post["post_id"] = pid
-            post["count"] = len(images)
+            post["count"] = len(files)
             post["date"] = text.parse_datetime(
                 post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S")

             yield Message.Directory, post

-            if not images:
+            if not files:
                 continue

             base = ("https://bsky.social/xrpc/com.atproto.sync.getBlob"
                     "?did={}&cid=".format(post["author"]["did"]))
-            post["num"] = 0
-
-            for file in images:
-                post["num"] += 1
-                post["description"] = file["alt"]
-
-                try:
-                    aspect = file["aspectRatio"]
-                    post["width"] = aspect["width"]
-                    post["height"] = aspect["height"]
-                except KeyError:
-                    post["width"] = post["height"] = 0
-
-                image = file["image"]
-                try:
-                    cid = image["ref"]["$link"]
-                except KeyError:
-                    cid = image["cid"]
-                post["filename"] = cid
-                post["extension"] = image["mimeType"].rpartition("/")[2]
-
-                yield Message.Url, base + cid, post
+            for post["num"], file in enumerate(files, 1):
+                post.update(file)
+                yield Message.Url, base + file["filename"], post

     def posts(self):
         return ()

+    def _extract_files(self, post):
+        if "embed" not in post:
+            return ()
+
+        files = []
+        media = post["embed"]
+        if "media" in media:
+            media = media["media"]
+
+        if "images" in media:
+            for image in media["images"]:
+                files.append(self._extract_media(image, "image"))
+        if "video" in media and self.videos:
+            files.append(self._extract_media(media, "video"))
+
+        return files
+
+    def _extract_media(self, media, key):
+        try:
+            aspect = media["aspectRatio"]
+            width = aspect["width"]
+            height = aspect["height"]
+        except KeyError:
+            width = height = 0
+
+        data = media[key]
+        try:
+            cid = data["ref"]["$link"]
+        except KeyError:
+            cid = data["cid"]
+
+        return {
+            "description": media.get("alt") or "",
+            "width"      : width,
+            "height"     : height,
+            "filename"   : cid,
+            "extension"  : data["mimeType"].rpartition("/")[2],
+        }
+
     def _make_post(self, actor, kind):
         did = self.api._did_from_actor(actor)
         profile = self.api.get_profile(did)
diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py
index ef5a44c..102945b 100644
--- a/gallery_dl/extractor/chevereto.py
+++ b/gallery_dl/extractor/chevereto.py
@@ -35,7 +35,7 @@ class CheveretoExtractor(BaseExtractor):

 BASE_PATTERN = CheveretoExtractor.update({
     "jpgfish": {
-        "root": "https://jpg4.su",
+        "root": "https://jpg5.su",
        "pattern": r"jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)",
     },
     "imgkiwi": {
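The bluesky refactor above moves file handling into `_extract_files`/`_extract_media`, which resolve a blob CID from either a `ref.$link` lexicon reference or a legacy bare `cid` field. A minimal sketch of that resolution with an illustrative embed record (not a real API response):

```python
# Sketch: resolve a Bluesky blob CID the way _extract_media does above.
def extract_media(media, key):
    aspect = media.get("aspectRatio") or {}
    data = media[key]
    try:
        cid = data["ref"]["$link"]   # lexicon blob reference ...
    except KeyError:
        cid = data["cid"]            # ... or legacy bare CID
    return {
        "description": media.get("alt") or "",
        "width"      : aspect.get("width", 0),
        "height"     : aspect.get("height", 0),
        "filename"   : cid,
        "extension"  : data["mimeType"].rpartition("/")[2],
    }

embed = {  # illustrative sample record
    "alt": "a cat",
    "aspectRatio": {"width": 640, "height": 480},
    "image": {"ref": {"$link": "bafyreiexamplecid"}, "mimeType": "image/jpeg"},
}
print(extract_media(embed, "image"))
# {'description': 'a cat', 'width': 640, 'height': 480,
#  'filename': 'bafyreiexamplecid', 'extension': 'jpeg'}
```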
diff --git a/gallery_dl/extractor/civitai.py b/gallery_dl/extractor/civitai.py
new file mode 100644
index 0000000..3e657d6
--- /dev/null
+++ b/gallery_dl/extractor/civitai.py
@@ -0,0 +1,490 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.civitai.com/"""
+
+from .common import Extractor, Message
+from .. import text, util
+import itertools
+import time
+
+BASE_PATTERN = r"(?:https?://)?civitai\.com"
+USER_PATTERN = BASE_PATTERN + r"/user/([^/?#]+)"
+
+
+class CivitaiExtractor(Extractor):
+    """Base class for civitai extractors"""
+    category = "civitai"
+    root = "https://civitai.com"
+    directory_fmt = ("{category}", "{username|user[username]}", "images")
+    filename_fmt = "{id}.{extension}"
+    archive_fmt = "{hash}"
+    request_interval = (0.5, 1.5)
+
+    def _init(self):
+        if self.config("api") == "trpc":
+            self.log.debug("Using tRPC API")
+            self.api = CivitaiTrpcAPI(self)
+        else:
+            self.log.debug("Using REST API")
+            self.api = CivitaiRestAPI(self)
+
+        quality = self.config("quality")
+        if quality:
+            if not isinstance(quality, str):
+                quality = ",".join(quality)
+            self._image_quality = quality
+            self._image_ext = ("png" if quality == "original=true" else "jpg")
+        else:
+            self._image_quality = "original=true"
+            self._image_ext = "png"
+
+    def items(self):
+        models = self.models()
+        if models:
+            data = {"_extractor": CivitaiModelExtractor}
+            for model in models:
+                url = "{}/models/{}".format(self.root, model["id"])
+                yield Message.Queue, url, data
+            return
+
+        images = self.images()
+        if images:
+            for image in images:
+                url = self._url(image)
+                image["date"] = text.parse_datetime(
+                    image["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
+                text.nameext_from_url(url, image)
+                image["extension"] = self._image_ext
+                yield Message.Directory, image
+                yield Message.Url, url, image
+            return
+
+    def models(self):
+        return ()
+
+    def images(self):
+        return ()
+
+    def _url(self, image):
+        url = image["url"]
+        if "/" in url:
+            parts = url.rsplit("/", 2)
+            parts[1] = self._image_quality
+            return "/".join(parts)
+
+        name = image.get("name")
+        if not name:
+            mime = image.get("mimeType") or self._image_ext
+            name = "{}.{}".format(image.get("id"), mime.rpartition("/")[2])
+        return (
+            "https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7WA/{}/{}/{}".format(
+                url, self._image_quality, name)
+        )
+
+
+class CivitaiModelExtractor(CivitaiExtractor):
+    subcategory = "model"
+    directory_fmt = ("{category}", "{user[username]}",
+                     "{model[id]}{model[name]:? //}",
+                     "{version[id]}{version[name]:? //}")
+    filename_fmt = "{filename}.{extension}"
+    archive_fmt = "{file[hash]}"
+    pattern = BASE_PATTERN + r"/models/(\d+)(?:/?\?modelVersionId=(\d+))?"
+    example = "https://civitai.com/models/12345/TITLE"
+
+    def items(self):
+        model_id, version_id = self.groups
+        model = self.api.model(model_id)
+
+        if "user" in model:
+            user = model["user"]
+            del model["user"]
+        else:
+            user = model["creator"]
+            del model["creator"]
+        versions = model["modelVersions"]
+        del model["modelVersions"]
+
+        if version_id:
+            version_id = int(version_id)
+            for version in versions:
+                if version["id"] == version_id:
+                    break
+            else:
+                version = self.api.model_version(version_id)
+            versions = (version,)
+
+        for version in versions:
+            version["date"] = text.parse_datetime(
+                version["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
+
+            data = {
+                "model"  : model,
+                "version": version,
+                "user"   : user,
+            }
+
+            yield Message.Directory, data
+            for file in self._extract_files(model, version, user):
+                file.update(data)
+                yield Message.Url, file["url"], file
+
+    def _extract_files(self, model, version, user):
+        filetypes = self.config("files")
+        if filetypes is None:
+            return self._extract_files_image(model, version, user)
+
+        generators = {
+            "model"   : self._extract_files_model,
+            "image"   : self._extract_files_image,
+            "gallery" : self._extract_files_gallery,
+            "gallerie": self._extract_files_gallery,
+        }
+        if isinstance(filetypes, str):
+            filetypes = filetypes.split(",")
+
+        return itertools.chain.from_iterable(
+            generators[ft.rstrip("s")](model, version, user)
+            for ft in filetypes
+        )
+
+    def _extract_files_model(self, model, version, user):
+        return [
+            {
+                "num"      : num,
+                "file"     : file,
+                "filename" : file["name"],
+                "extension": "bin",
+                "url"      : file["downloadUrl"],
+                "_http_headers" : {
+                    "Authorization": self.api.headers.get("Authorization")},
+                "_http_validate": self._validate_file_model,
+            }
+            for num, file in enumerate(version["files"], 1)
+        ]
+
+    def _extract_files_image(self, model, version, user):
+        if "images" in version:
+            images = version["images"]
+        else:
+            params = {
+                "modelVersionId": version["id"],
+                "prioritizedUserIds": [user["id"]],
+                "period": "AllTime",
+                "sort": "Most Reactions",
+                "limit": 20,
+                "pending": True,
+            }
+            images = self.api.images(params, defaults=False)
+
+        return [
+            text.nameext_from_url(file["url"], {
+                "num" : num,
+                "file": file,
+                "url" : self._url(file),
+            })
+            for num, file in enumerate(images, 1)
+        ]
+
+    def _extract_files_gallery(self, model, version, user):
+        images = self.api.images_gallery(model, version, user)
+        for num, file in enumerate(images, 1):
+            yield text.nameext_from_url(file["url"], {
+                "num" : num,
+                "file": file,
+                "url" : self._url(file),
+            })
+
+    def _validate_file_model(self, response):
+        if response.headers.get("Content-Type", "").startswith("text/html"):
+            alert = text.extr(
+                response.text, 'mantine-Alert-message">', "</div></div></div>")
+            if alert:
+                msg = "\"{}\" - 'api-key' required".format(
+                    text.remove_html(alert))
+            else:
+                msg = "'api-key' required to download this file"
+            self.log.warning(msg)
+            return False
+        return True
+
+
+class CivitaiImageExtractor(CivitaiExtractor):
+    subcategory = "image"
+    pattern = BASE_PATTERN + r"/images/(\d+)"
+    example = "https://civitai.com/images/12345"
+
+    def images(self):
+        return self.api.image(self.groups[0])
+
+
+class CivitaiTagModelsExtractor(CivitaiExtractor):
+    subcategory = "tag-models"
+    pattern = BASE_PATTERN + r"/(?:tag/|models\?tag=)([^/?&#]+)"
+    example = "https://civitai.com/tag/TAG"
+
+    def models(self):
+        tag = text.unquote(self.groups[0])
+        return self.api.models({"tag": tag})
+
+
+class CivitaiTagImagesExtractor(CivitaiExtractor):
+    subcategory = "tag-images"
+    pattern = BASE_PATTERN + r"/images\?tags=([^&#]+)"
+    example = "https://civitai.com/images?tags=12345"
+
+    def images(self):
+        tag = text.unquote(self.groups[0])
+        return self.api.images({"tag": tag})
+
+
+class CivitaiSearchExtractor(CivitaiExtractor):
+    subcategory = "search"
+    pattern = BASE_PATTERN + r"/search/models\?([^#]+)"
+    example = "https://civitai.com/search/models?query=QUERY"
+
+    def models(self):
+        params = text.parse_query(self.groups[0])
+        return self.api.models(params)
+
+
+class CivitaiUserExtractor(CivitaiExtractor):
+    subcategory = "user"
+    pattern = USER_PATTERN + r"/?(?:$|\?|#)"
+    example = "https://civitai.com/user/USER"
+
+    def initialize(self):
+        pass
+
+    def items(self):
+        base = "{}/user/{}/".format(self.root, self.groups[0])
+        return self._dispatch_extractors((
+            (CivitaiUserModelsExtractor, base + "models"),
+            (CivitaiUserImagesExtractor, base + "images"),
+        ), ("user-models", "user-images"))
+
+
+class CivitaiUserModelsExtractor(CivitaiExtractor):
+    subcategory = "user-models"
+    pattern = USER_PATTERN + r"/models/?(?:\?([^#]+))?"
+    example = "https://civitai.com/user/USER/models"
+
+    def models(self):
+        params = text.parse_query(self.groups[1])
+        params["username"] = text.unquote(self.groups[0])
+        return self.api.models(params)
+
+
+class CivitaiUserImagesExtractor(CivitaiExtractor):
+    subcategory = "user-images"
+    pattern = USER_PATTERN + r"/images/?(?:\?([^#]+))?"
+    example = "https://civitai.com/user/USER/images"
+
+    def images(self):
+        params = text.parse_query(self.groups[1])
+        params["username"] = text.unquote(self.groups[0])
+        return self.api.images(params)
+
+
+class CivitaiRestAPI():
+    """Interface for the Civitai Public REST API

+    https://developer.civitai.com/docs/api/public-rest
+    """
+
+    def __init__(self, extractor):
+        self.extractor = extractor
+        self.root = extractor.root + "/api"
+        self.headers = {"Content-Type": "application/json"}
+
+        api_key = extractor.config("api-key")
+        if api_key:
+            extractor.log.debug("Using api_key authentication")
+            self.headers["Authorization"] = "Bearer " + api_key
+
+        nsfw = extractor.config("nsfw")
+        if nsfw is None or nsfw is True:
+            nsfw = "X"
+        elif not nsfw:
+            nsfw = "Safe"
+        self.nsfw = nsfw
+
+    def image(self, image_id):
+        return self.images({
+            "imageId": image_id,
+        })
+
+    def images(self, params):
+        endpoint = "/v1/images"
+        if "nsfw" not in params:
+            params["nsfw"] = self.nsfw
+        return self._pagination(endpoint, params)
+
+    def images_gallery(self, model, version, user):
+        return self.images({
+            "modelId"       : model["id"],
+            "modelVersionId": version["id"],
+        })
+
+    def model(self, model_id):
+        endpoint = "/v1/models/{}".format(model_id)
+        return self._call(endpoint)
+
+    def model_version(self, model_version_id):
+        endpoint = "/v1/model-versions/{}".format(model_version_id)
+        return self._call(endpoint)
+
+    def models(self, params):
+        return self._pagination("/v1/models", params)
+
+    def _call(self, endpoint, params=None):
+        if endpoint[0] == "/":
+            url = self.root + endpoint
+        else:
+            url = endpoint
+
+        response = self.extractor.request(
+            url, params=params, headers=self.headers)
+        return response.json()
+
+    def _pagination(self, endpoint, params):
+        while True:
+            data = self._call(endpoint, params)
+            yield from data["items"]
+
+            try:
+                endpoint = data["metadata"]["nextPage"]
+            except KeyError:
+                return
+            params = None
+
+
+class CivitaiTrpcAPI():
+    """Interface for the Civitai TRPC API"""
+
+    def __init__(self, extractor):
+        self.extractor = extractor
+        self.root = extractor.root + "/api/trpc/"
+        self.headers = {
+            "content-type"    : "application/json",
+            "x-client-version": "5.0.94",
+            "x-client-date"   : "",
+            "x-client"        : "web",
+            "x-fingerprint"   : "undefined",
+        }
+        api_key = extractor.config("api-key")
+        if api_key:
+            extractor.log.debug("Using api_key authentication")
+            self.headers["Authorization"] = "Bearer " + api_key
+
+        nsfw = extractor.config("nsfw")
+        if nsfw is None or nsfw is True:
+            nsfw = 31
+        elif not nsfw:
+            nsfw = 1
+        self.nsfw = nsfw
+
+    def image(self, image_id):
+        endpoint = "image.get"
+        params = {"id": int(image_id)}
+        return (self._call(endpoint, params),)
+
+    def images(self, params, defaults=True):
+        endpoint = "image.getInfinite"
+
+        if defaults:
+            params_ = {
+                "useIndex"     : True,
+                "period"       : "AllTime",
+                "sort"         : "Newest",
+                "types"        : ["image"],
+                "withMeta"     : False,  # Metadata Only
+                "fromPlatform" : False,  # Made On-Site
+                "browsingLevel": self.nsfw,
+                "include"      : ["cosmetics"],
+            }
+            params_.update(params)
+        else:
+            params_ = params
+
+        return self._pagination(endpoint, params_)
+
+    def images_gallery(self, model, version, user):
+        endpoint = "image.getImagesAsPostsInfinite"
+        params = {
+            "period"        : "AllTime",
+            "sort"          : "Newest",
+            "modelVersionId": version["id"],
+            "modelId"       : model["id"],
+            "hidden"        : False,
+            "limit"         : 50,
+            "browsingLevel" : self.nsfw,
+        }
+
+        for post in self._pagination(endpoint, params):
+            yield from post["images"]
+
+    def model(self, model_id):
+        endpoint = "model.getById"
+        params = {"id": int(model_id)}
+        return self._call(endpoint, params)
+
+    def model_version(self, model_version_id):
+        endpoint = "modelVersion.getById"
+        params = {"id": int(model_version_id)}
+        return self._call(endpoint, params)
+
+    def models(self, params, defaults=True):
+        endpoint = "model.getAll"
+
+        if defaults:
+            params_ = {
+                "period"            : "AllTime",
+                "periodMode"        : "published",
+                "sort"              : "Newest",
+                "pending"           : False,
+                "hidden"            : False,
+                "followed"          : False,
+                "earlyAccess"       : False,
+                "fromPlatform"      : False,
+                "supportsGeneration": False,
+                "browsingLevel"     : self.nsfw,
+            }
+            params_.update(params)
+        else:
+            params_ = params
+
+        return self._pagination(endpoint, params_)
+
+    def user(self, username):
+        endpoint = "user.getCreator"
+        params = {"username": username}
+        return (self._call(endpoint, params),)
+
+    def _call(self, endpoint, params):
+        url = self.root + endpoint
+        headers = self.headers
+        params = {"input": util.json_dumps({"json": params})}
+
+        headers["x-client-date"] = str(int(time.time() * 1000))
+        response = self.extractor.request(url, headers=headers, params=params)
+
+        return response.json()["result"]["data"]["json"]
+
+    def _pagination(self, endpoint, params):
+        while True:
+            data = self._call(endpoint, params)
+            yield from data["items"]
+
+            try:
+                if not data["nextCursor"]:
+                    return
+                params["cursor"] = data["nextCursor"]
+            except KeyError:
+                return
diff --git a/gallery_dl/extractor/cohost.py b/gallery_dl/extractor/cohost.py
new file mode 100644
index 0000000..e1f6040
--- /dev/null
+++ b/gallery_dl/extractor/cohost.py
@@ -0,0 +1,223 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://cohost.org/"""
+
+from .common import Extractor, Message
+from .. import text, util
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?cohost\.org"
+
+
+class CohostExtractor(Extractor):
+    """Base class for cohost extractors"""
+    category = "cohost"
+    root = "https://cohost.org"
+    directory_fmt = ("{category}", "{postingProject[handle]}")
+    filename_fmt = ("{postId}_{headline|plainTextBody:?/_/[:100]}"
+                    "{num}.{extension}")
+    archive_fmt = "{postId}_{num}"
+
+    def _init(self):
+        self.replies = self.config("replies", True)
+        self.pinned = self.config("pinned", False)
+        self.shares = self.config("shares", False)
+        self.asks = self.config("asks", True)
+
+    def items(self):
+        for post in self.posts():
+            reason = post.get("limitedVisibilityReason")
+            if reason and reason != "none":
+                if reason == "log-in-first":
+                    reason = ("This page's posts are visible only to users "
+                              "who are logged in.")
+                self.log.warning('%s: "%s"', post["postId"], reason)
+
+            files = self._extract_files(post)
+            post["count"] = len(files)
+            post["date"] = text.parse_datetime(
+                post["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
+
+            yield Message.Directory, post
+            for post["num"], file in enumerate(files, 1):
+                url = file["fileURL"]
+                post.update(file)
+                text.nameext_from_url(url, post)
+                yield Message.Url, url, post
+
+    def posts(self):
+        return ()
+
+    def _request_api(self, endpoint, input):
+        url = "{}/api/v1/trpc/{}".format(self.root, endpoint)
+        params = {"batch": "1", "input": util.json_dumps({"0": input})}
+        headers = {"content-type": "application/json"}
+
+        data = self.request(url, params=params, headers=headers).json()
+        return data[0]["result"]["data"]
+
+    def _extract_files(self, post):
+        files = []
+
+        self._extract_blocks(post, files)
+        if self.shares and post.get("shareTree"):
+            for share in post["shareTree"]:
+                self._extract_blocks(share, files, share)
+        del post["shareTree"]
+
+        return files
+
+    def _extract_blocks(self, post, files, shared=None):
+        post["content"] = content = []
+
+        for block in post.pop("blocks") or ():
+            try:
+                type = block["type"]
+                if type == "attachment":
+                    file = block["attachment"].copy()
+                    file["shared"] = shared
+                    files.append(file)
+                elif type == "attachment-row":
+                    for att in block["attachments"]:
+                        file = att["attachment"].copy()
+                        file["shared"] = shared
+                        files.append(file)
+                elif type == "markdown":
+                    content.append(block["markdown"]["content"])
+                elif type == "ask":
+                    post["ask"] = block["ask"]
+                else:
+                    self.log.debug("%s: Unsupported block type '%s'",
+                                   post["postId"], type)
+            except Exception as exc:
+                self.log.debug("%s: %s", exc.__class__.__name__, exc)
+
+
+class CohostUserExtractor(CohostExtractor):
+    """Extractor for media from a cohost user"""
+    subcategory = "user"
+    pattern = BASE_PATTERN + r"/([^/?#]+)/?(?:$|\?|#)"
+    example = "https://cohost.org/USER"
+
+    def posts(self):
+        empty = 0
+        params = {
+            "projectHandle": self.groups[0],
+            "page": 0,
+            "options": {
+                "pinnedPostsAtTop"    : bool(self.pinned),
+                "hideReplies"         : not self.replies,
+                "hideShares"          : not self.shares,
+                "hideAsks"            : not self.asks,
+                "viewingOnProjectPage": True,
+            },
+        }
+
+        while True:
+            data = self._request_api("posts.profilePosts", params)
+
+            posts = data["posts"]
+            if posts:
+                empty = 0
+                yield from posts
+            else:
+                empty += 1
+
+            pagination = data["pagination"]
+            if not pagination.get("morePagesForward"):
+                return
+            if empty >= 3:
+                return self.log.debug("Empty API results")
+            params["page"] = pagination["nextPage"]
+
+
+class CohostPostExtractor(CohostExtractor):
+    """Extractor for media from a single cohost post"""
+    subcategory = "post"
+    pattern = BASE_PATTERN + r"/([^/?#]+)/post/(\d+)"
+    example = "https://cohost.org/USER/post/12345"
+
+    def posts(self):
+        endpoint = "posts.singlePost"
+        params = {
+            "handle": self.groups[0],
+            "postId": int(self.groups[1]),
+        }
+
+        data = self._request_api(endpoint, params)
+        post = data["post"]
+
+        try:
+            post["comments"] = data["comments"][self.groups[1]]
+        except LookupError:
+            post["comments"] = ()
+
+        return (post,)
+
+
+class CohostTagExtractor(CohostExtractor):
+    """Extractor for tagged posts"""
+    subcategory = "tag"
+    pattern = BASE_PATTERN + r"/([^/?#]+)/tagged/([^/?#]+)(?:\?([^#]+))?"
+    example = "https://cohost.org/USER/tagged/TAG"
+
+    def posts(self):
+        user, tag, query = self.groups
+        url = "{}/{}/tagged/{}".format(self.root, user, tag)
+        params = text.parse_query(query)
+        post_feed_key = ("tagged-post-feed" if user == "rc" else
+                         "project-tagged-post-feed")
+
+        while True:
+            page = self.request(url, params=params).text
+            data = util.json_loads(text.extr(
+                page, 'id="__COHOST_LOADER_STATE__">', '</script>'))
+
+            try:
+                feed = data[post_feed_key]
+            except KeyError:
+                feed = data.popitem()[1]
+
+            yield from feed["posts"]
+
+            pagination = feed["paginationMode"]
+            if not pagination.get("morePagesForward"):
+                return
+            params["refTimestamp"] = pagination["refTimestamp"]
+            params["skipPosts"] = \
+                pagination["currentSkip"] + pagination["idealPageStride"]
+
+
+class CohostLikesExtractor(CohostExtractor):
+    """Extractor for liked posts"""
+    subcategory = "likes"
+    pattern = BASE_PATTERN + r"/rc/liked-posts"
+    example = "https://cohost.org/rc/liked-posts"
+
+    def posts(self):
+        url = "{}/rc/liked-posts".format(self.root)
+        params = {}
+
+        while True:
+            page = self.request(url, params=params).text
+            data = util.json_loads(text.extr(
+                page, 'id="__COHOST_LOADER_STATE__">', '</script>'))
+
+            try:
+                feed = data["liked-posts-feed"]
+            except KeyError:
+                feed = data.popitem()[1]
+
+            yield from feed["posts"]
+
+            pagination = feed["paginationMode"]
+            if not pagination.get("morePagesForward"):
+                return
+            params["refTimestamp"] = pagination["refTimestamp"]
+            params["skipPosts"] = \
+                pagination["currentSkip"] + pagination["idealPageStride"]
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index df70571..32c8e67 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -15,6 +15,7 @@ import sys
 import time
 import netrc
 import queue
+import random
 import getpass
 import logging
 import datetime
@@ -37,6 +38,7 @@ class Extractor():
     archive_fmt = ""
     root = ""
     cookies_domain = ""
+    cookies_index = 0
     referer = True
     ciphers = None
     tls12 = True
@@ -196,6 +198,10 @@ class Extractor():
                 server = response.headers.get("Server")
                 if server and server.startswith("cloudflare") and \
                         code in (403, 503):
+                    mitigated = response.headers.get("cf-mitigated")
+                    if mitigated and mitigated.lower() == "challenge":
+                        self.log.warning("Cloudflare challenge")
+                        break
                     content = response.content
                     if b"_cf_chl_opt" in content or b"jschl-answer" in content:
                         self.log.warning("Cloudflare challenge")
@@ -439,45 +445,55 @@ class Extractor():
         cookies = self.config("cookies")
         if cookies:
-            if isinstance(cookies, dict):
-                self.cookies_update_dict(cookies, self.cookies_domain)
+            select = self.config("cookies-select")
+            if select:
+                if select == "rotate":
+                    cookies = cookies[self.cookies_index % len(cookies)]
+                    Extractor.cookies_index += 1
+                else:
+                    cookies = random.choice(cookies)
+            self.cookies_load(cookies)
+
+    def cookies_load(self, cookies):
+        if isinstance(cookies, dict):
+            self.cookies_update_dict(cookies, self.cookies_domain)
+
+        elif isinstance(cookies, str):
+            path = util.expand_path(cookies)
+            try:
+                with open(path) as fp:
+                    util.cookiestxt_load(fp, self.cookies)
+            except Exception as exc:
+                self.log.warning("cookies: %s", exc)
+            else:
+                self.log.debug("Loading cookies from '%s'", cookies)
+                self.cookies_file = path

-            elif isinstance(cookies, str):
-                path = util.expand_path(cookies)
+        elif isinstance(cookies, (list, tuple)):
+            key = tuple(cookies)
+            cookiejar = _browser_cookies.get(key)
+
+            if cookiejar is None:
+                from ..cookies import load_cookies
+                cookiejar = self.cookies.__class__()
                 try:
-                    with open(path) as fp:
-                        util.cookiestxt_load(fp, self.cookies)
+                    load_cookies(cookiejar, cookies)
                 except Exception as exc:
                     self.log.warning("cookies: %s", exc)
                 else:
-                    self.log.debug("Loading cookies from '%s'", cookies)
-                    self.cookies_file = path
-
-            elif isinstance(cookies, (list, tuple)):
-                key = tuple(cookies)
-                cookiejar = _browser_cookies.get(key)
-
-                if cookiejar is None:
-                    from ..cookies import load_cookies
-                    cookiejar = self.cookies.__class__()
-                    try:
-                        load_cookies(cookiejar, cookies)
-                    except Exception as exc:
-                        self.log.warning("cookies: %s", exc)
-                    else:
-                        _browser_cookies[key] = cookiejar
-                else:
-                    self.log.debug("Using cached cookies from %s", key)
+                    _browser_cookies[key] = cookiejar
+            else:
+                self.log.debug("Using cached cookies from %s", key)

-                set_cookie = self.cookies.set_cookie
-                for cookie in cookiejar:
-                    set_cookie(cookie)
+            set_cookie = self.cookies.set_cookie
+            for cookie in cookiejar:
+                set_cookie(cookie)

-            else:
-                self.log.warning(
-                    "Expected 'dict', 'list', or 'str' value for 'cookies' "
-                    "option, got '%s' (%s)",
-                    cookies.__class__.__name__, cookies)
+        else:
+            self.log.warning(
+                "Expected 'dict', 'list', or 'str' value for 'cookies' "
+                "option, got '%s' (%s)",
+                cookies.__class__.__name__, cookies)

     def cookies_store(self):
         """Store the session's cookies in a cookies.txt file"""
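The new `cookies-select` option above chooses one entry from a list of cookie sources, either at random or by rotating a class-level index shared across extractor instances. A minimal sketch of the selection logic:

```python
import random

class Extractor:
    cookies_index = 0  # class-level, shared across instances

    def select_cookies(self, cookies, select):
        if select == "rotate":
            choice = cookies[self.cookies_index % len(cookies)]
            Extractor.cookies_index += 1  # advance for the next extractor
            return choice
        return random.choice(cookies)

extr = Extractor()
sources = ["cookies-a.txt", "cookies-b.txt", "cookies-c.txt"]
print([extr.select_cookies(sources, "rotate") for _ in range(4)])
# ['cookies-a.txt', 'cookies-b.txt', 'cookies-c.txt', 'cookies-a.txt']
```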
content["html"]["markup"] + if html.startswith("{"): + html = content["excerpt"].replace("\n", "<br />") + journal = {"html": html} elif "body" in deviation: journal = {"html": deviation.pop("body")} else: @@ -197,6 +215,18 @@ class DeviantartExtractor(Extractor): comment["_extractor"] = DeviantartAvatarExtractor yield Message.Queue, url, comment + if self.previews and "preview" in deviation: + preview = deviation["preview"] + deviation["is_preview"] = True + if self.previews_images: + yield self.commit(deviation, preview) + else: + mtype = mimetypes.guess_type( + "a." + deviation["extension"], False)[0] + if mtype and not mtype.startswith("image/"): + yield self.commit(deviation, preview) + del deviation["is_preview"] + if not self.extra: continue @@ -284,6 +314,9 @@ class DeviantartExtractor(Extractor): html = journal["html"] shadow = SHADOW_TEMPLATE.format_map(thumbs[0]) if thumbs else "" + if not html: + self.log.warning("%s: Empty journal content", deviation["index"]) + if "css" in journal: css, cls = journal["css"], "withskin" elif html.startswith("<style"): @@ -321,10 +354,11 @@ class DeviantartExtractor(Extractor): deviation["extension"] = "htm" return Message.Url, html, deviation - @staticmethod - def _commit_journal_text(deviation, journal): + def _commit_journal_text(self, deviation, journal): html = journal["html"] - if html.startswith("<style"): + if not html: + self.log.warning("%s: Empty journal content", deviation["index"]) + elif html.startswith("<style"): html = html.partition("</style>")[2] head, _, tail = html.rpartition("<script") content = "\n".join( diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index 1b4971c..6aefa11 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -37,11 +37,13 @@ class FlickrExtractor(Extractor): extract = self.api._extract_format for photo in self.photos(): try: + 1/0 photo = extract(photo) except Exception as exc: self.log.warning( - "Skipping %s (%s)", photo["id"], exc.__class__.__name__) - self.log.debug("", exc_info=True) + "Skipping photo %s (%s: %s)", + photo["id"], exc.__class__.__name__, exc) + self.log.debug("", exc_info=exc) else: photo.update(data) url = photo["url"] diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py index f3098f1..bff3156 100644 --- a/gallery_dl/extractor/inkbunny.py +++ b/gallery_dl/extractor/inkbunny.py @@ -132,6 +132,7 @@ class InkbunnyPoolExtractor(InkbunnyExtractor): class InkbunnyFavoriteExtractor(InkbunnyExtractor): """Extractor for inkbunny user favorites""" subcategory = "favorite" + directory_fmt = ("{category}", "{favs_username!l}", "Favorites") pattern = (BASE_PATTERN + r"/(?:" r"userfavorites_process\.php\?favs_user_id=(\d+)|" r"submissionsviewall\.php" @@ -151,7 +152,17 @@ class InkbunnyFavoriteExtractor(InkbunnyExtractor): self.orderby = params.get("orderby", "fav_datetime") def metadata(self): - return {"favs_user_id": self.user_id} + # Lookup fav user ID as username + url = "{}/userfavorites_process.php?favs_user_id={}".format( + self.root, self.user_id) + page = self.request(url).text + user_link = text.extr(page, '<a rel="author"', '</a>') + favs_username = text.extr(user_link, 'href="/', '"') + + return { + "favs_user_id": self.user_id, + "favs_username": favs_username, + } def posts(self): params = { diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 5fc0ce5..dfa1f6e 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -53,8 
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 5fc0ce5..dfa1f6e 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -53,8 +53,8 @@ class NewgroundsExtractor(Extractor):
             try:
                 post = self.extract_post(post_url)
                 url = post.get("url")
-            except Exception:
-                self.log.debug("", exc_info=True)
+            except Exception as exc:
+                self.log.debug("", exc_info=exc)
                 url = None

             if url:
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index 3479b88..c908e44 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -448,7 +448,8 @@ class PixivRankingExtractor(PixivExtractor):
                 self.log.warning("invalid date '%s'", date)
                 date = None
         if not date:
-            date = (datetime.utcnow() - timedelta(days=1)).strftime("%Y-%m-%d")
+            now = util.datetime_utcnow()
+            date = (now - timedelta(days=1)).strftime("%Y-%m-%d")
         self.date = date

         return {"ranking": {
@@ -887,7 +888,7 @@ class PixivAppAPI():
             "get_secure_url": "1",
         }

-        time = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S+00:00")
+        time = util.datetime_utcnow().strftime("%Y-%m-%dT%H:%M:%S+00:00")
         headers = {
             "X-Client-Time": time,
             "X-Client-Hash": hashlib.md5(
diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py
index 6ec44ba..07c9b21 100644
--- a/gallery_dl/extractor/skeb.py
+++ b/gallery_dl/extractor/skeb.py
@@ -26,7 +26,11 @@ class SkebExtractor(Extractor):
     def _init(self):
         self.thumbnails = self.config("thumbnails", False)
         self.article = self.config("article", False)
-        self.headers = {"Accept": "application/json, text/plain, */*"}
+        self.headers = {
+            "Accept": "application/json, text/plain, */*",
+            "sec-fetch-mode": "cors",
+            "sec-fetch-site": "same-origin",
+        }
         if "Authorization" not in self.session.headers:
             self.headers["Authorization"] = "Bearer null"
diff --git a/gallery_dl/extractor/weasyl.py b/gallery_dl/extractor/weasyl.py
index ddbfaa0..13b0520 100644
--- a/gallery_dl/extractor/weasyl.py
+++ b/gallery_dl/extractor/weasyl.py
@@ -159,24 +159,26 @@ class WeasylJournalsExtractor(WeasylExtractor):

 class WeasylFavoriteExtractor(WeasylExtractor):
     subcategory = "favorite"
-    directory_fmt = ("{category}", "{owner_login}", "Favorites")
-    pattern = BASE_PATTERN + r"favorites\?userid=(\d+)"
+    directory_fmt = ("{category}", "{user}", "Favorites")
+    pattern = BASE_PATTERN + r"favorites(?:\?userid=(\d+)|/([^/?#]+))"
     example = "https://www.weasyl.com/favorites?userid=12345"

-    def __init__(self, match):
-        WeasylExtractor.__init__(self, match)
-        self.userid = match.group(1)
-
     def items(self):
+        userid, username = self.groups
         owner_login = lastid = None
-        url = self.root + "/favorites"
+
+        if username:
+            owner_login = username
+            path = "/favorites/" + username
+        else:
+            path = "/favorites"

         params = {
-            "userid" : self.userid,
+            "userid" : userid,
             "feature": "submit",
         }

         while True:
-            page = self.request(url, params=params).text
+            page = self.request(self.root + path, params=params).text
             pos = page.index('id="favorites-content"')

             if not owner_login:
@@ -186,12 +188,16 @@ class WeasylFavoriteExtractor(WeasylExtractor):
             if submitid == lastid:
                 continue
             lastid = submitid
+
             submission = self.request_submission(submitid)
             if self.populate_submission(submission):
                 submission["user"] = owner_login
                 yield Message.Directory, submission
                 yield Message.Url, submission["url"], submission

-            if "&nextid=" not in page:
+            try:
+                pos = page.index('">Next (', pos)
+            except ValueError:
                 return
-            params["nextid"] = submitid
+            path = text.unescape(text.rextract(page, 'href="', '"', pos)[0])
+            params = None
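The pixiv hunks above (like the 8chan one earlier in this diff) route `datetime.utcnow()` through `util.datetime_utcnow()`, since `utcnow()` is deprecated as of Python 3.12. A sketch of such a compatibility shim, mirroring the version gate used in the text.py/util.py hunks at the end of this diff:

```python
import datetime
import sys
import time

if sys.hexversion < 0x30c0000:       # Python <= 3.11
    datetime_utcnow = datetime.datetime.utcnow
else:                                 # Python >= 3.12: utcnow() is deprecated
    def datetime_utcnow():
        """Naive UTC 'now' without a DeprecationWarning"""
        Y, m, d, H, M, S, _, _, _ = time.gmtime()
        return datetime.datetime(Y, m, d, H, M, S)

print(datetime_utcnow().strftime("%Y-%m-%dT%H:%M:%S+00:00"))
```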
diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py
index 7a62e01..116f557 100644
--- a/gallery_dl/extractor/wikimedia.py
+++ b/gallery_dl/extractor/wikimedia.py
@@ -10,7 +10,8 @@
 """Extractors for Wikimedia sites"""

 from .common import BaseExtractor, Message
-from .. import text
+from .. import text, exception
+from ..cache import cache


 class WikimediaExtractor(BaseExtractor):
@@ -39,7 +40,17 @@ class WikimediaExtractor(BaseExtractor):
             else:
                 self.api_url = api_path
         else:
-            self.api_url = self.root + "/api.php"
+            self.api_url = None
+
+    @cache(maxage=36500*86400, keyarg=1)
+    def _search_api_path(self, root):
+        self.log.debug("Probing possible API endpoints")
+        for path in ("/api.php", "/w/api.php", "/wiki/api.php"):
+            url = root + path
+            response = self.request(url, method="HEAD", fatal=None)
+            if response.status_code < 400:
+                return url
+        raise exception.StopExtraction("Unable to find API endpoint")

     @staticmethod
     def prepare(image):
@@ -76,6 +87,9 @@ class WikimediaExtractor(BaseExtractor):
         """
         url = self.api_url
+        if not url:
+            url = self._search_api_path(self.root)
+
         params["action"] = "query"
         params["format"] = "json"
         params["prop"] = "imageinfo"
@@ -139,14 +153,17 @@ BASE_PATTERN = WikimediaExtractor.update({
     "fandom": {
         "root": None,
         "pattern": r"[\w-]+\.fandom\.com",
+        "api-path": "/api.php",
     },
     "wikigg": {
         "root": None,
         "pattern": r"\w+\.wiki\.gg",
+        "api-path": "/api.php",
     },
     "mariowiki": {
         "root": "https://www.mariowiki.com",
         "pattern": r"(?:www\.)?mariowiki\.com",
+        "api-path": "/api.php",
     },
     "bulbapedia": {
         "root": "https://bulbapedia.bulbagarden.net",
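For wiki instances without a configured `api-path`, `_search_api_path` above probes the common MediaWiki endpoint locations with HEAD requests and caches the first hit for a very long time. A standalone sketch of that probe, assuming `requests`:

```python
import requests

def search_api_path(root):
    """Probe the usual MediaWiki API locations; return the first that answers."""
    for path in ("/api.php", "/w/api.php", "/wiki/api.php"):
        url = root + path
        response = requests.head(url, allow_redirects=True, timeout=30)
        if response.status_code < 400:
            return url
    raise RuntimeError("Unable to find API endpoint")

# e.g. search_api_path("https://www.mediawiki.org")
```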
diff --git a/gallery_dl/extractor/zzup.py b/gallery_dl/extractor/zzup.py
index 45b0cd8..05b12b4 100644
--- a/gallery_dl/extractor/zzup.py
+++ b/gallery_dl/extractor/zzup.py
@@ -4,6 +4,8 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

+"""Extractors for https://zzup.com/"""
+
 from .common import GalleryExtractor
 from .. import text

@@ -11,17 +13,20 @@ from .. import text
 class ZzupGalleryExtractor(GalleryExtractor):
     category = "zzup"
     directory_fmt = ("{category}", "{title}")
-    filename_fmt = "{slug}_{num:>03}.{extension}"
+    filename_fmt = "{num:>03}.{extension}"
     archive_fmt = "{slug}_{num}"
     root = "https://zzup.com"
-    pattern = (r"(?:https?://)?(?:www\.)?zzup\.com(/content"
+    pattern = (r"(?:https?://)?(up\.|www\.)?zzup\.com(/(?:viewalbum|content)"
                r"/[\w=]+/([^/?#]+)/[\w=]+)/(?:index|page-\d+)\.html")
     example = "https://zzup.com/content/xyz=/12345_TITLE/123=/index.html"

     def __init__(self, match):
-        url = "{}/{}/index.html".format(self.root, match.group(1))
+        subdomain, path, self.slug = match.groups()
+        if subdomain == "up.":
+            self.root = "https://up.zzup.com"
+            self.images = self.images_v2
+        url = "{}{}/index.html".format(self.root, path)
         GalleryExtractor.__init__(self, match, url)
-        self.slug = match.group(2)

     def metadata(self, page):
         return {
@@ -38,3 +43,20 @@ class ZzupGalleryExtractor(GalleryExtractor):
         p1, _, p2 = url.partition("/image0")
         ufmt = p1 + "/image{:>05}" + p2[4:]
         return [(ufmt.format(num), None) for num in range(1, count + 1)]
+
+    def images_v2(self, page):
+        results = []
+
+        while True:
+            for path in text.extract_iter(
+                    page, ' class="picbox"><a target="_blank" href="', '"'):
+                results.append(("{}/showimage/{}/zzup.com.jpg".format(
+                    self.root, "/".join(path.split("/")[2:-2])), None))
+
+            pos = page.find("glyphicon-arrow-right")
+            if pos < 0:
+                break
+            path = text.rextract(page, ' href="', '"', pos)[0]
+            page = self.request(text.urljoin(self.gallery_url, path)).text
+
+        return results
diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py
index f197e5d..e662c34 100644
--- a/gallery_dl/formatter.py
+++ b/gallery_dl/formatter.py
@@ -476,6 +476,7 @@ _GLOBALS = {
     "_env": lambda: os.environ,
     "_lit": lambda: _literal,
     "_now": datetime.datetime.now,
+    "_nul": lambda: util.NONE,
 }
 _CONVERSIONS = {
     "l": str.lower,
@@ -484,6 +485,7 @@ _CONVERSIONS = {
     "C": string.capwords,
     "j": util.json_dumps,
     "t": str.strip,
+    "L": len,
     "T": util.datetime_to_timestamp_string,
     "d": text.parse_timestamp,
     "U": text.unescape,
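The new `L` conversion above maps straight to the built-in `len`, so a format spec such as `{tags!L}` renders an element count. A toy illustration of applying such a conversion table (gallery-dl's real formatter compiles specs; this dispatch is simplified):

```python
import json
import string

CONVERSIONS = {
    "l": str.lower,
    "u": str.upper,
    "C": string.capwords,
    "j": json.dumps,
    "t": str.strip,
    "L": len,           # new: '{tags!L}' -> number of elements
}

def convert(value, conversion):
    return CONVERSIONS[conversion](value)

print(convert(["a", "b", "c"], "L"))   # 3
print(convert("Hello World", "l"))     # hello world
```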
" @@ -170,7 +171,7 @@ class Job(): "copy its output and report this issue on " "https://github.com/mikf/gallery-dl/issues ."), exc.__class__.__name__, exc) - log.debug("", exc_info=True) + log.debug("", exc_info=exc) self.status |= 1 except BaseException: self.status |= 1 @@ -641,7 +642,7 @@ class DownloadJob(Job): except Exception as exc: pp_log.error("'%s' initialization failed: %s: %s", name, exc.__class__.__name__, exc) - pp_log.debug("", exc_info=True) + pp_log.debug("", exc_info=exc) else: pp_list.append(pp_obj) diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 0189c0e..c4f5b94 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -131,12 +131,17 @@ class UgoiraAction(argparse.Action): "[a] palettegen [p];[b][p] paletteuse"), "repeat-last-frame": False, } - elif value in ("mkv", "copy"): + elif value == "mkv" or value == "copy": pp = { "extension" : "mkv", "ffmpeg-args" : ("-c:v", "copy"), "repeat-last-frame": False, } + elif value == "zip" or value == "archive": + pp = { + "mode" : "archive", + } + namespace.options.append(((), "ugoira", "original")) else: parser.error("Unsupported Ugoira format '{}'".format(value)) @@ -344,7 +349,7 @@ def build_parser(): ) output.add_argument( "--list-extractors", - dest="list_extractors", action="store_true", + dest="list_extractors", metavar="CATEGORIES", nargs="*", help=("Print a list of extractor classes " "with description, (sub)category and example URL"), ) @@ -693,7 +698,7 @@ def build_parser(): dest="postprocessors", metavar="FMT", action=UgoiraAction, help=("Convert Pixiv Ugoira to FMT using FFmpeg. " "Supported formats are 'webm', 'mp4', 'gif', " - "'vp8', 'vp9', 'vp9-lossless', 'copy'."), + "'vp8', 'vp9', 'vp9-lossless', 'copy', 'zip'."), ) postprocessor.add_argument( "--ugoira-conv", diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py index f053afa..87a0ba6 100644 --- a/gallery_dl/postprocessor/ugoira.py +++ b/gallery_dl/postprocessor/ugoira.py @@ -29,12 +29,12 @@ class UgoiraPP(PostProcessor): def __init__(self, job, options): PostProcessor.__init__(self, job) - self.extension = options.get("extension") or "webm" self.args = options.get("ffmpeg-args") or () self.twopass = options.get("ffmpeg-twopass", False) self.output = options.get("ffmpeg-output", "error") self.delete = not options.get("keep-files", False) self.repeat = options.get("repeat-last-frame", True) + self.metadata = options.get("metadata", True) self.mtime = options.get("mtime", True) self.skip = options.get("skip", True) self.uniform = self._convert_zip = self._convert_files = False @@ -45,24 +45,31 @@ class UgoiraPP(PostProcessor): mkvmerge = options.get("mkvmerge-location") self.mkvmerge = util.expand_path(mkvmerge) if mkvmerge else "mkvmerge" - demuxer = options.get("ffmpeg-demuxer") - if demuxer is None or demuxer == "auto": - if self.extension in ("webm", "mkv") and ( + ext = options.get("extension") + mode = options.get("mode") or options.get("ffmpeg-demuxer") + if mode is None or mode == "auto": + if ext in (None, "webm", "mkv") and ( mkvmerge or shutil.which("mkvmerge")): - demuxer = "mkvmerge" + mode = "mkvmerge" else: - demuxer = "concat" + mode = "concat" - if demuxer == "mkvmerge": + if mode == "mkvmerge": self._process = self._process_mkvmerge self._finalize = self._finalize_mkvmerge - elif demuxer == "image2": + elif mode == "image2": self._process = self._process_image2 self._finalize = None + elif mode == "archive": + if ext is None: + ext = "zip" + self._convert_impl = self.convert_to_archive + 
diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py
index f053afa..87a0ba6 100644
--- a/gallery_dl/postprocessor/ugoira.py
+++ b/gallery_dl/postprocessor/ugoira.py
@@ -29,12 +29,12 @@ class UgoiraPP(PostProcessor):
 
     def __init__(self, job, options):
         PostProcessor.__init__(self, job)
-        self.extension = options.get("extension") or "webm"
         self.args = options.get("ffmpeg-args") or ()
         self.twopass = options.get("ffmpeg-twopass", False)
         self.output = options.get("ffmpeg-output", "error")
         self.delete = not options.get("keep-files", False)
         self.repeat = options.get("repeat-last-frame", True)
+        self.metadata = options.get("metadata", True)
         self.mtime = options.get("mtime", True)
         self.skip = options.get("skip", True)
         self.uniform = self._convert_zip = self._convert_files = False
@@ -45,24 +45,31 @@
         mkvmerge = options.get("mkvmerge-location")
         self.mkvmerge = util.expand_path(mkvmerge) if mkvmerge else "mkvmerge"
 
-        demuxer = options.get("ffmpeg-demuxer")
-        if demuxer is None or demuxer == "auto":
-            if self.extension in ("webm", "mkv") and (
+        ext = options.get("extension")
+        mode = options.get("mode") or options.get("ffmpeg-demuxer")
+        if mode is None or mode == "auto":
+            if ext in (None, "webm", "mkv") and (
                     mkvmerge or shutil.which("mkvmerge")):
-                demuxer = "mkvmerge"
+                mode = "mkvmerge"
             else:
-                demuxer = "concat"
+                mode = "concat"
 
-        if demuxer == "mkvmerge":
+        if mode == "mkvmerge":
             self._process = self._process_mkvmerge
             self._finalize = self._finalize_mkvmerge
-        elif demuxer == "image2":
+        elif mode == "image2":
             self._process = self._process_image2
             self._finalize = None
+        elif mode == "archive":
+            if ext is None:
+                ext = "zip"
+            self._convert_impl = self.convert_to_archive
+            self._tempdir = util.NullContext
         else:
             self._process = self._process_concat
             self._finalize = None
-        self.log.debug("using %s demuxer", demuxer)
+        self.extension = "webm" if ext is None else ext
+        self.log.debug("using %s demuxer", mode)
 
         rate = options.get("framerate", "auto")
         if rate == "uniform":
@@ -93,8 +100,8 @@
 
         job.register_hooks({
             "prepare": self.prepare,
-            "file"   : self.convert_zip,
-            "after"  : self.convert_files,
+            "file"   : self.convert_from_zip,
+            "after"  : self.convert_from_files,
         }, options)
 
     def prepare(self, pathfmt):
@@ -109,12 +116,15 @@
             pathfmt.set_extension(self.extension)
             pathfmt.build_path()
         else:
+            index = pathfmt.kwdict.get("_ugoira_frame_index")
+            if index is None:
+                return
+
             pathfmt.build_path()
-            index = pathfmt.kwdict["_ugoira_frame_index"]
             frame = self._frames[index].copy()
             frame["index"] = index
             frame["path"] = pathfmt.realpath
-            frame["ext"] = pathfmt.kwdict["extension"]
+            frame["ext"] = pathfmt.extension
 
             if not index:
                 self._files = [frame]
@@ -123,31 +133,34 @@
             if len(self._files) >= len(self._frames):
                 self._convert_files = True
 
-    def convert_zip(self, pathfmt):
+    def convert_from_zip(self, pathfmt):
         if not self._convert_zip:
             return
         self._convert_zip = False
+        self._zip_source = True
 
-        with tempfile.TemporaryDirectory() as tempdir:
-            try:
-                with zipfile.ZipFile(pathfmt.temppath) as zfile:
-                    zfile.extractall(tempdir)
-            except FileNotFoundError:
-                pathfmt.realpath = pathfmt.temppath
-                return
+        with self._tempdir() as tempdir:
+            if tempdir:
+                try:
+                    with zipfile.ZipFile(pathfmt.temppath) as zfile:
+                        zfile.extractall(tempdir)
+                except FileNotFoundError:
+                    pathfmt.realpath = pathfmt.temppath
+                    return
 
             if self.convert(pathfmt, tempdir):
                 if self.delete:
                     pathfmt.delete = True
-                else:
+                elif pathfmt.extension != "zip":
                     self.log.info(pathfmt.filename)
                     pathfmt.set_extension("zip")
                     pathfmt.build_path()
 
-    def convert_files(self, pathfmt):
+    def convert_from_files(self, pathfmt):
         if not self._convert_files:
             return
         self._convert_files = False
+        self._zip_source = False
 
         with tempfile.TemporaryDirectory() as tempdir:
             for frame in self._files:
@@ -156,13 +169,14 @@
                 frame["file"] = name = "{}.{}".format(
                     frame["file"].partition(".")[0], frame["ext"])
 
-                # move frame into tempdir
-                try:
-                    self._copy_file(frame["path"], tempdir + "/" + name)
-                except OSError as exc:
-                    self.log.debug("Unable to copy frame %s (%s: %s)",
-                                   name, exc.__class__.__name__, exc)
-                    return
+                if tempdir:
+                    # move frame into tempdir
+                    try:
+                        self._copy_file(frame["path"], tempdir + "/" + name)
+                    except OSError as exc:
+                        self.log.debug("Unable to copy frame %s (%s: %s)",
+                                       name, exc.__class__.__name__, exc)
+                        return
 
             pathfmt.kwdict["num"] = 0
             self._frames = self._files
@@ -179,6 +193,9 @@
         if self.skip and pathfmt.exists():
             return True
 
+        return self._convert_impl(pathfmt, tempdir)
+
+    def convert_to_animation(self, pathfmt, tempdir):
         # process frames and collect command-line arguments
         args = self._process(pathfmt, tempdir)
         if self.args_pp:
@@ -206,11 +223,12 @@
                 print()
                 self.log.error("Unable to invoke FFmpeg (%s: %s)",
                                exc.__class__.__name__, exc)
+                self.log.debug("", exc_info=exc)
                 pathfmt.realpath = pathfmt.temppath
             except Exception as exc:
                 print()
                 self.log.error("%s: %s", exc.__class__.__name__, exc)
-                self.log.debug("", exc_info=True)
+                self.log.debug("", exc_info=exc)
                 pathfmt.realpath = pathfmt.temppath
             else:
                 if self.mtime:
@@ -219,6 +237,54 @@
                     util.set_mtime(pathfmt.realpath, mtime)
         return True
 
+    def convert_to_archive(self, pathfmt, tempdir):
+        frames = self._frames
+
+        if self.metadata:
+            if isinstance(self.metadata, str):
+                metaname = self.metadata
+            else:
+                metaname = "animation.json"
+            framedata = util.json_dumps([
+                {"file": frame["file"], "delay": frame["delay"]}
+                for frame in frames
+            ]).encode()
+
+        if self._zip_source:
+            self.delete = False
+            if self.metadata:
+                with zipfile.ZipFile(pathfmt.temppath, "a") as zfile:
+                    zinfo = zipfile.ZipInfo(metaname)
+                    if self.mtime:
+                        zinfo.date_time = zfile.infolist()[0].date_time
+                    with zfile.open(zinfo, "w") as fp:
+                        fp.write(framedata)
+        else:
+            if self.mtime:
+                dt = pathfmt.kwdict["date_url"] or pathfmt.kwdict["date"]
+                mtime = (dt.year, dt.month, dt.day,
+                         dt.hour, dt.minute, dt.second)
+            with zipfile.ZipFile(pathfmt.realpath, "w") as zfile:
+                for frame in frames:
+                    zinfo = zipfile.ZipInfo.from_file(
+                        frame["path"], frame["file"])
+                    if self.mtime:
+                        zinfo.date_time = mtime
+                    with open(frame["path"], "rb") as src, \
+                            zfile.open(zinfo, "w") as dst:
+                        shutil.copyfileobj(src, dst, 1024*8)
+                if self.metadata:
+                    zinfo = zipfile.ZipInfo(metaname)
+                    if self.mtime:
+                        zinfo.date_time = mtime
+                    with zfile.open(zinfo, "w") as fp:
+                        fp.write(framedata)
+
+        return True
+
+    _convert_impl = convert_to_animation
+    _tempdir = tempfile.TemporaryDirectory
+
     def _exec(self, args):
         self.log.debug(args)
         out = None if self.output else subprocess.DEVNULL
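
Two details in the hunks above are worth spelling out. First, "mode": "archive" swaps in convert_to_archive() and replaces the temporary directory with util.NullContext, whose with-block yields None, so the "if tempdir:" branches skip extraction and the downloaded zip is reused as-is. Second, the archive it writes contains the frames plus an "animation.json" entry of the form [{"file": ..., "delay": ...}, ...]. A small stdlib-only consumer sketch; "example.zip" is a placeholder path:

    import json
    import zipfile

    with zipfile.ZipFile("example.zip") as zfile:
        for frame in json.loads(zfile.read("animation.json")):
            data = zfile.read(frame["file"])  # raw frame bytes
            print(frame["file"], frame["delay"], len(data))
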
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index 9258187..8517cdf 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -9,7 +9,9 @@
 """Collection of functions that work on strings/text"""
 
 import re
+import sys
 import html
+import time
 import datetime
 import urllib.parse
 
@@ -247,12 +249,23 @@
     return result
 
 
-def parse_timestamp(ts, default=None):
-    """Create a datetime object from a unix timestamp"""
-    try:
-        return datetime.datetime.utcfromtimestamp(int(ts))
-    except Exception:
-        return default
+if sys.hexversion < 0x30c0000:
+    # Python <= 3.11
+    def parse_timestamp(ts, default=None):
+        """Create a datetime object from a Unix timestamp"""
+        try:
+            return datetime.datetime.utcfromtimestamp(int(ts))
+        except Exception:
+            return default
+else:
+    # Python >= 3.12
+    def parse_timestamp(ts, default=None):
+        """Create a datetime object from a Unix timestamp"""
+        try:
+            Y, m, d, H, M, S, _, _, _ = time.gmtime(int(ts))
+            return datetime.datetime(Y, m, d, H, M, S)
+        except Exception:
+            return default
 
 
 def parse_datetime(date_string, format="%Y-%m-%dT%H:%M:%S%z", utcoffset=0):
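
The parse_timestamp() split avoids datetime.datetime.utcfromtimestamp(), which is deprecated since Python 3.12. Both branches yield the same naive UTC datetime; a quick equivalence check (the timestamp value is arbitrary):

    import datetime
    import time

    ts = 1700000000
    Y, m, d, H, M, S, _, _, _ = time.gmtime(ts)  # the 3.12+ code path
    new = datetime.datetime(Y, m, d, H, M, S)
    old = datetime.datetime.fromtimestamp(ts, datetime.timezone.utc)
    assert new == old.replace(tzinfo=None)       # same naive UTC result
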
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index ecb496d..128f48b 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -218,18 +218,34 @@ def to_string(value):
 
 
 def datetime_to_timestamp(dt):
-    """Convert naive UTC datetime to timestamp"""
+    """Convert naive UTC datetime to Unix timestamp"""
     return (dt - EPOCH) / SECOND
 
 
 def datetime_to_timestamp_string(dt):
-    """Convert naive UTC datetime to timestamp string"""
+    """Convert naive UTC datetime to Unix timestamp string"""
     try:
         return str((dt - EPOCH) // SECOND)
     except Exception:
         return ""
 
 
+if sys.hexversion < 0x30c0000:
+    # Python <= 3.11
+    datetime_utcfromtimestamp = datetime.datetime.utcfromtimestamp
+    datetime_utcnow = datetime.datetime.utcnow
+    datetime_from_timestamp = datetime_utcfromtimestamp
+else:
+    # Python >= 3.12
+    def datetime_from_timestamp(ts=None):
+        """Convert Unix timestamp to naive UTC datetime"""
+        Y, m, d, H, M, S, _, _, _ = time.gmtime(ts)
+        return datetime.datetime(Y, m, d, H, M, S)
+
+    datetime_utcfromtimestamp = datetime_from_timestamp
+    datetime_utcnow = datetime_from_timestamp
+
+
 def json_default(obj):
     if isinstance(obj, CustomNone):
         return None
@@ -516,6 +532,15 @@
         return getpass.getpass()
 
 
+class NullContext():
+
+    def __enter__(self):
+        return None
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        pass
+
+
 class CustomNone():
     """None-style type that supports more operations than regular None"""
     __slots__ = ()
@@ -760,8 +785,9 @@
     if catsub:
         def test(extr):
             for category, subcategory in catsub:
-                if category in (extr.category, extr.basecategory) and \
-                        subcategory == extr.subcategory:
+                if subcategory == extr.subcategory and (
+                        category == extr.category or
+                        category == extr.basecategory):
                     return not negate
             return negate
         tests.append(test)
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 0f9f91b..513da41 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,5 +6,5 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.27.4"
+__version__ = "1.27.5"
 __variant__ = None
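
The build_extractor_filter() rewrite checks the subcategory first, then either category attribute; this is the same filter the new --list-extractors CATEGORIES code path in __init__.py relies on. A sketch with a stand-in class, since building a real extractor is beside the point here (the attribute names match the real ones, the class itself is hypothetical):

    from gallery_dl import util

    class FakeExtractor:
        category = "zzup"
        basecategory = ""
        subcategory = "gallery"

    fltr = util.build_extractor_filter(["zzup:gallery"], negate=False)
    print(fltr(FakeExtractor))  # True
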
