Diffstat (limited to 'gallery_dl')
39 files changed, 812 insertions, 277 deletions
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 4b39c15..663fe99 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -238,6 +238,13 @@ def main(): return config.open_extern() else: + input_files = config.get((), "input-files") + if input_files: + for input_file in input_files: + if isinstance(input_file, str): + input_file = (input_file, None) + args.input_files.append(input_file) + if not args.urls and not args.input_files: parser.error( "The following arguments are required: URL\n" diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index f017929..deb7c7b 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -179,11 +179,14 @@ def _firefox_cookies_database(profile=None, container=None): "{}".format(search_root)) _log_debug("Extracting cookies from %s", path) - if container == "none": + if not container or container == "none": container_id = False _log_debug("Only loading cookies not belonging to any container") - elif container: + elif container == "all": + container_id = None + + else: containers_path = os.path.join( os.path.dirname(path), "containers.json") @@ -207,8 +210,6 @@ def _firefox_cookies_database(profile=None, container=None): container)) _log_debug("Only loading cookies from container '%s' (ID %s)", container, container_id) - else: - container_id = None return path, container_id diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py index 87e7756..b3bec21 100644 --- a/gallery_dl/downloader/ytdl.py +++ b/gallery_dl/downloader/ytdl.py @@ -42,8 +42,9 @@ class YoutubeDLDownloader(DownloaderBase): if not ytdl_instance: try: module = ytdl.import_module(self.config("module")) - except ImportError as exc: - self.log.error("Cannot import module '%s'", exc.name) + except (ImportError, SyntaxError) as exc: + self.log.error("Cannot import module '%s'", + getattr(exc, "name", "")) self.log.debug("", exc_info=True) self.download = lambda u, p: False return False diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py index 2adb142..786acd9 100644 --- a/gallery_dl/extractor/batoto.py +++ b/gallery_dl/extractor/batoto.py @@ -51,28 +51,29 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor): if not manga: manga = extr('link-hover">', "<") info = text.remove_html(extr('link-hover">', "</")) + info = text.unescape(info) match = re.match( - r"(?:Volume\s+(\d+) )?" - r"\w+\s+(\d+)(.*)", info) + r"(?i)(?:(?:Volume|S(?:eason)?)\s*(\d+)\s+)?" 
+ r"(?:Chapter|Episode)\s*(\d+)([\w.]*)", info) if match: volume, chapter, minor = match.groups() - title = text.remove_html(extr( - "selected>", "</option")).partition(" : ")[2] else: volume = chapter = 0 minor = "" - title = info return { - "manga" : text.unescape(manga), - "manga_id" : text.parse_int(manga_id), - "title" : text.unescape(title), - "volume" : text.parse_int(volume), - "chapter" : text.parse_int(chapter), - "chapter_minor": minor, - "chapter_id" : text.parse_int(self.chapter_id), - "date" : text.parse_timestamp(extr(' time="', '"')[:-3]), + "manga" : text.unescape(manga), + "manga_id" : text.parse_int(manga_id), + "chapter_url" : extr(self.chapter_id + "-ch_", '"'), + "title" : text.unescape(text.remove_html(extr( + "selected>", "</option")).partition(" : ")[2]), + "volume" : text.parse_int(volume), + "chapter" : text.parse_int(chapter), + "chapter_minor" : minor, + "chapter_string": info, + "chapter_id" : text.parse_int(self.chapter_id), + "date" : text.parse_timestamp(extr(' time="', '"')[:-3]), } def images(self, page): diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 240bbd3..780bdf1 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -6,15 +6,24 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://bunkr.sk/""" +"""Extractors for https://bunkr.si/""" from .lolisafe import LolisafeAlbumExtractor -from .. import text - -BASE_PATTERN = ( - r"(?:https?://)?(?:app\.)?(bunkr+" - r"\.(?:s[kiu]|[cf]i|ru|la|is|to|ac|black|cat|media|red|site|ws|org))" -) +from .. import text, config + + +if config.get(("extractor", "bunkr"), "tlds"): + BASE_PATTERN = ( + r"(?:bunkr:(?:https?://)?([^/?#]+)|" + r"(?:https?://)?(?:app\.)?(bunkr+\.\w+))" + ) +else: + BASE_PATTERN = ( + r"(?:bunkr:(?:https?://)?([^/?#]+)|" + r"(?:https?://)?(?:app\.)?(bunkr+" + r"\.(?:s[kiu]|[cf]i|ru|la|is|to|a[cx]" + r"|black|cat|media|red|site|ws|org)))" + ) LEGACY_DOMAINS = { "bunkr.ru", @@ -28,15 +37,15 @@ LEGACY_DOMAINS = { class BunkrAlbumExtractor(LolisafeAlbumExtractor): - """Extractor for bunkr.sk albums""" + """Extractor for bunkr.si albums""" category = "bunkr" - root = "https://bunkr.sk" + root = "https://bunkr.si" pattern = BASE_PATTERN + r"/a/([^/?#]+)" - example = "https://bunkr.sk/a/ID" + example = "https://bunkr.si/a/ID" def __init__(self, match): LolisafeAlbumExtractor.__init__(self, match) - domain = match.group(match.lastindex-1) + domain = self.groups[0] or self.groups[1] if domain not in LEGACY_DOMAINS: self.root = "https://" + domain @@ -69,11 +78,16 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): def _extract_file(self, url): page = self.request(url).text - return ( - text.extr(page, '<source src="', '"') or - text.extr(page, '<img src="', '"') or - text.rextract(page, ' href="', '"', page.rindex("Download"))[0] - ) + url = (text.extr(page, '<source src="', '"') or + text.extr(page, '<img src="', '"')) + + if not url: + url_download = text.rextract( + page, ' href="', '"', page.rindex("Download"))[0] + page = self.request(text.unescape(url_download)).text + url = text.unescape(text.rextract(page, ' href="', '"')[0]) + + return url def _validate(self, response): if response.history and response.url.endswith("/maintenance-vid.mp4"): @@ -83,11 +97,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): class BunkrMediaExtractor(BunkrAlbumExtractor): - """Extractor for bunkr.sk media links""" + """Extractor for bunkr.si media links""" 
subcategory = "media" directory_fmt = ("{category}",) pattern = BASE_PATTERN + r"(/[vid]/[^/?#]+)" - example = "https://bunkr.sk/v/FILENAME" + example = "https://bunkr.si/v/FILENAME" def fetch_album(self, album_id): try: diff --git a/gallery_dl/extractor/cyberdrop.py b/gallery_dl/extractor/cyberdrop.py index d864960..a514696 100644 --- a/gallery_dl/extractor/cyberdrop.py +++ b/gallery_dl/extractor/cyberdrop.py @@ -14,6 +14,7 @@ from .. import text class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor): category = "cyberdrop" root = "https://cyberdrop.me" + root_api = "https://api.cyberdrop.me" pattern = r"(?:https?://)?(?:www\.)?cyberdrop\.(?:me|to)/a/([^/?#]+)" example = "https://cyberdrop.me/a/ID" @@ -55,5 +56,14 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor): def _extract_files(self, file_ids): for file_id in file_ids: - url = "{}/api/f/{}".format(self.root, file_id) - yield self.request(url).json() + try: + url = "{}/api/file/info/{}".format(self.root_api, file_id) + file = self.request(url).json() + auth = self.request(file["auth_url"]).json() + file["url"] = auth["url"] + except Exception as exc: + self.log.warning("%s (%s: %s)", + file_id, exc.__class__.__name__, exc) + continue + + yield file diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index f3ea4e7..ea70b58 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -69,11 +69,12 @@ class DeviantartExtractor(Extractor): self.quality = ",q_{}".format(self.quality) self.quality_sub = re.compile(r",q_\d+").sub - if self.original != "image": - self._update_content = self._update_content_default - else: - self._update_content = self._update_content_image + if isinstance(self.original, str) and \ + self.original.lower().startswith("image"): self.original = True + self._update_content = self._update_content_image + else: + self._update_content = self._update_content_default journals = self.config("journals", "html") if journals == "html": @@ -1462,6 +1463,8 @@ class DeviantartOAuthAPI(): return if "next_cursor" in data: + if not data["next_cursor"]: + return params["offset"] = None params["cursor"] = data["next_cursor"] elif data["next_offset"] is not None: diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py index af963bc..553ec22 100644 --- a/gallery_dl/extractor/e621.py +++ b/gallery_dl/extractor/e621.py @@ -10,6 +10,7 @@ from .common import Message from . import danbooru +from ..cache import memcache from .. 
import text, util @@ -44,16 +45,11 @@ class E621Extractor(danbooru.DanbooruExtractor): self.root[8:], md5[0:2], md5[2:4], md5, file["ext"]) if notes and post.get("has_notes"): - url = "{}/notes.json?search[post_id]={}".format( - self.root, post["id"]) - post["notes"] = self.request(url).json() + post["notes"] = self._get_notes(post["id"]) if pools and post["pools"]: - url = "{}/pools.json?search[id]={}".format( - self.root, ",".join(map(str, post["pools"]))) - post["pools"] = _pools = self.request(url).json() - for pool in _pools: - pool["name"] = pool["name"].replace("_", " ") + post["pools"] = self._get_pools( + ",".join(map(str, post["pools"]))) post["filename"] = file["md5"] post["extension"] = file["ext"] @@ -64,6 +60,18 @@ class E621Extractor(danbooru.DanbooruExtractor): yield Message.Directory, post yield Message.Url, file["url"], post + def _get_notes(self, id): + return self.request( + "{}/notes.json?search[post_id]={}".format(self.root, id)).json() + + @memcache(keyarg=1) + def _get_pools(self, ids): + pools = self.request( + "{}/pools.json?search[id]={}".format(self.root, ids)).json() + for pool in pools: + pool["name"] = pool["name"].replace("_", " ") + return pools + BASE_PATTERN = E621Extractor.update({ "e621": { diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 1b4f995..01af7a4 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -430,7 +430,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): } page = self.request(url, cookies=cookies).text - current = text.extr(page, "<strong>", "</strong>") + current = text.extr(page, "<strong>", "</strong>").replace(",", "") self.log.debug("Image Limits: %s/%s", current, self.limits) self._remaining = self.limits - text.parse_int(current) diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index c94a110..1b4971c 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -75,11 +75,8 @@ class FlickrImageExtractor(FlickrExtractor): def items(self): photo = self.api.photos_getInfo(self.item_id) - if self.api.exif: - photo.update(self.api.photos_getExif(self.item_id)) - if self.api.contexts: - photo.update(self.api.photos_getAllContexts(self.item_id)) + self.api._extract_metadata(photo) if photo["media"] == "video" and self.api.videos: self.api._extract_video(photo) else: @@ -135,8 +132,13 @@ class FlickrAlbumExtractor(FlickrExtractor): def metadata(self): data = FlickrExtractor.metadata(self) - data["album"] = self.api.photosets_getInfo( - self.album_id, self.user["nsid"]) + try: + data["album"] = self.api.photosets_getInfo( + self.album_id, self.user["nsid"]) + except Exception: + data["album"] = {} + self.log.warning("%s: Unable to retrieve album metadata", + self.album_id) return data def photos(self): @@ -407,6 +409,8 @@ class FlickrAPI(oauth.OAuth1API): self.log.debug("Server response: %s", data) if data["code"] == 1: raise exception.NotFoundError(self.extractor.subcategory) + elif data["code"] == 2: + raise exception.AuthorizationError(msg) elif data["code"] == 98: raise exception.AuthenticationError(msg) elif data["code"] == 99: @@ -453,10 +457,7 @@ class FlickrAPI(oauth.OAuth1API): photo["date"] = text.parse_timestamp(photo["dateupload"]) photo["tags"] = photo["tags"].split() - if self.exif: - photo.update(self.photos_getExif(photo["id"])) - if self.contexts: - photo.update(self.photos_getAllContexts(photo["id"])) + self._extract_metadata(photo) photo["id"] = text.parse_int(photo["id"]) if "owner" in photo: @@ 
-512,6 +513,23 @@ class FlickrAPI(oauth.OAuth1API): photo["width"] = photo["height"] = 0 return photo + def _extract_metadata(self, photo): + if self.exif: + try: + photo.update(self.photos_getExif(photo["id"])) + except Exception as exc: + self.log.warning( + "Unable to retrieve 'exif' data for %s (%s: %s)", + photo["id"], exc.__class__.__name__, exc) + + if self.contexts: + try: + photo.update(self.photos_getAllContexts(photo["id"])) + except Exception as exc: + self.log.warning( + "Unable to retrieve 'contexts' data for %s (%s: %s)", + photo["id"], exc.__class__.__name__, exc) + @staticmethod def _clean_info(info): info["title"] = info["title"]["_content"] diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index 3055426..d253582 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -179,6 +179,11 @@ class FuraffinityExtractor(Extractor): break self._favorite_id = text.parse_int(extr('data-fav-id="', '"')) yield post_id + + pos = page.find('type="submit">Next</button>') + if pos >= 0: + path = text.rextract(page, '<form action="', '"', pos)[0] + continue path = text.extr(page, 'right" href="', '"') def _pagination_search(self, query): diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py index 16d4340..a6c1d5a 100644 --- a/gallery_dl/extractor/generic.py +++ b/gallery_dl/extractor/generic.py @@ -15,7 +15,7 @@ import re class GenericExtractor(Extractor): """Extractor for images in a generic web page.""" category = "generic" - directory_fmt = ("{category}", "{pageurl}") + directory_fmt = ("{category}", "{subcategory}", "{path}") archive_fmt = "{imageurl}" # By default, the generic extractor is disabled @@ -52,7 +52,10 @@ class GenericExtractor(Extractor): self.scheme = match.group('scheme') else: self.scheme = 'https://' - self.url = self.scheme + self.url + self.url = text.ensure_http_scheme(self.url, self.scheme) + + self.subcategory = match.group('domain') + self.path = match.group('path') # Used to resolve relative image urls self.root = self.scheme + match.group('domain') @@ -87,6 +90,7 @@ class GenericExtractor(Extractor): def metadata(self, page): """Extract generic webpage metadata, return them in a dict.""" data = {} + data['path'] = self.path.replace("/", "") data['pageurl'] = self.url data['title'] = text.extr(page, '<title>', "</title>") data['description'] = text.extr( diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py index f0eb4e9..52b4ae6 100644 --- a/gallery_dl/extractor/gofile.py +++ b/gallery_dl/extractor/gofile.py @@ -47,8 +47,7 @@ class GofileFolderExtractor(Extractor): raise exception.AuthorizationError("Password required") num = 0 - for content_id in folder["childrenIds"]: - content = contents[content_id] + for content in contents.values(): content["folder"] = folder if content["type"] == "file": diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index 9b74700..18df9df 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -89,6 +89,7 @@ class HitomiGalleryExtractor(GalleryExtractor): path = ext = "webp" ihash = image["hash"] idata = text.nameext_from_url(image["name"]) + idata["extension_original"] = idata["extension"] if ext: idata["extension"] = ext diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index c05fe72..422c865 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -12,6 +12,7 @@ from .common import 
Extractor, Message from .. import text, util, exception from ..cache import cache, memcache +import itertools import binascii import json import re @@ -57,12 +58,17 @@ class InstagramExtractor(Extractor): data = self.metadata() videos = self.config("videos", True) previews = self.config("previews", False) + max_posts = self.config("max-posts") video_headers = {"User-Agent": "Mozilla/5.0"} order = self.config("order-files") reverse = order[0] in ("r", "d") if order else False - for post in self.posts(): + posts = self.posts() + if max_posts: + posts = itertools.islice(posts, max_posts) + + for post in posts: if "__typename" in post: post = self._parse_post_graphql(post) @@ -159,15 +165,19 @@ class InstagramExtractor(Extractor): if "title" in post: data["highlight_title"] = post["title"] if "created_at" in post: - data["date"] = text.parse_timestamp(post.get("created_at")) + data["post_date"] = data["date"] = text.parse_timestamp( + post.get("created_at")) else: # regular image/video post + date = text.parse_timestamp(post.get("taken_at")) data = { "post_id" : post["pk"], "post_shortcode": post["code"], + "post_url": "{}/p/{}/".format(self.root, post["code"]), + "post_date": date, + "date": date, "likes": post.get("like_count", 0), "pinned": post.get("timeline_pinned_user_ids", ()), - "date": text.parse_timestamp(post.get("taken_at")), "liked": post.get("has_liked", False), } @@ -206,7 +216,6 @@ class InstagramExtractor(Extractor): data["owner_id"] = owner["pk"] data["username"] = owner.get("username") data["fullname"] = owner.get("full_name") - data["post_url"] = "{}/p/{}/".format(self.root, data["post_shortcode"]) data["_files"] = files = [] for num, item in enumerate(items, 1): @@ -269,7 +278,6 @@ class InstagramExtractor(Extractor): owner = post["owner"] data = { "typename" : typename, - "date" : text.parse_timestamp(post["taken_at_timestamp"]), "likes" : post["edge_media_preview_like"]["count"], "liked" : post.get("viewer_has_liked", False), "pinned" : pinned, @@ -279,11 +287,13 @@ class InstagramExtractor(Extractor): "post_id" : post["id"], "post_shortcode": post["shortcode"], "post_url" : "{}/p/{}/".format(self.root, post["shortcode"]), + "post_date" : text.parse_timestamp(post["taken_at_timestamp"]), "description": text.parse_unicode_escapes("\n".join( edge["node"]["text"] for edge in post["edge_media_to_caption"]["edges"] )), } + data["date"] = data["post_date"] tags = self._find_tags(data["description"]) if tags: @@ -313,6 +323,7 @@ class InstagramExtractor(Extractor): media = { "num": num, "media_id" : node["id"], + "date" : data["date"], "shortcode" : (node.get("shortcode") or shortcode_from_id(node["id"])), "display_url": node["display_url"], @@ -328,6 +339,7 @@ class InstagramExtractor(Extractor): dimensions = post["dimensions"] media = { "media_id" : post["id"], + "date" : data["date"], "shortcode" : post["shortcode"], "display_url": post["display_url"], "video_url" : post.get("video_url"), @@ -378,7 +390,11 @@ class InstagramExtractor(Extractor): "full_name": user["full_name"]}) def _init_cursor(self): - return self.config("cursor") or None + cursor = self.config("cursor", True) + if not cursor: + self._update_cursor = util.identity + elif isinstance(cursor, str): + return cursor def _update_cursor(self, cursor): self.log.debug("Cursor: %s", cursor) @@ -418,6 +434,7 @@ class InstagramUserExtractor(InstagramExtractor): base = "{}/{}/".format(self.root, self.item) stories = "{}/stories/{}/".format(self.root, self.item) return self._dispatch_extractors(( + 
(InstagramInfoExtractor , base + "info/"), (InstagramAvatarExtractor , base + "avatar/"), (InstagramStoriesExtractor , stories), (InstagramHighlightsExtractor, base + "highlights/"), diff --git a/gallery_dl/extractor/koharu.py b/gallery_dl/extractor/koharu.py index 979b1a2..cacf504 100644 --- a/gallery_dl/extractor/koharu.py +++ b/gallery_dl/extractor/koharu.py @@ -161,16 +161,29 @@ class KoharuGalleryExtractor(KoharuExtractor, GalleryExtractor): return results def _select_format(self, formats): - if not self.fmt or self.fmt == "original": - fmtid = "0" + fmt = self.fmt + + if not fmt or fmt == "best": + fmtids = ("0", "1600", "1280", "980", "780") + elif isinstance(fmt, str): + fmtids = fmt.split(",") + elif isinstance(fmt, list): + fmtids = fmt else: - fmtid = str(self.fmt) + fmtids = (str(self.fmt),) - try: - fmt = formats[fmtid] - except KeyError: + for fmtid in fmtids: + try: + fmt = formats[fmtid] + if fmt["id"]: + break + except KeyError: + self.log.debug("%s: Format %s is not available", + self.groups[0], fmtid) + else: raise exception.NotFoundError("format") + self.log.debug("%s: Selected format %s", self.groups[0], fmtid) fmt["w"] = fmtid return fmt diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py index 3d7d685..117b88b 100644 --- a/gallery_dl/extractor/lolisafe.py +++ b/gallery_dl/extractor/lolisafe.py @@ -34,7 +34,7 @@ class LolisafeAlbumExtractor(LolisafeExtractor): def __init__(self, match): LolisafeExtractor.__init__(self, match) - self.album_id = match.group(match.lastindex) + self.album_id = self.groups[-1] def _init(self): domain = self.config("domain") diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index ecd6619..5fc0ce5 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -171,15 +171,17 @@ class NewgroundsExtractor(Extractor): if self.flash: url += "/format/flash" - with self.request(url, fatal=False) as response: - if response.status_code >= 400: - return {} - page = response.text + response = self.request(url, fatal=False) + page = response.text pos = page.find('id="adults_only"') if pos >= 0: msg = text.extract(page, 'class="highlight">', '<', pos)[0] self.log.warning('"%s"', msg) + return {} + + if response.status_code >= 400: + return {} extr = text.extract_from(page) data = extract_data(extr, post_url) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index d732894..3479b88 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -94,12 +94,39 @@ class PixivExtractor(Extractor): work.get("id"), exc.message) continue - url = ugoira["zip_urls"]["medium"].replace( - "_ugoira600x600", "_ugoira1920x1080") - work["frames"] = ugoira["frames"] + url = ugoira["zip_urls"]["medium"] + work["frames"] = frames = ugoira["frames"] work["date_url"] = self._date_from_url(url) work["_http_adjust_extension"] = False - yield Message.Url, url, text.nameext_from_url(url, work) + + if self.load_ugoira == "original": + base, sep, _ = url.rpartition("_ugoira") + base = base.replace( + "/img-zip-ugoira/", "/img-original/", 1) + sep + + for ext in ("jpg", "png", "gif"): + try: + url = ("{}0.{}".format(base, ext)) + self.request(url, method="HEAD") + break + except exception.HttpError: + pass + else: + self.log.warning( + "Unable to find Ugoira frame URLs (%s)", + work.get("id")) + continue + + for num, frame in enumerate(frames): + url = ("{}{}.{}".format(base, num, ext)) + work["num"] = work["_ugoira_frame_index"] = num + 
work["suffix"] = "_p{:02}".format(num) + text.nameext_from_url(url, work) + yield Message.Url, url, work + + else: + url = url.replace("_ugoira600x600", "_ugoira1920x1080") + yield Message.Url, url, text.nameext_from_url(url, work) elif work["page_count"] == 1: url = meta_single_page["original_image_url"] @@ -551,9 +578,6 @@ class PixivSeriesExtractor(PixivExtractor): directory_fmt = ("{category}", "{user[id]} {user[account]}", "{series[id]} {series[title]}") filename_fmt = "{num_series:>03}_{id}_p{num}.{extension}" - cookies_domain = ".pixiv.net" - browser = "firefox" - tls12 = False pattern = BASE_PATTERN + r"/user/(\d+)/series/(\d+)" example = "https://www.pixiv.net/user/12345/series/12345" @@ -562,34 +586,18 @@ class PixivSeriesExtractor(PixivExtractor): self.user_id, self.series_id = match.groups() def works(self): - url = self.root + "/ajax/series/" + self.series_id - params = {"p": 1} - headers = { - "Accept": "application/json", - "Referer": "{}/user/{}/series/{}".format( - self.root, self.user_id, self.series_id), - "Alt-Used": "www.pixiv.net", - } + series = None - while True: - data = self.request(url, params=params, headers=headers).json() - body = data["body"] - page = body["page"] - - series = body["extraData"]["meta"] - series["id"] = self.series_id - series["total"] = page["total"] - series["title"] = text.extr(series["title"], '"', '"') - - for info in page["series"]: - work = self.api.illust_detail(info["workId"]) - work["num_series"] = info["order"] - work["series"] = series - yield work - - if len(page["series"]) < 10: - return - params["p"] += 1 + for work in self.api.illust_series(self.series_id): + if series is None: + series = self.api.data + series["total"] = num_series = series.pop("series_work_count") + else: + num_series -= 1 + + work["num_series"] = num_series + work["series"] = series + yield work class PixivNovelExtractor(PixivExtractor): @@ -916,6 +924,11 @@ class PixivAppAPI(): params = {"illust_id": illust_id} return self._pagination("/v2/illust/related", params) + def illust_series(self, series_id, offset=0): + params = {"illust_series_id": series_id, "offset": offset} + return self._pagination("/v1/illust/series", params, + key_data="illust_series_detail") + def novel_bookmark_detail(self, novel_id): params = {"novel_id": novel_id} return self._call( @@ -1013,10 +1026,15 @@ class PixivAppAPI(): raise exception.StopExtraction("API request failed: %s", error) - def _pagination(self, endpoint, params, key="illusts"): + def _pagination(self, endpoint, params, + key_items="illusts", key_data=None): while True: data = self._call(endpoint, params) - yield from data[key] + + if key_data: + self.data = data.get(key_data) + key_data = None + yield from data[key_items] if not data["next_url"]: return diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index ad3efa7..7db8172 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -66,7 +66,8 @@ class SankakuExtractor(BooruExtractor): def _prepare(self, post): post["created_at"] = post["created_at"]["s"] post["date"] = text.parse_timestamp(post["created_at"]) - post["tags"] = [tag["name"] for tag in post["tags"] if tag["name"]] + post["tags"] = [tag["name"].lower().replace(" ", "_") + for tag in post["tags"] if tag["name"]] post["tag_string"] = " ".join(post["tags"]) post["_http_validate"] = self._check_expired @@ -79,7 +80,7 @@ class SankakuExtractor(BooruExtractor): for tag in post["tags"]: name = tag["name"] if name: - tags[types[tag["type"]]].append(name) + 
tags[types[tag["type"]]].append(name.lower().replace(" ", "_")) for key, value in tags.items(): post["tags_" + key] = value post["tag_string_" + key] = " ".join(value) diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py index 80f2aea..7708b5c 100644 --- a/gallery_dl/extractor/sexcom.py +++ b/gallery_dl/extractor/sexcom.py @@ -152,6 +152,25 @@ class SexcomPinsExtractor(SexcomExtractor): return self._pagination(url) +class SexcomLikesExtractor(SexcomExtractor): + """Extractor for a user's liked pins on www.sex.com""" + subcategory = "likes" + directory_fmt = ("{category}", "{user}", "Likes") + pattern = r"(?:https?://)?(?:www\.)?sex\.com/user/([^/?#]+)/likes/" + example = "https://www.sex.com/user/USER/likes/" + + def __init__(self, match): + SexcomExtractor.__init__(self, match) + self.user = match.group(1) + + def metadata(self): + return {"user": text.unquote(self.user)} + + def pins(self): + url = "{}/user/{}/likes/".format(self.root, self.user) + return self._pagination(url) + + class SexcomBoardExtractor(SexcomExtractor): """Extractor for pins from a board on www.sex.com""" subcategory = "board" diff --git a/gallery_dl/extractor/szurubooru.py b/gallery_dl/extractor/szurubooru.py index bba1ece..b6917cc 100644 --- a/gallery_dl/extractor/szurubooru.py +++ b/gallery_dl/extractor/szurubooru.py @@ -86,6 +86,7 @@ BASE_PATTERN = SzurubooruExtractor.update({ "bcbnsfw": { "root": "https://booru.bcbnsfw.space", "pattern": r"booru\.bcbnsfw\.space", + "query-all": "*", }, "snootbooru": { "root": "https://snootbooru.com", @@ -110,7 +111,12 @@ class SzurubooruTagExtractor(SzurubooruExtractor): return {"search_tags": self.query} def posts(self): - return self._pagination("/posts/", {"query": self.query}) + if self.query.strip(): + query = self.query + else: + query = self.config_instance("query-all") + + return self._pagination("/posts/", {"query": query}) class SzurubooruPostExtractor(SzurubooruExtractor): diff --git a/gallery_dl/extractor/toyhouse.py b/gallery_dl/extractor/toyhouse.py index 64fa951..44d87ee 100644 --- a/gallery_dl/extractor/toyhouse.py +++ b/gallery_dl/extractor/toyhouse.py @@ -123,4 +123,5 @@ class ToyhouseImageExtractor(ToyhouseExtractor): def posts(self): url = "{}/~images/{}".format(self.root, self.user) - return (self._parse_post(self.request(url).text, '<img src="'),) + return (self._parse_post( + self.request(url).text, '<img class="mw-100" src="'),) diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index ff29c04..73455d2 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -400,6 +400,9 @@ class TumblrAPI(oauth.OAuth1API): """Retrieve liked posts""" endpoint = "/v2/blog/{}/likes".format(blog) params = {"limit": "50", "before": self.before} + if self.api_key: + params["api_key"] = self.api_key + while True: posts = self._call(endpoint, params)["liked_posts"] if not posts: diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index ea57d76..d4ec343 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -155,6 +155,7 @@ class TwitterExtractor(Extractor): if not self.unavailable: continue + mtype = media.get("type") descr = media.get("ext_alt_text") width = media["original_info"].get("width", 0) height = media["original_info"].get("height", 0) @@ -164,6 +165,7 @@ class TwitterExtractor(Extractor): files.append({ "url": "ytdl:{}/i/web/status/{}".format( self.root, tweet["id_str"]), + "type" : mtype, "width" : width, "height" : height, 
"extension" : None, @@ -177,6 +179,7 @@ class TwitterExtractor(Extractor): ) files.append({ "url" : variant["url"], + "type" : mtype, "width" : width, "height" : height, "bitrate" : variant.get("bitrate", 0), @@ -193,6 +196,7 @@ class TwitterExtractor(Extractor): base = url.rpartition("=")[0] + "=" files.append(text.nameext_from_url(url, { "url" : base + self._size_image, + "type" : mtype, "width" : width, "height" : height, "_fallback" : self._image_fallback(base), @@ -504,7 +508,11 @@ class TwitterExtractor(Extractor): } def _init_cursor(self): - return self.config("cursor") or None + cursor = self.config("cursor", True) + if not cursor: + self._update_cursor = util.identity + elif isinstance(cursor, str): + return cursor def _update_cursor(self, cursor): self.log.debug("Cursor: %s", cursor) @@ -560,6 +568,7 @@ class TwitterUserExtractor(TwitterExtractor): def items(self): base = "{}/{}/".format(self.root, self.user) return self._dispatch_extractors(( + (TwitterInfoExtractor , base + "info"), (TwitterAvatarExtractor , base + "photo"), (TwitterBackgroundExtractor, base + "header_photo"), (TwitterTimelineExtractor , base + "timeline"), @@ -590,9 +599,16 @@ class TwitterTimelineExtractor(TwitterExtractor): return cursor def tweets(self): - self._cursor = cursor = self.config("cursor") or None reset = False + cursor = self.config("cursor", True) + if not cursor: + self._update_cursor = util.identity + elif isinstance(cursor, str): + self._cursor = cursor + else: + cursor = None + if cursor: state = cursor.partition("/")[0] state, _, tweet_id = state.partition("_") @@ -1612,6 +1628,9 @@ class TwitterAPI(): entries = instr["entries"] elif instr_type == "TimelineAddToModule": entries = instr["moduleItems"] + elif instr_type == "TimelinePinEntry": + if pinned_tweet: + pinned_tweet = instr["entry"] elif instr_type == "TimelineReplaceEntry": entry = instr["entry"] if entry["entryId"].startswith("cursor-bottom-"): @@ -1650,9 +1669,11 @@ class TwitterAPI(): tweet = None if pinned_tweet: - pinned_tweet = False - if instructions[-1]["type"] == "TimelinePinEntry": + if isinstance(pinned_tweet, dict): + tweets.append(pinned_tweet) + elif instructions[-1]["type"] == "TimelinePinEntry": tweets.append(instructions[-1]["entry"]) + pinned_tweet = False for entry in entries: esw = entry["entryId"].startswith diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index 9370cfb..7a62e01 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -17,13 +17,11 @@ class WikimediaExtractor(BaseExtractor): """Base class for wikimedia extractors""" basecategory = "wikimedia" filename_fmt = "{filename} ({sha1[:8]}).{extension}" - directory_fmt = ("{category}", "{page}") archive_fmt = "{sha1}" request_interval = (1.0, 2.0) def __init__(self, match): BaseExtractor.__init__(self, match) - path = match.group(match.lastindex) if self.category == "wikimedia": self.category = self.root.split(".")[-2] @@ -31,31 +29,7 @@ class WikimediaExtractor(BaseExtractor): self.category = "{}-{}".format( self.category, self.root.partition(".")[0].rpartition("/")[2]) - if path.startswith("wiki/"): - path = path[5:] - - pre, sep, _ = path.partition(":") - prefix = pre.lower() if sep else None - - self.title = path = text.unquote(path) - if prefix: - self.subcategory = prefix - - if prefix == "category": - self.params = { - "generator": "categorymembers", - "gcmtitle" : path, - "gcmtype" : "file", - } - elif prefix == "file": - self.params = { - "titles" : path, - } - else: - 
self.params = { - "generator": "images", - "titles" : path, - } + self.per_page = self.config("limit", 50) def _init(self): api_path = self.config_instance("api-path") @@ -67,6 +41,22 @@ class WikimediaExtractor(BaseExtractor): else: self.api_url = self.root + "/api.php" + @staticmethod + def prepare(image): + """Adjust the content of a image object""" + image["metadata"] = { + m["name"]: m["value"] + for m in image["metadata"] or ()} + image["commonmetadata"] = { + m["name"]: m["value"] + for m in image["commonmetadata"] or ()} + + filename = image["canonicaltitle"] + image["filename"], _, image["extension"] = \ + filename.partition(":")[2].rpartition(".") + image["date"] = text.parse_datetime( + image["timestamp"], "%Y-%m-%dT%H:%M:%SZ") + def items(self): for info in self._pagination(self.params): try: @@ -75,20 +65,7 @@ class WikimediaExtractor(BaseExtractor): self.log.debug("Missing 'imageinfo' for %s", info) continue - image["metadata"] = { - m["name"]: m["value"] - for m in image["metadata"] or ()} - image["commonmetadata"] = { - m["name"]: m["value"] - for m in image["commonmetadata"] or ()} - - filename = image["canonicaltitle"] - image["filename"], _, image["extension"] = \ - filename.partition(":")[2].rpartition(".") - image["date"] = text.parse_datetime( - image["timestamp"], "%Y-%m-%dT%H:%M:%SZ") - image["page"] = self.title - + self.prepare(image) yield Message.Directory, image yield Message.Url, image["url"], image @@ -110,6 +87,17 @@ class WikimediaExtractor(BaseExtractor): while True: data = self.request(url, params=params).json() + # ref: https://www.mediawiki.org/wiki/API:Errors_and_warnings + error = data.get("error") + if error: + self.log.error("%s: %s", error["code"], error["info"]) + return + # MediaWiki will emit warnings for non-fatal mistakes such as + # invalid parameter instead of raising an error + warnings = data.get("warnings") + if warnings: + self.log.debug("MediaWiki returned warnings: %s", warnings) + try: pages = data["query"]["pages"] except KeyError: @@ -181,5 +169,59 @@ BASE_PATTERN = WikimediaExtractor.update({ class WikimediaArticleExtractor(WikimediaExtractor): """Extractor for wikimedia articles""" subcategory = "article" + directory_fmt = ("{category}", "{page}") pattern = BASE_PATTERN + r"/(?!static/)([^?#]+)" example = "https://en.wikipedia.org/wiki/TITLE" + + def __init__(self, match): + WikimediaExtractor.__init__(self, match) + + path = match.group(match.lastindex) + if path.startswith("wiki/"): + path = path[5:] + + pre, sep, _ = path.partition(":") + prefix = pre.lower() if sep else None + + self.title = path = text.unquote(path) + if prefix: + self.subcategory = prefix + + if prefix == "category": + self.params = { + "generator": "categorymembers", + "gcmtitle" : path, + "gcmtype" : "file", + "gcmlimit" : self.per_page, + } + elif prefix == "file": + self.params = { + "titles" : path, + } + else: + self.params = { + "generator": "images", + "gimlimit" : self.per_page, + "titles" : path, + } + + def prepare(self, image): + WikimediaExtractor.prepare(image) + image["page"] = self.title + + +class WikimediaWikiExtractor(WikimediaExtractor): + """Extractor for all files on a MediaWiki instance""" + subcategory = "wiki" + pattern = BASE_PATTERN + r"/?$" + example = "https://en.wikipedia.org/" + + def __init__(self, match): + WikimediaExtractor.__init__(self, match) + + # ref: https://www.mediawiki.org/wiki/API:Allpages + self.params = { + "generator" : "allpages", + "gapnamespace": 6, # "File" namespace + "gaplimit" : self.per_page, + } diff 
--git a/gallery_dl/extractor/ytdl.py b/gallery_dl/extractor/ytdl.py index cb3c74c..168845e 100644 --- a/gallery_dl/extractor/ytdl.py +++ b/gallery_dl/extractor/ytdl.py @@ -116,21 +116,20 @@ class YoutubeDLExtractor(Extractor): for entry in entries: if not entry: continue - elif entry.get("_type") in ("url", "url_transparent"): + + if entry.get("_type") in ("url", "url_transparent"): try: - info_dict = ytdl_instance.extract_info( + entry = ytdl_instance.extract_info( entry["url"], False, ie_key=entry.get("ie_key")) except ytdl_module.utils.YoutubeDLError: continue - - if not info_dict: + if not entry: continue - elif "entries" in info_dict: - yield from self._process_entries( - ytdl_module, ytdl_instance, info_dict["entries"]) - else: - yield info_dict + + if "entries" in entry: + yield from self._process_entries( + ytdl_module, ytdl_instance, entry["entries"]) else: yield entry diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py index ec1c926..f197e5d 100644 --- a/gallery_dl/formatter.py +++ b/gallery_dl/formatter.py @@ -325,6 +325,23 @@ def _parse_slice(format_spec, default): return apply_slice +def _parse_arithmetic(format_spec, default): + op, _, format_spec = format_spec.partition(_SEPARATOR) + fmt = _build_format_func(format_spec, default) + + value = int(op[2:]) + op = op[1] + + if op == "+": + return lambda obj: fmt(obj + value) + if op == "-": + return lambda obj: fmt(obj - value) + if op == "*": + return lambda obj: fmt(obj * value) + + return fmt + + def _parse_conversion(format_spec, default): conversions, _, format_spec = format_spec.partition(_SEPARATOR) convs = [_CONVERSIONS[c] for c in conversions[1:]] @@ -480,6 +497,7 @@ _CONVERSIONS = { _FORMAT_SPECIFIERS = { "?": _parse_optional, "[": _parse_slice, + "A": _parse_arithmetic, "C": _parse_conversion, "D": _parse_datetime, "J": _parse_join, diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 0e0916d..c995767 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -322,6 +322,12 @@ class DownloadJob(Job): for callback in hooks["prepare-after"]: callback(pathfmt) + if kwdict.pop("_file_recheck", False) and pathfmt.exists(): + if archive and self._archive_write_skip: + archive.add(kwdict) + self.handle_skip() + return + if self.sleep: self.extractor.sleep(self.sleep(), "download") @@ -474,10 +480,11 @@ class DownloadJob(Job): def handle_skip(self): pathfmt = self.pathfmt - self.out.skip(pathfmt.path) if "skip" in self.hooks: for callback in self.hooks["skip"]: callback(pathfmt) + self.out.skip(pathfmt.path) + if self._skipexc: if not self._skipftr or self._skipftr(pathfmt.kwdict): self._skipcnt += 1 diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 155cbd9..0189c0e 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -74,6 +74,21 @@ class MtimeAction(argparse.Action): }) +class RenameAction(argparse.Action): + """Configure rename post processors""" + def __call__(self, parser, namespace, value, option_string=None): + if self.const: + namespace.postprocessors.append({ + "name": "rename", + "to" : value, + }) + else: + namespace.postprocessors.append({ + "name": "rename", + "from": value, + }) + + class UgoiraAction(argparse.Action): """Configure ugoira post processors""" def __call__(self, parser, namespace, value, option_string=None): @@ -128,7 +143,7 @@ class UgoiraAction(argparse.Action): pp["name"] = "ugoira" pp["whitelist"] = ("pixiv", "danbooru") - namespace.options.append(((), "ugoira", True)) + namespace.options.append((("extractor",), "ugoira", True)) 
namespace.postprocessors.append(pp) @@ -207,7 +222,7 @@ def build_parser(): ) update = parser.add_argument_group("Update Options") - if util.EXECUTABLE or 1: + if util.EXECUTABLE: update.add_argument( "-U", "--update", dest="update", action="store_const", const="latest", @@ -526,7 +541,8 @@ def build_parser(): "domain prefixed with '/', " "keyring name prefixed with '+', " "profile prefixed with ':', and " - "container prefixed with '::' ('none' for no container)"), + "container prefixed with '::' " + "('none' for no container (default), 'all' for all containers)"), ) selection = parser.add_argument_group("Selection Options") @@ -661,9 +677,21 @@ def build_parser(): help=argparse.SUPPRESS, ) postprocessor.add_argument( + "--rename", + dest="postprocessors", metavar="FORMAT", action=RenameAction, const=0, + help=("Rename previously downloaded files from FORMAT " + "to the current filename format"), + ) + postprocessor.add_argument( + "--rename-to", + dest="postprocessors", metavar="FORMAT", action=RenameAction, const=1, + help=("Rename previously downloaded files from the current filename " + "format to FORMAT"), + ) + postprocessor.add_argument( "--ugoira", - dest="postprocessors", metavar="FORMAT", action=UgoiraAction, - help=("Convert Pixiv Ugoira to FORMAT using FFmpeg. " + dest="postprocessors", metavar="FMT", action=UgoiraAction, + help=("Convert Pixiv Ugoira to FMT using FFmpeg. " "Supported formats are 'webm', 'mp4', 'gif', " "'vp8', 'vp9', 'vp9-lossless', 'copy'."), ) diff --git a/gallery_dl/path.py b/gallery_dl/path.py index 7892776..d408a41 100644 --- a/gallery_dl/path.py +++ b/gallery_dl/path.py @@ -184,29 +184,31 @@ class PathFormat(): def set_directory(self, kwdict): """Build directory path and create it if necessary""" self.kwdict = kwdict - sep = os.sep segments = self.build_directory(kwdict) if segments: self.directory = directory = self.basedirectory + self.clean_path( - sep.join(segments) + sep) + os.sep.join(segments) + os.sep) else: self.directory = directory = self.basedirectory if WINDOWS and self.extended: - # Enable longer-than-260-character paths - directory = os.path.abspath(directory) - if directory.startswith("\\\\"): - directory = "\\\\?\\UNC\\" + directory[2:] - else: - directory = "\\\\?\\" + directory - - # abspath() in Python 3.7+ removes trailing path separators (#402) - if directory[-1] != sep: - directory += sep - + directory = self._extended_path(directory) self.realdirectory = directory + def _extended_path(self, path): + # Enable longer-than-260-character paths + path = os.path.abspath(path) + if not path.startswith("\\\\"): + path = "\\\\?\\" + path + elif not path.startswith("\\\\?\\"): + path = "\\\\?\\UNC\\" + path[2:] + + # abspath() in Python 3.7+ removes trailing path separators (#402) + if path[-1] != os.sep: + return path + os.sep + return path + def set_filename(self, kwdict): """Set general filename data""" self.kwdict = kwdict diff --git a/gallery_dl/postprocessor/__init__.py b/gallery_dl/postprocessor/__init__.py index 4690554..7837b06 100644 --- a/gallery_dl/postprocessor/__init__.py +++ b/gallery_dl/postprocessor/__init__.py @@ -12,9 +12,11 @@ modules = [ "classify", "compare", "exec", + "hash", "metadata", "mtime", "python", + "rename", "ugoira", "zip", ] diff --git a/gallery_dl/postprocessor/hash.py b/gallery_dl/postprocessor/hash.py new file mode 100644 index 0000000..92a7477 --- /dev/null +++ b/gallery_dl/postprocessor/hash.py @@ -0,0 +1,71 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free 
software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Compute file hash digests""" + +from .common import PostProcessor +import hashlib + + +class HashPP(PostProcessor): + + def __init__(self, job, options): + PostProcessor.__init__(self, job) + + self.chunk_size = options.get("chunk-size", 32768) + self.filename = options.get("filename") + + hashes = options.get("hashes") + if isinstance(hashes, dict): + self.hashes = list(hashes.items()) + elif isinstance(hashes, str): + self.hashes = [] + for h in hashes.split(","): + name, sep, key = h.partition(":") + self.hashes.append((key if sep else name, name)) + elif hashes: + self.hashes = hashes + else: + self.hashes = (("md5", "md5"), ("sha1", "sha1")) + + events = options.get("event") + if events is None: + events = ("file",) + elif isinstance(events, str): + events = events.split(",") + job.register_hooks({event: self.run for event in events}, options) + + def run(self, pathfmt): + hashes = [ + (key, hashlib.new(name)) + for key, name in self.hashes + ] + + size = self.chunk_size + with self._open(pathfmt) as fp: + while True: + data = fp.read(size) + if not data: + break + for _, h in hashes: + h.update(data) + + for key, h in hashes: + pathfmt.kwdict[key] = h.hexdigest() + + if self.filename: + pathfmt.build_path() + + def _open(self, pathfmt): + try: + return open(pathfmt.temppath, "rb") + except OSError: + return open(pathfmt.realpath, "rb") + + +__postprocessor__ = HashPP diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py index e89b170..3ef9fbc 100644 --- a/gallery_dl/postprocessor/metadata.py +++ b/gallery_dl/postprocessor/metadata.py @@ -103,10 +103,10 @@ class MetadataPP(PostProcessor): job.register_hooks({event: self.run for event in events}, options) self._init_archive(job, options, "_MD_") + self.filter = self._make_filter(options) self.mtime = options.get("mtime") self.omode = options.get("open", omode) self.encoding = options.get("encoding", "utf-8") - self.private = options.get("private", False) self.skip = options.get("skip", False) def run(self, pathfmt): @@ -114,7 +114,10 @@ class MetadataPP(PostProcessor): if archive and archive.check(pathfmt.kwdict): return - directory = self._directory(pathfmt) + if util.WINDOWS and pathfmt.extended: + directory = pathfmt._extended_path(self._directory(pathfmt)) + else: + directory = self._directory(pathfmt) path = directory + self._filename(pathfmt) if self.skip and os.path.exists(path): @@ -231,10 +234,33 @@ class MetadataPP(PostProcessor): fp.write("\n".join(tags) + "\n") def _write_json(self, fp, kwdict): - if not self.private: - kwdict = util.filter_dict(kwdict) + if self.filter: + kwdict = self.filter(kwdict) fp.write(self._json_encode(kwdict) + "\n") + def _make_filter(self, options): + include = options.get("include") + if include: + if isinstance(include, str): + include = include.split(",") + return lambda d: {k: d[k] for k in include if k in d} + + exclude = options.get("exclude") + private = options.get("private") + if exclude: + if isinstance(exclude, str): + exclude = exclude.split(",") + exclude = set(exclude) + + if private: + return lambda d: {k: v for k, v in d.items() + if k not in exclude} + return lambda d: {k: v for k, v in util.filter_dict(d).items() + if k not in exclude} + + if not private: + return util.filter_dict + @staticmethod def _make_encoder(options, indent=None): return json.JSONEncoder( diff --git 
a/gallery_dl/postprocessor/rename.py b/gallery_dl/postprocessor/rename.py new file mode 100644 index 0000000..f71738d --- /dev/null +++ b/gallery_dl/postprocessor/rename.py @@ -0,0 +1,91 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Rename files""" + +from .common import PostProcessor +from .. import formatter +import os + + +class RenamePP(PostProcessor): + + def __init__(self, job, options): + PostProcessor.__init__(self, job) + + self.skip = options.get("skip", True) + old = options.get("from") + new = options.get("to") + + if old: + self._old = self._apply_format(old) + self._new = (self._apply_format(new) if new else + self._apply_pathfmt) + job.register_hooks({ + "prepare": self.rename_from, + }, options) + + elif new: + self._old = self._apply_pathfmt + self._new = self._apply_format(new) + job.register_hooks({ + "skip" : self.rename_to_skip, + "prepare-after": self.rename_to_pafter, + }, options) + + else: + raise ValueError("Option 'from' or 'to' is required") + + def rename_from(self, pathfmt): + name_old = self._old(pathfmt) + path_old = pathfmt.realdirectory + name_old + + if os.path.exists(path_old): + name_new = self._new(pathfmt) + path_new = pathfmt.realdirectory + name_new + self._rename(path_old, name_old, path_new, name_new) + + def rename_to_skip(self, pathfmt): + name_old = self._old(pathfmt) + path_old = pathfmt.realdirectory + name_old + + if os.path.exists(path_old): + pathfmt.filename = name_new = self._new(pathfmt) + pathfmt.path = pathfmt.directory + name_new + pathfmt.realpath = path_new = pathfmt.realdirectory + name_new + self._rename(path_old, name_old, path_new, name_new) + + def rename_to_pafter(self, pathfmt): + pathfmt.filename = name_new = self._new(pathfmt) + pathfmt.path = pathfmt.directory + name_new + pathfmt.realpath = pathfmt.realdirectory + name_new + pathfmt.kwdict["_file_recheck"] = True + + def _rename(self, path_old, name_old, path_new, name_new): + if self.skip and os.path.exists(path_new): + return self.log.warning( + "Not renaming '%s' to '%s' since another file with the " + "same name exists", name_old, name_new) + + self.log.info("'%s' -> '%s'", name_old, name_new) + os.replace(path_old, path_new) + + def _apply_pathfmt(self, pathfmt): + return pathfmt.build_filename(pathfmt.kwdict) + + def _apply_format(self, format_string): + fmt = formatter.parse(format_string).format_map + + def apply(pathfmt): + return pathfmt.clean_path(pathfmt.clean_segment(fmt( + pathfmt.kwdict))) + + return apply + + +__postprocessor__ = RenamePP diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py index 9e60ce2..f053afa 100644 --- a/gallery_dl/postprocessor/ugoira.py +++ b/gallery_dl/postprocessor/ugoira.py @@ -36,7 +36,8 @@ class UgoiraPP(PostProcessor): self.delete = not options.get("keep-files", False) self.repeat = options.get("repeat-last-frame", True) self.mtime = options.get("mtime", True) - self.uniform = False + self.skip = options.get("skip", True) + self.uniform = self._convert_zip = self._convert_files = False ffmpeg = options.get("ffmpeg-location") self.ffmpeg = util.expand_path(ffmpeg) if ffmpeg else "ffmpeg" @@ -90,33 +91,44 @@ class UgoiraPP(PostProcessor): if self.prevent_odd: args += ("-vf", "crop=iw-mod(iw\\,2):ih-mod(ih\\,2)") - job.register_hooks( - {"prepare": self.prepare, "file": self.convert}, 
options) + job.register_hooks({ + "prepare": self.prepare, + "file" : self.convert_zip, + "after" : self.convert_files, + }, options) def prepare(self, pathfmt): - self._frames = None - - if pathfmt.extension != "zip": + if "frames" not in pathfmt.kwdict: + self._frames = None return - kwdict = pathfmt.kwdict - if "frames" in kwdict: - self._frames = kwdict["frames"] - elif "pixiv_ugoira_frame_data" in kwdict: - self._frames = kwdict["pixiv_ugoira_frame_data"]["data"] + self._frames = pathfmt.kwdict["frames"] + if pathfmt.extension == "zip": + self._convert_zip = True + if self.delete: + pathfmt.set_extension(self.extension) + pathfmt.build_path() else: - return - - if self.delete: - pathfmt.set_extension(self.extension) pathfmt.build_path() + index = pathfmt.kwdict["_ugoira_frame_index"] + frame = self._frames[index].copy() + frame["index"] = index + frame["path"] = pathfmt.realpath + frame["ext"] = pathfmt.kwdict["extension"] + + if not index: + self._files = [frame] + else: + self._files.append(frame) + if len(self._files) >= len(self._frames): + self._convert_files = True - def convert(self, pathfmt): - if not self._frames: + def convert_zip(self, pathfmt): + if not self._convert_zip: return + self._convert_zip = False with tempfile.TemporaryDirectory() as tempdir: - # extract frames try: with zipfile.ZipFile(pathfmt.temppath) as zfile: zfile.extractall(tempdir) @@ -124,53 +136,89 @@ class UgoiraPP(PostProcessor): pathfmt.realpath = pathfmt.temppath return - # process frames and collect command-line arguments - pathfmt.set_extension(self.extension) - pathfmt.build_path() - - args = self._process(pathfmt, tempdir) - if self.args_pp: - args += self.args_pp - if self.args: - args += self.args - - # ensure target directory exists - os.makedirs(pathfmt.realdirectory, exist_ok=True) - - # invoke ffmpeg - try: - if self.twopass: - if "-f" not in self.args: - args += ("-f", self.extension) - args += ("-passlogfile", tempdir + "/ffmpeg2pass", "-pass") - self._exec(args + ["1", "-y", os.devnull]) - self._exec(args + ["2", pathfmt.realpath]) - else: - args.append(pathfmt.realpath) - self._exec(args) - if self._finalize: - self._finalize(pathfmt, tempdir) - except OSError as exc: - print() - self.log.error("Unable to invoke FFmpeg (%s: %s)", - exc.__class__.__name__, exc) - pathfmt.realpath = pathfmt.temppath - except Exception as exc: - print() - self.log.error("%s: %s", exc.__class__.__name__, exc) - self.log.debug("", exc_info=True) - pathfmt.realpath = pathfmt.temppath - else: - if self.mtime: - mtime = pathfmt.kwdict.get("_mtime") - if mtime: - util.set_mtime(pathfmt.realpath, mtime) + if self.convert(pathfmt, tempdir): if self.delete: pathfmt.delete = True else: + self.log.info(pathfmt.filename) pathfmt.set_extension("zip") pathfmt.build_path() + def convert_files(self, pathfmt): + if not self._convert_files: + return + self._convert_files = False + + with tempfile.TemporaryDirectory() as tempdir: + for frame in self._files: + + # update frame filename extension + frame["file"] = name = "{}.{}".format( + frame["file"].partition(".")[0], frame["ext"]) + + # move frame into tempdir + try: + self._copy_file(frame["path"], tempdir + "/" + name) + except OSError as exc: + self.log.debug("Unable to copy frame %s (%s: %s)", + name, exc.__class__.__name__, exc) + return + + pathfmt.kwdict["num"] = 0 + self._frames = self._files + if self.convert(pathfmt, tempdir): + self.log.info(pathfmt.filename) + if self.delete: + self.log.debug("Deleting frames") + for frame in self._files: + 
util.remove_file(frame["path"]) + + def convert(self, pathfmt, tempdir): + pathfmt.set_extension(self.extension) + pathfmt.build_path() + if self.skip and pathfmt.exists(): + return True + + # process frames and collect command-line arguments + args = self._process(pathfmt, tempdir) + if self.args_pp: + args += self.args_pp + if self.args: + args += self.args + + # ensure target directory exists + os.makedirs(pathfmt.realdirectory, exist_ok=True) + + # invoke ffmpeg + try: + if self.twopass: + if "-f" not in self.args: + args += ("-f", self.extension) + args += ("-passlogfile", tempdir + "/ffmpeg2pass", "-pass") + self._exec(args + ["1", "-y", os.devnull]) + self._exec(args + ["2", pathfmt.realpath]) + else: + args.append(pathfmt.realpath) + self._exec(args) + if self._finalize: + self._finalize(pathfmt, tempdir) + except OSError as exc: + print() + self.log.error("Unable to invoke FFmpeg (%s: %s)", + exc.__class__.__name__, exc) + pathfmt.realpath = pathfmt.temppath + except Exception as exc: + print() + self.log.error("%s: %s", exc.__class__.__name__, exc) + self.log.debug("", exc_info=True) + pathfmt.realpath = pathfmt.temppath + else: + if self.mtime: + mtime = pathfmt.kwdict.get("_mtime") + if mtime: + util.set_mtime(pathfmt.realpath, mtime) + return True + def _exec(self, args): self.log.debug(args) out = None if self.output else subprocess.DEVNULL @@ -182,6 +230,9 @@ class UgoiraPP(PostProcessor): raise ValueError() return retcode + def _copy_file(self, src, dst): + shutil.copyfile(src, dst) + def _process_concat(self, pathfmt, tempdir): rate_in, rate_out = self.calculate_framerate(self._frames) args = [self.ffmpeg, "-f", "concat"] diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 5744ef3..ecb496d 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -101,7 +101,7 @@ def raises(cls): return wrap -def identity(x): +def identity(x, _=None): """Returns its argument""" return x @@ -520,14 +520,9 @@ class CustomNone(): """None-style type that supports more operations than regular None""" __slots__ = () - def __getattribute__(self, _): - return self - - def __getitem__(self, _): - return self - - def __iter__(self): - return self + __getattribute__ = identity + __getitem__ = identity + __iter__ = identity def __call__(self, *args, **kwargs): return self @@ -536,10 +531,6 @@ class CustomNone(): def __next__(): raise StopIteration - @staticmethod - def __bool__(): - return False - def __eq__(self, other): return self is other @@ -550,14 +541,48 @@ class CustomNone(): __le__ = true __gt__ = false __ge__ = false + __bool__ = false + + __add__ = identity + __sub__ = identity + __mul__ = identity + __matmul__ = identity + __truediv__ = identity + __floordiv__ = identity + __mod__ = identity + + __radd__ = identity + __rsub__ = identity + __rmul__ = identity + __rmatmul__ = identity + __rtruediv__ = identity + __rfloordiv__ = identity + __rmod__ = identity + + __lshift__ = identity + __rshift__ = identity + __and__ = identity + __xor__ = identity + __or__ = identity + + __rlshift__ = identity + __rrshift__ = identity + __rand__ = identity + __rxor__ = identity + __ror__ = identity + + __neg__ = identity + __pos__ = identity + __abs__ = identity + __invert__ = identity @staticmethod def __len__(): return 0 - @staticmethod - def __hash__(): - return 0 + __int__ = __len__ + __hash__ = __len__ + __index__ = __len__ @staticmethod def __format__(_): diff --git a/gallery_dl/version.py b/gallery_dl/version.py index f2462ee..0f9f91b 100644 --- a/gallery_dl/version.py +++ 
b/gallery_dl/version.py @@ -6,5 +6,5 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.27.3" +__version__ = "1.27.4" __variant__ = None diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py index d4fdedc..fe88c2c 100644 --- a/gallery_dl/ytdl.py +++ b/gallery_dl/ytdl.py @@ -18,7 +18,7 @@ def import_module(module_name): if module_name is None: try: return __import__("yt_dlp") - except ImportError: + except (ImportError, SyntaxError): return __import__("youtube_dl") return __import__(module_name.replace("-", "_"))
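
Note on the new "A" (arithmetic) format specifier registered in gallery_dl/formatter.py above — a minimal usage sketch, assuming gallery-dl 1.27.4+ is installed and using the formatter's existing "/"-separated specifier syntax; the field name "num" is this example's choice:

    # "A+5" adds 5 to the field value; the spec after "/" is an
    # ordinary format spec applied to the arithmetic result.
    from gallery_dl import formatter

    fmt = formatter.parse("{num:A+5/>03}").format_map
    print(fmt({"num": 7}))   # 7 + 5 = 12, zero-padded -> "012"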

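Note on the new "hash" post-processor (gallery_dl/postprocessor/hash.py above) — a hedged config sketch, written as a Python dict since gallery-dl reads the same structure from its JSON config file; the algorithms and the "b2" key are this example's choices, not defaults:

    # "hashes" accepts "<hashlib name>[:<metadata key>]" pairs; without an
    # explicit key the digest is stored under the algorithm name itself.
    # Omitting "hashes" falls back to md5 and sha1.
    postprocessor = {
        "name"  : "hash",
        "hashes": "sha256,blake2b:b2",   # -> kwdict["sha256"], kwdict["b2"]
        "event" : "file",                # run once per downloaded file
    }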