Diffstat (limited to 'gallery_dl')
 -rw-r--r--  gallery_dl/downloader/http.py      |  16
 -rw-r--r--  gallery_dl/extractor/2chen.py      |  16
 -rw-r--r--  gallery_dl/extractor/__init__.py   |   1
 -rw-r--r--  gallery_dl/extractor/artstation.py |  11
 -rw-r--r--  gallery_dl/extractor/bcy.py        |  16
 -rw-r--r--  gallery_dl/extractor/bunkr.py      |   8
 -rw-r--r--  gallery_dl/extractor/danbooru.py   |   1
 -rw-r--r--  gallery_dl/extractor/fapachi.py    |  85
 -rw-r--r--  gallery_dl/extractor/hitomi.py     |  13
 -rw-r--r--  gallery_dl/extractor/hotleak.py    |   1
 -rw-r--r--  gallery_dl/extractor/imagehosts.py |  19
 -rw-r--r--  gallery_dl/extractor/inkbunny.py   |  36
 -rw-r--r--  gallery_dl/extractor/itaku.py      |  10
 -rw-r--r--  gallery_dl/extractor/lolisafe.py   |  22
 -rw-r--r--  gallery_dl/extractor/nitter.py     | 314
 -rw-r--r--  gallery_dl/extractor/patreon.py    |  70
 -rw-r--r--  gallery_dl/extractor/pixiv.py      |   2
 -rw-r--r--  gallery_dl/extractor/reddit.py     |   5
 -rw-r--r--  gallery_dl/extractor/twitter.py    |  38
 -rw-r--r--  gallery_dl/extractor/weibo.py      |  45
 -rw-r--r--  gallery_dl/formatter.py            |  15
 -rw-r--r--  gallery_dl/job.py                  |  10
 -rw-r--r--  gallery_dl/path.py                 |   2
 -rw-r--r--  gallery_dl/version.py              |   2
24 files changed, 596 insertions(+), 162 deletions(-)
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index 26eb7b5..4037420 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -37,6 +37,7 @@ class HttpDownloader(DownloaderBase):
         self.minsize = self.config("filesize-min")
         self.maxsize = self.config("filesize-max")
         self.retries = self.config("retries", extractor._retries)
+        self.retry_codes = self.config("retry-codes")
         self.timeout = self.config("timeout", extractor._timeout)
         self.verify = self.config("verify", extractor._verify)
         self.mtime = self.config("mtime", True)
@@ -44,6 +45,8 @@ class HttpDownloader(DownloaderBase):
 
         if self.retries < 0:
             self.retries = float("inf")
+        if self.retry_codes is None:
+            self.retry_codes = [429]
         if self.minsize:
             minsize = text.parse_bytes(self.minsize)
             if not minsize:
@@ -74,6 +77,8 @@ class HttpDownloader(DownloaderBase):
             self.log.warning("Invalid rate limit (%r)", self.rate)
         if self.progress is not None:
             self.receive = self._receive_rate
+            if self.progress < 0.0:
+                self.progress = 0.0
 
     def download(self, url, pathfmt):
         try:
@@ -96,6 +101,13 @@ class HttpDownloader(DownloaderBase):
         adjust_extension = kwdict.get(
             "_http_adjust_extension", self.adjust_extension)
 
+        codes = kwdict.get("_http_retry_codes")
+        if codes:
+            retry_codes = self.retry_codes.copy()
+            retry_codes += codes
+        else:
+            retry_codes = self.retry_codes
+
         if self.part and not metadata:
             pathfmt.part_enable(self.partdir)
 
@@ -156,7 +168,7 @@ class HttpDownloader(DownloaderBase):
                     break
             else:
                 msg = "'{} {}' for '{}'".format(code, response.reason, url)
-                if code == 429 or 500 <= code < 600:  # Server Error
+                if code in retry_codes or 500 <= code < 600:
                     continue
                 self.log.warning(msg)
                 return False
@@ -295,7 +307,7 @@ class HttpDownloader(DownloaderBase):
                     write(data)
 
                 if progress is not None:
-                    if time_elapsed >= progress:
+                    if time_elapsed > progress:
                         self.out.progress(
                             bytes_total,
                             bytes_start + bytes_downloaded,
diff --git a/gallery_dl/extractor/2chen.py b/gallery_dl/extractor/2chen.py
index 8fffeb0..76a085a 100644
--- a/gallery_dl/extractor/2chen.py
+++ b/gallery_dl/extractor/2chen.py
@@ -16,13 +16,15 @@ class _2chenThreadExtractor(Extractor):
     subcategory = "thread"
     directory_fmt = ("{category}", "{board}", "{thread} {title}")
     filename_fmt = "{time} {filename}.{extension}"
-    archive_fmt = "{board}_{thread}_{hash}"
+    archive_fmt = "{board}_{thread}_{hash}_{time}"
     root = "https://2chen.moe"
     pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)/(\d+)"
     test = (
-        ("https://2chen.moe/jp/303786", {
-            "count": ">= 10",
+        ("https://2chen.moe/tv/496715", {
+            "count": ">= 179",
         }),
+        # 404
+        ("https://2chen.moe/jp/303786"),
     )
 
     def __init__(self, match):
@@ -31,7 +33,7 @@ class _2chenThreadExtractor(Extractor):
 
     def items(self):
        url = "{}/{}/{}".format(self.root, self.board, self.thread)
-        page = self.request(url, encoding="utf-8").text
+        page = self.request(url, encoding="utf-8", notfound="thread").text
         data = self.metadata(page)
         yield Message.Directory, data
         for post in self.posts(page):
@@ -66,7 +68,7 @@ class _2chenThreadExtractor(Extractor):
                 "%d %b %Y (%a) %H:%M:%S"
             ),
             "no"  : extr('href="#p', '"'),
-            "url" : extr('</span><a href="', '"'),
+            "url" : extr('</a><a href="', '"'),
             "filename": text.unescape(extr('download="', '"')),
             "hash"    : extr('data-hash="', '"'),
         }
@@ -77,7 +79,7 @@ class _2chenBoardExtractor(Extractor):
     category = "2chen"
     subcategory = "board"
     root = "https://2chen.moe"
-    pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)(?:/catalog)?/?$"
+    pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)(?:/catalog|/?$)"
     test = (
         ("https://2chen.moe/co/", {
             "pattern": _2chenThreadExtractor.pattern
@@ -92,7 +94,7 @@ class _2chenBoardExtractor(Extractor):
 
     def items(self):
         url = "{}/{}/catalog".format(self.root, self.board)
-        page = self.request(url).text
+        page = self.request(url, notfound="board").text
         data = {"_extractor": _2chenThreadExtractor}
         for thread in text.extract_iter(
                 page, '<figure><a href="', '"'):
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index a563bfd..d2bbcbb 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -39,6 +39,7 @@ modules = [
     "fallenangels",
     "fanbox",
     "fantia",
+    "fapachi",
     "flickr",
     "furaffinity",
     "fuskator",
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index da2d8f2..a3a7c1e 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -10,6 +10,7 @@
 
 from .common import Extractor, Message
 from .. import text, util, exception
+import itertools
 import random
 import string
 
@@ -31,7 +32,12 @@ class ArtstationExtractor(Extractor):
 
     def items(self):
         data = self.metadata()
-        for project in self.projects():
+        projects = self.projects()
+        max_posts = self.config("max-posts")
+        if max_posts:
+            projects = itertools.islice(projects, max_posts)
+
+        for project in projects:
             for num, asset in enumerate(
                     self.get_project_assets(project["hash_id"]), 1):
                 asset.update(data)
@@ -356,7 +362,8 @@ class ArtstationSearchExtractor(ArtstationExtractor):
             "page"     : None,
             "per_page" : 50,
             "sorting"  : self.sorting,
-            "pro_first": "1",
+            "pro_first": ("1" if self.config("pro-first", True) else
+                          "0"),
             "filters"  : filters,
             "additional_fields": (),
         }
diff --git a/gallery_dl/extractor/bcy.py b/gallery_dl/extractor/bcy.py
index 4eb446d..44d6065 100644
--- a/gallery_dl/extractor/bcy.py
+++ b/gallery_dl/extractor/bcy.py
@@ -96,11 +96,13 @@ class BcyExtractor(Extractor):
     def _data_from_post(self, post_id):
         url = "{}/item/detail/{}".format(self.root, post_id)
         page = self.request(url, notfound="post").text
-        return json.loads(
-            text.extr(page, 'JSON.parse("', '");')
-            .replace('\\\\u002F', '/')
-            .replace('\\"', '"')
-        )["detail"]
+        data = (text.extr(page, 'JSON.parse("', '");')
+                .replace('\\\\u002F', '/')
+                .replace('\\"', '"'))
+        try:
+            return json.loads(data)["detail"]
+        except ValueError:
+            return json.loads(data.replace('\\"', '"'))["detail"]
 
 
 class BcyUserExtractor(BcyExtractor):
@@ -187,6 +189,10 @@ class BcyPostExtractor(BcyExtractor):
         ("https://bcy.net/item/detail/6747523535150783495", {
             "count": 0,
         }),
+        # JSON decode error (#3321)
+        ("https://bcy.net/item/detail/7166939271872388110", {
+            "count": 0,
+        }),
     )
 
     def posts(self):
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index dde9cf8..7e9a422 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -78,11 +78,15 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
                 self.root = self.root.replace("bunkr", "app.bunkr", 1)
                 return self._fetch_album_api(album_id)
 
+        headers = {"Referer": "https://stream.bunkr.is/"}
+
         for file in files:
             name = file["name"]
             cdn = file["cdn"]
-            if name.endswith((".mp4", ".m4v", ".mov")):
-                cdn = cdn.replace("//cdn", "//media-files")
+            if name.endswith((".mp4", ".m4v", ".mov", ".webm",
+                              ".zip", ".rar", ".7z")):
+                cdn = cdn.replace("//cdn", "//media-files", 1)
+                file["_http_headers"] = headers
             file["file"] = cdn + "/" + name
 
         return files, {
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 906afda..5a44780 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -145,7 +145,6 @@ class DanbooruExtractor(BaseExtractor):
         ).json()["media_metadata"]["metadata"]
 
         ext = data["ZIP:ZipFileName"].rpartition(".")[2]
-        print(post["id"], ext)
         fmt = ("{:>06}." + ext).format
         delays = data["Ugoira:FrameDelays"]
         return [{"file": fmt(index), "delay": delay}
diff --git a/gallery_dl/extractor/fapachi.py b/gallery_dl/extractor/fapachi.py
new file mode 100644
index 0000000..ee6d15a
--- /dev/null
+++ b/gallery_dl/extractor/fapachi.py
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://fapachi.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class FapachiPostExtractor(Extractor):
+    """Extractor for individual posts on fapachi.com"""
+    category = "fapachi"
+    subcategory = "post"
+    directory_fmt = ("{category}", "{user}")
+    filename_fmt = "{user}_{id}.{extension}"
+    archive_fmt = "{user}_{id}"
+    pattern = (r"(?:https?://)?(?:www\.)?fapachi\.com"
+               r"/(?!search/)([^/?#]+)/media/(\d+)")
+    root = "https://fapachi.com"
+    test = (
+        # NSFW
+        ("https://fapachi.com/sonson/media/0082", {
+            "pattern": (r"https://fapachi\.com/models/s/o/"
+                        r"sonson/1/full/sonson_0082\.jpeg"),
+            "keyword": {
+                "user": "sonson",
+                "id"  : "0082",
+            },
+        }),
+        # NSFW
+        ("https://fapachi.com/ferxiita/media/0159"),
+    )
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.user, self.id = match.groups()
+
+    def items(self):
+        data = {
+            "user": self.user,
+            "id"  : self.id,
+        }
+        page = self.request("{}/{}/media/{}".format(
+            self.root, self.user, self.id)).text
+        url = self.root + text.extr(page, 'd-block" src="', '"')
+        yield Message.Directory, data
+        yield Message.Url, url, text.nameext_from_url(url, data)
+
+
+class FapachiUserExtractor(Extractor):
+    """Extractor for all posts from a fapachi user"""
+    category = "fapachi"
+    subcategory = "user"
+    pattern = (r"(?:https?://)?(?:www\.)?fapachi\.com"
+               r"/(?!search(?:/|$))([^/?#]+)(?:/page/(\d+))?$")
+    root = "https://fapachi.com"
+    test = (
+        ("https://fapachi.com/sonson", {
+            "pattern": FapachiPostExtractor.pattern,
+            "range"  : "1-50",
+            "count"  : 50,
+        }),
+        ("https://fapachi.com/ferxiita/page/3"),
+    )
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.user = match.group(1)
+        self.num = text.parse_int(match.group(2), 1)
+
+    def items(self):
+        data = {"_extractor": FapachiPostExtractor}
+        while True:
+            page = self.request("{}/{}/page/{}".format(
+                self.root, self.user, self.num)).text
+            for post in text.extract_iter(page, 'model-media-prew">', ">"):
+                url = self.root + text.extr(post, '<a href="', '"')
+                yield Message.Queue, url, data
+
+            if '">Next page</a>' not in page:
+                return
+            self.num += 1
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index cc110aa..44459ce 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -115,16 +115,17 @@ class HitomiGalleryExtractor(GalleryExtractor):
 
         fmt = self.config("format") or "webp"
         if fmt == "original":
-            subdomain, fmt, ext, check = "b", "images", None, False
+            subdomain, path, ext, check = "b", "images", None, False
         else:
-            subdomain, ext, check = "a", fmt, True
+            subdomain, path, ext, check = "a", fmt, fmt, (fmt != "webp")
 
         result = []
         for image in self.info["files"]:
             if check:
-                if not image.get("has" + fmt):
-                    fmt = ext = "webp"
-                check = False
+                if image.get("has" + fmt):
+                    path = ext = fmt
+                else:
+                    path = ext = "webp"
 
             ihash = image["hash"]
             idata = text.nameext_from_url(image["name"])
             if ext:
@@ -134,7 +135,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
             inum = int(ihash[-1] + ihash[-3:-1], 16)
             url = "https://{}{}.hitomi.la/{}/{}/{}/{}.{}".format(
                 chr(97 + gg_m.get(inum, gg_default)),
-                subdomain, fmt, gg_b, inum, ihash, idata["extension"],
+                subdomain, path, gg_b, inum, ihash, idata["extension"],
             )
             result.append((url, idata))
         return result
diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py
index 01ad38c..eb64db0 100644
--- a/gallery_dl/extractor/hotleak.py
+++ b/gallery_dl/extractor/hotleak.py
@@ -146,6 +146,7 @@ class HotleakCreatorExtractor(HotleakExtractor):
                     self.wait(
                         until=exc.response.headers.get("X-RateLimit-Reset"))
                     continue
+                raise
 
             posts = response.json()
             if not posts:
diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py
index 622509f..6fcfc55 100644
--- a/gallery_dl/extractor/imagehosts.py
+++ b/gallery_dl/extractor/imagehosts.py
@@ -242,6 +242,25 @@ class PixhostImageExtractor(ImagehostImageExtractor):
         return url, filename
 
 
+class PixhostGalleryExtractor(ImagehostImageExtractor):
+    """Extractor for image galleries from pixhost.to"""
+    category = "pixhost"
+    subcategory = "gallery"
+    pattern = (r"(?:https?://)?((?:www\.)?pixhost\.(?:to|org)"
+               r"/gallery/([^/?#]+))")
+    test = ("https://pixhost.to/gallery/jSMFq", {
+        "pattern": PixhostImageExtractor.pattern,
+        "count": 3,
+    })
+
+    def items(self):
+        page = text.extr(self.request(
+            self.page_url).text, 'class="images"', "</div>")
+        data = {"_extractor": PixhostImageExtractor}
+        for url in text.extract_iter(page, '<a href="', '"'):
+            yield Message.Queue, url, data
+
+
 class PostimgImageExtractor(ImagehostImageExtractor):
     """Extractor for single images from postimages.org"""
     category = "postimg"
diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py
index 706cd34..83a1a19 100644
--- a/gallery_dl/extractor/inkbunny.py
+++ b/gallery_dl/extractor/inkbunny.py
@@ -30,10 +30,12 @@ class InkbunnyExtractor(Extractor):
 
     def items(self):
         self.api.authenticate()
+        metadata = self.metadata()
         to_bool = ("deleted", "favorite", "friends_only", "guest_block",
                    "hidden", "public", "scraps")
 
         for post in self.posts():
+            post.update(metadata)
             post["date"] = text.parse_datetime(
                 post["create_datetime"] + "00", "%Y-%m-%d %H:%M:%S.%f%z")
             post["tags"] = [kw["keyword_name"] for kw in post["keywords"]]
@@ -60,6 +62,12 @@ class InkbunnyExtractor(Extractor):
                 url += "?sid=" + self.api.session_id
             yield Message.Url, url, post
 
+    def posts(self):
+        return ()
+
+    def metadata(self):
+        return ()
+
 
 class InkbunnyUserExtractor(InkbunnyExtractor):
     """Extractor for inkbunny user profiles"""
@@ -144,6 +152,7 @@ class InkbunnyPoolExtractor(InkbunnyExtractor):
     test = (
         ("https://inkbunny.net/poolview_process.php?pool_id=28985", {
             "count": 9,
+            "keyword": {"pool_id": "28985"},
         }),
         ("https://inkbunny.net/submissionsviewall.php?rid=ffffffffff"
          "&mode=pool&pool_id=28985&page=1&orderby=pool_order&random=no"),
@@ -160,6 +169,9 @@ class InkbunnyPoolExtractor(InkbunnyExtractor):
         self.pool_id = params.get("pool_id")
         self.orderby = params.get("orderby", "pool_order")
 
+    def metadata(self):
+        return {"pool_id": self.pool_id}
+
     def posts(self):
         params = {
             "pool_id": self.pool_id,
@@ -179,6 +191,7 @@ class InkbunnyFavoriteExtractor(InkbunnyExtractor):
             "pattern": r"https://[\w.]+\.metapix\.net/files/full"
                        r"/\d+/\d+_\w+_.+",
             "range": "20-50",
+            "keyword": {"favs_user_id": "20969"},
         }),
         ("https://inkbunny.net/submissionsviewall.php?rid=ffffffffff"
          "&mode=userfavs&random=no&orderby=fav_datetime&page=1&user_id=20969"),
@@ -195,6 +208,9 @@ class InkbunnyFavoriteExtractor(InkbunnyExtractor):
         self.user_id = params.get("user_id")
         self.orderby = params.get("orderby", "fav_datetime")
 
+    def metadata(self):
+        return {"favs_user_id": self.user_id}
+
     def posts(self):
         params = {
             "favs_user_id": self.user_id,
@@ -216,14 +232,30 @@ class InkbunnySearchExtractor(InkbunnyExtractor):
          "&favsby=&type=&days=&keyword_id=&user_id=&random=&md5="), {
             "range": "1-10",
             "count": 10,
+            "keyword": {
+                "search": {
+                    "rid": "ffffffffff",
+                    "mode": "search",
+                    "page": "1",
+                    "orderby": "create_datetime",
+                    "text": "cute",
+                    "stringtype": "and",
+                    "keywords": "yes",
+                    "title": "yes",
+                    "description": "no",
+                },
+            },
         })
 
     def __init__(self, match):
         InkbunnyExtractor.__init__(self, match)
-        self.query = match.group(1)
+        self.params = text.parse_query(match.group(1))
+
+    def metadata(self):
+        return {"search": self.params}
 
     def posts(self):
-        params = text.parse_query(self.query)
+        params = self.params.copy()
         pop = params.pop
         pop("rid", None)
diff --git a/gallery_dl/extractor/itaku.py b/gallery_dl/extractor/itaku.py
index 00a32cd..4bcedae 100644
--- a/gallery_dl/extractor/itaku.py
+++ b/gallery_dl/extractor/itaku.py
@@ -98,7 +98,7 @@ class ItakuImageExtractor(ItakuExtractor):
             "image": "https://d1wmr8tlk3viaj.cloudfront.net/gallery_imgs"
                      "/220504_oUNIAFT.png",
             "image_xl": "https://d1wmr8tlk3viaj.cloudfront.net"
-                        "/gallery_imgs/220504_oUNIAFT/xl.jpg",
+                        "/gallery_imgs/220504_oUNIAFT/lg.jpg",
             "liked_by_you": False,
             "maturity_rating": "SFW",
             "num_comments": int,
@@ -107,7 +107,7 @@ class ItakuImageExtractor(ItakuExtractor):
             "obj_tags": 136446,
             "owner": 16775,
             "owner_avatar": "https://d1wmr8tlk3viaj.cloudfront.net"
-                            "/profile_pics/av2022r_vKYVywc/sm.jpg",
+                            "/profile_pics/av2022r_vKYVywc/md.jpg",
             "owner_displayname": "Piku",
             "owner_username": "piku",
             "reshared_by_you": False,
@@ -115,8 +115,8 @@ class ItakuImageExtractor(ItakuExtractor):
             "tags": list,
             "tags_character": ["hatsune_miku"],
             "tags_copyright": ["vocaloid"],
-            "tags_general" : ["female", "green_eyes", "twintails",
-                              "green_hair", "gloves", "flag",
+            "tags_general" : ["twintails", "green_hair", "flag",
+                              "gloves", "green_eyes", "female",
                               "racing_miku"],
             "title": "Racing Miku 2022 Ver.",
             "too_mature": False,
@@ -153,7 +153,7 @@ class ItakuAPI():
             "owner"     : self.user(username)["owner"],
             "section"   : section,
             "date_range": "",
-            "maturity_rating": ("SFW", "Questionable", "NSFW", "Extreme"),
+            "maturity_rating": ("SFW", "Questionable", "NSFW"),
             "ordering"  : "-date_added",
             "page"      : "1",
             "page_size" : "30",
diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py
index 14d4efb..9caf6d7 100644
--- a/gallery_dl/extractor/lolisafe.py
+++ b/gallery_dl/extractor/lolisafe.py
@@ -20,10 +20,6 @@ class LolisafeExtractor(BaseExtractor):
 
 
 BASE_PATTERN = LolisafeExtractor.update({
-    "zzzz" : {
-        "root": "https://zz.ht",
-        "pattern": r"zz\.(?:ht|fo)",
-    },
     "xbunkr": {
         "root": "https://xbunkr.com",
         "pattern": r"xbunkr\.com",
@@ -35,15 +31,6 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
     subcategory = "album"
     pattern = BASE_PATTERN + "/a/([^/?#]+)"
     test = (
-        ("https://zz.ht/a/lop7W6EZ", {
-            "pattern": r"https://z\.zz\.fo/(4anuY|ih560)\.png",
-            "count": 2,
-            "keyword": {
-                "album_id": "lop7W6EZ",
-                "album_name": "ferris",
-            },
-        }),
-        ("https://zz.fo/a/lop7W6EZ"),
         ("https://xbunkr.com/a/TA0bu3F4", {
             "pattern": r"https://media\.xbunkr\.com/[^.]+\.\w+",
             "count": 861,
@@ -71,11 +58,10 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
         yield Message.Directory, data
         for data["num"], file in enumerate(files, 1):
             url = file["file"]
-            if "_fallback" in file:
-                data["_fallback"] = file["_fallback"]
-            text.nameext_from_url(url, data)
-            data["name"], sep, data["id"] = data["filename"].rpartition("-")
-            yield Message.Url, url, data
+            file.update(data)
+            text.nameext_from_url(url, file)
+            file["name"], sep, file["id"] = file["filename"].rpartition("-")
+            yield Message.Url, url, file
 
     def fetch_album(self, album_id):
         url = "{}/api/album/get/{}".format(self.root, album_id)
diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py
index 1ba8253..dfe78ae 100644
--- a/gallery_dl/extractor/nitter.py
+++ b/gallery_dl/extractor/nitter.py
@@ -10,6 +10,7 @@
 
 from .common import BaseExtractor, Message
 from .. import text
+import binascii
 
 
 class NitterExtractor(BaseExtractor):
@@ -20,51 +21,102 @@ class NitterExtractor(BaseExtractor):
     archive_fmt = "{tweet_id}_{num}"
 
     def __init__(self, match):
+        self.cookiedomain = self.root.partition("://")[2]
         BaseExtractor.__init__(self, match)
-        self.user = match.group(match.lastindex)
+
+        lastindex = match.lastindex
+        self.user = match.group(lastindex)
+        self.user_id = match.group(lastindex + 1)
+        self.user_obj = None
 
     def items(self):
-        for tweet_html in self.tweets():
-            tweet = self._tweet_from_html(tweet_html)
-
-            attachments_html = tweet.pop("_attach", "")
-            if attachments_html:
-                attachments = list(text.extract_iter(
-                    attachments_html, 'href="', '"'))
-                attachments.extend(text.extract_iter(
-                    attachments_html, 'data-url="', '"'))
+        retweets = self.config("retweets", False)
+        videos = self.config("videos", True)
+        if videos:
+            ytdl = (videos == "ytdl")
+            videos = True
+            self._cookiejar.set("hlsPlayback", "on", domain=self.cookiedomain)
+
+        for tweet in self.tweets():
+
+            if not retweets and tweet["retweet"]:
+                self.log.debug("Skipping %s (retweet)", tweet["tweet_id"])
+                continue
+
+            attachments = tweet.pop("_attach", "")
+            if attachments:
+                files = []
+                append = files.append
+
+                for url in text.extract_iter(
+                        attachments, 'href="', '"'):
+
+                    if "/enc/" in url:
+                        name = binascii.a2b_base64(url.rpartition(
+                            "/")[2]).decode().rpartition("/")[2]
+                    else:
+                        name = url.rpartition("%2F")[2]
+
+                    if url[0] == "/":
+                        url = self.root + url
+
+                    file = {
+                        "url": url,
+                        "_http_retry_codes": (404,),
+                    }
+                    file["filename"], _, file["extension"] = \
+                        name.rpartition(".")
+                    append(file)
+
+                if videos and not files:
+                    if ytdl:
+                        append({
+                            "url": "ytdl:{}/i/status/{}".format(
+                                self.root, tweet["tweet_id"]),
+                            "extension": None,
+                        })
+                    else:
+                        for url in text.extract_iter(
+                                attachments, 'data-url="', '"'):
+
+                            if "/enc/" in url:
+                                name = binascii.a2b_base64(url.rpartition(
+                                    "/")[2]).decode().rpartition("/")[2]
+                            else:
+                                name = url.rpartition("%2F")[2]
+
+                            if url[0] == "/":
+                                url = self.root + url
+
+                            append({
+                                "url"      : "ytdl:" + url,
+                                "filename" : name.rpartition(".")[0],
+                                "extension": "mp4",
+                            })
             else:
-                attachments = ()
-            tweet["count"] = len(attachments)
+                files = ()
+            tweet["count"] = len(files)
 
             yield Message.Directory, tweet
-            for tweet["num"], url in enumerate(attachments, 1):
-                if url[0] == "/":
-                    url = self.root + url
-                if "/video/" in url:
-                    url = "ytdl:" + url
-                    tweet["filename"] = url.rpartition(
-                        "%2F")[2].partition(".")[0]
-                    tweet["extension"] = "mp4"
-                else:
-                    text.nameext_from_url(url, tweet)
-                yield Message.Url, url, tweet
+            for tweet["num"], file in enumerate(files, 1):
+                url = file["url"]
+                file.update(tweet)
+                yield Message.Url, url, file
 
     def _tweet_from_html(self, html):
         extr = text.extract_from(html)
-        user = {
+        author = {
             "name": extr('class="fullname" href="/', '"'),
             "nick": extr('title="', '"'),
         }
         extr('<span class="tweet-date', '')
         link = extr('href="', '"')
         return {
-            "user": user,
-            "date": text.parse_datetime(
+            "author"  : author,
+            "user"    : self.user_obj or author,
+            "date"    : text.parse_datetime(
                 extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),
             "tweet_id": link.rpartition("/")[2].partition("#")[0],
             "content": extr('class="tweet-content', "</div").partition(">")[2],
-            "_attach": extr('class="attachments', 'class="tweet-stats'),
+            "_attach" : extr('class="attachments', 'class="tweet-stats'),
             "comments": text.parse_int(extr(
                 'class="icon-comment', '</div>').rpartition(">")[2]),
             "retweets": text.parse_int(extr(
@@ -73,17 +125,87 @@ class NitterExtractor(BaseExtractor):
                 'class="icon-quote', '</div>').rpartition(">")[2]),
             "likes"   : text.parse_int(extr(
                 'class="icon-heart', '</div>').rpartition(">")[2]),
+            "retweet" : 'class="retweet-header' in html,
+            "quoted"  : False,
+        }
+
+    def _tweet_from_quote(self, html):
+        extr = text.extract_from(html)
+        author = {
+            "name": extr('class="fullname" href="/', '"'),
+            "nick": extr('title="', '"'),
         }
+        extr('<span class="tweet-date', '')
+        link = extr('href="', '"')
+        return {
+            "author"  : author,
+            "user"    : self.user_obj or author,
+            "date"    : text.parse_datetime(
+                extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),
+            "tweet_id": link.rpartition("/")[2].partition("#")[0],
+            "content": extr('class="quote-text', "</div").partition(">")[2],
+            "_attach" : extr('class="attachments', '''
+                </div>'''),
+            "retweet" : False,
+            "quoted"  : True,
+        }
+
+    def _user_from_html(self, html):
+        extr = text.extract_from(html, html.index('class="profile-tabs'))
+        banner = extr('class="profile-banner"><a href="', '"')
+        return {
+            "id"              : banner.split("%2F")[4] if banner else None,
+            "profile_banner"  : self.root + banner if banner else "",
+            "profile_image"   : self.root + extr(
+                'class="profile-card-avatar" href="', '"'),
+            "nick"            : extr('title="', '"'),
+            "name"            : extr('title="@', '"'),
+            "description"     : extr('<p dir="auto">', '<'),
+            "date"            : text.parse_datetime(
+                extr('class="profile-joindate"><span title="', '"'),
+                "%I:%M %p - %d %b %Y"),
+            "statuses_count"  : text.parse_int(extr(
+                'class="profile-stat-num">', '<').replace(",", "")),
+            "friends_count"   : text.parse_int(extr(
+                'class="profile-stat-num">', '<').replace(",", "")),
+            "followers_count" : text.parse_int(extr(
+                'class="profile-stat-num">', '<').replace(",", "")),
+            "favourites_count": text.parse_int(extr(
+                'class="profile-stat-num">', '<').replace(",", "")),
+            "verified"        : 'title="Verified account"' in html,
+        }
+
+    def _extract_quote(self, html):
+        html, _, quote = html.partition('class="quote')
+        if quote:
+            quote, _, tail = quote.partition('class="tweet-published')
+            return (html + tail, quote)
+        return (html, None)
 
     def _pagination(self, path):
-        base_url = url = self.root + path
+        quoted = self.config("quoted", False)
+
+        if self.user_id:
+            self.user = self.request(
+                "{}/i/user/{}".format(self.root, self.user_id),
+                allow_redirects=False,
+            ).headers["location"].rpartition("/")[2]
+        base_url = url = "{}/{}{}".format(self.root, self.user, path)
 
         while True:
-            page = self.request(url).text
+            tweets_html = self.request(url).text.split(
+                '<div class="timeline-item')
 
-            yield from page.split('<div class="timeline-item')[1:]
+            if self.user_obj is None:
+                self.user_obj = self._user_from_html(tweets_html[0])
 
-            more = text.extr(page, '<div class="show-more"><a href="?', '"')
+            for html, quote in map(self._extract_quote, tweets_html[1:]):
+                yield self._tweet_from_html(html)
+                if quoted and quote:
+                    yield self._tweet_from_quote(quote)
+
+            more = text.extr(
+                tweets_html[-1], '<div class="show-more"><a href="?', '"')
             if not more:
                 return
             url = base_url + "?" + text.unescape(more)
@@ -116,10 +238,12 @@ BASE_PATTERN = NitterExtractor.update({
     },
 })
 
+USER_PATTERN = BASE_PATTERN + r"/(i(?:/user/|d:)(\d+)|[^/?#]+)"
+
 
 class NitterTweetsExtractor(NitterExtractor):
     subcategory = "tweets"
-    pattern = BASE_PATTERN + r"/([^/?#]+)(?:/tweets)?(?:$|\?|#)"
+    pattern = USER_PATTERN + r"(?:/tweets)?(?:$|\?|#)"
     test = (
         ("https://nitter.net/supernaturepics", {
             "pattern": r"https://nitter\.net/pic/orig"
@@ -127,6 +251,10 @@ class NitterTweetsExtractor(NitterExtractor):
             "range": "1-20",
             "count": 20,
             "keyword": {
+                "author": {
+                    "name": "supernaturepics",
+                    "nick": "Nature Pictures"
+                },
                 "comments": int,
                 "content": str,
                 "count": 1,
@@ -136,25 +264,44 @@ class NitterTweetsExtractor(NitterExtractor):
                 "retweets": int,
                 "tweet_id": r"re:\d+",
                 "user": {
+                    "date": "dt:2015-01-12 10:25:00",
+                    "description": "The very best nature pictures.",
+                    "favourites_count": int,
+                    "followers_count": int,
+                    "friends_count": int,
+                    "id": "2976459548",
                     "name": "supernaturepics",
-                    "nick": "Nature Pictures"
+                    "nick": "Nature Pictures",
+                    "profile_banner": "https://nitter.net/pic/https%3A%2F%2Fpb"
+                                      "s.twimg.com%2Fprofile_banners%2F2976459"
+                                      "548%2F1421058583%2F1500x500",
+                    "profile_image": "https://nitter.net/pic/pbs.twimg.com%2Fp"
+                                     "rofile_images%2F554585280938659841%2FFLV"
+                                     "AlX18.jpeg",
+                    "statuses_count": 1568,
+                    "verified": False,
                 },
             },
         }),
+        ("https://nitter.pussthecat.org/i/user/2976459548", {
+            "url": "c740a2683db2c8ed2f350afc0494475c4444025b",
+            "pattern": r"https://nitter.pussthecat\.org/pic/orig"
+                       r"/media%2FCGMNYZvW0AIVoom\.jpg",
+            "range": "1",
+        }),
         ("https://nitter.lacontrevoie.fr/supernaturepics"),
-        ("https://nitter.pussthecat.org/supernaturepics"),
         ("https://nitter.1d4.us/supernaturepics"),
-        ("https://nitter.kavin.rocks/supernaturepics"),
+        ("https://nitter.kavin.rocks/id:2976459548"),
         ("https://nitter.unixfox.eu/supernaturepics"),
     )
 
     def tweets(self):
-        return self._pagination("/" + self.user)
+        return self._pagination("")
 
 
 class NitterRepliesExtractor(NitterExtractor):
     subcategory = "replies"
-    pattern = BASE_PATTERN + r"/([^/?#]+)/with_replies"
+    pattern = USER_PATTERN + r"/with_replies"
     test = (
         ("https://nitter.net/supernaturepics/with_replies", {
             "pattern": r"https://nitter\.net/pic/orig"
@@ -164,37 +311,41 @@ class NitterRepliesExtractor(NitterExtractor):
         ("https://nitter.lacontrevoie.fr/supernaturepics/with_replies"),
         ("https://nitter.pussthecat.org/supernaturepics/with_replies"),
         ("https://nitter.1d4.us/supernaturepics/with_replies"),
-        ("https://nitter.kavin.rocks/supernaturepics/with_replies"),
-        ("https://nitter.unixfox.eu/supernaturepics/with_replies"),
+        ("https://nitter.kavin.rocks/id:2976459548/with_replies"),
+        ("https://nitter.unixfox.eu/i/user/2976459548/with_replies"),
     )
 
     def tweets(self):
-        return self._pagination("/" + self.user + "/with_replies")
+        return self._pagination("/with_replies")
 
 
 class NitterMediaExtractor(NitterExtractor):
     subcategory = "media"
-    pattern = BASE_PATTERN + r"/([^/?#]+)/media"
+    pattern = USER_PATTERN + r"/media"
     test = (
         ("https://nitter.net/supernaturepics/media", {
             "pattern": r"https://nitter\.net/pic/orig"
                        r"/media%2F[\w-]+\.(jpg|png)$",
             "range": "1-20",
         }),
+        ("https://nitter.kavin.rocks/id:2976459548/media", {
+            "pattern": r"https://nitter\.kavin\.rocks/pic/orig"
+                       r"/media%2F[\w-]+\.(jpg|png)$",
+            "range": "1-20",
+        }),
         ("https://nitter.lacontrevoie.fr/supernaturepics/media"),
         ("https://nitter.pussthecat.org/supernaturepics/media"),
         ("https://nitter.1d4.us/supernaturepics/media"),
-        ("https://nitter.kavin.rocks/supernaturepics/media"),
-        ("https://nitter.unixfox.eu/supernaturepics/media"),
+        ("https://nitter.unixfox.eu/i/user/2976459548/media"),
     )
 
     def tweets(self):
-        return self._pagination("/" + self.user + "/media")
+        return self._pagination("/media")
 
 
 class NitterSearchExtractor(NitterExtractor):
     subcategory = "search"
-    pattern = BASE_PATTERN + r"/([^/?#]+)/search"
+    pattern = USER_PATTERN + r"/search"
     test = (
         ("https://nitter.net/supernaturepics/search", {
             "pattern": r"https://nitter\.net/pic/orig"
@@ -204,12 +355,12 @@ class NitterSearchExtractor(NitterExtractor):
         ("https://nitter.lacontrevoie.fr/supernaturepics/search"),
         ("https://nitter.pussthecat.org/supernaturepics/search"),
         ("https://nitter.1d4.us/supernaturepics/search"),
-        ("https://nitter.kavin.rocks/supernaturepics/search"),
-        ("https://nitter.unixfox.eu/supernaturepics/search"),
+        ("https://nitter.kavin.rocks/id:2976459548/search"),
+        ("https://nitter.unixfox.eu/i/user/2976459548/search"),
     )
 
     def tweets(self):
-        return self._pagination("/" + self.user + "/search")
+        return self._pagination("/search")
 
 
 class NitterTweetExtractor(NitterExtractor):
@@ -218,11 +369,30 @@ class NitterTweetExtractor(NitterExtractor):
     subcategory = "tweet"
     directory_fmt = ("{category}", "{user[name]}")
     filename_fmt = "{tweet_id}_{num}.{extension}"
     archive_fmt = "{tweet_id}_{num}"
-    pattern = BASE_PATTERN + r"/[^/?#]+/status/(\d+)"
+    pattern = BASE_PATTERN + r"/(i/web|[^/?#]+)/status/(\d+())"
     test = (
         ("https://nitter.net/supernaturepics/status/604341487988576256", {
             "url": "3f2b64e175bf284aa672c3bb53ed275e470b919a",
             "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
+            "keyword": {
+                "comments": 16,
+                "content": "Big Wedeene River, Canada",
+                "count": 1,
+                "date": "dt:2015-05-29 17:40:00",
+                "extension": "jpg",
+                "filename": "CGMNYZvW0AIVoom",
+                "likes": int,
+                "num": 1,
+                "quotes": 10,
+                "retweets": int,
+                "tweet_id": "604341487988576256",
+                "url": "https://nitter.net/pic/orig"
+                       "/media%2FCGMNYZvW0AIVoom.jpg",
+                "user": {
+                    "name": "supernaturepics",
+                    "nick": "Nature Pictures",
+                },
+            },
         }),
         # 4 images
         ("https://nitter.lacontrevoie.fr/i/status/894001459754180609", {
@@ -234,6 +404,10 @@ class NitterTweetExtractor(NitterExtractor):
                        r"/B875137EDC8FF/https%3A%2F%2Fvideo.twimg.com%2F"
                        r"ext_tw_video%2F1065691868439007232%2Fpu%2Fpl%2F"
                        r"nv8hUQC1R0SjhzcZ.m3u8%3Ftag%3D5",
+            "keyword": {
+                "extension": "mp4",
+                "filename": "nv8hUQC1R0SjhzcZ",
+            },
         }),
         # content with emoji, newlines, hashtags (#338)
         ("https://nitter.1d4.us/playpokemon/status/1263832915173048321", {
@@ -249,8 +423,48 @@ class NitterTweetExtractor(NitterExtractor):
             "url": "e115bd1c86c660064e392b05269bbcafcd8c8b7a",
             "content": "f29501e44d88437fe460f5c927b7543fda0f6e34",
         }),
+        # Reply to deleted tweet (#403, #838)
+        ("https://nitter.unixfox.eu/i/web/status/1170041925560258560", {
+            "pattern": r"https://nitter\.unixfox\.eu/pic/orig"
+                       r"/media%2FEDzS7VrU0AAFL4_\.jpg",
+        }),
+        # "quoted" option (#854)
+        ("https://nitter.net/StobiesGalaxy/status/1270755918330896395", {
+            "options": (("quoted", True),),
+            "pattern": r"https://nitter\.net/pic/orig/media%2FEa[KG].+\.jpg",
+            "count": 8,
+        }),
+        # quoted tweet (#526, #854)
+        ("https://nitter.1d4.us/StobiesGalaxy/status/1270755918330896395", {
+            "pattern": r"https://nitter\.1d4\.us/pic/orig"
+                       r"/enc/bWVkaWEvRWFL\w+LmpwZw==",
+            "keyword": {"filename": r"re:EaK.{12}"},
+            "count": 4,
+        }),
+        # deleted quote tweet (#2225)
+        ("https://nitter.lacontrevoie.fr/i/status/1460044411165888515", {
+            "count": 0,
+        }),
+        # "Misleading" content
+        ("https://nitter.pussthecat.org/i/status/1486373748911575046", {
+            "count": 4,
+        }),
+        # age-restricted (#2354)
+        ("https://nitter.unixfox.eu/mightbecurse/status/1492954264909479936", {
+            "keywords": {"date": "dt:2022-02-13 20:10:09"},
+            "count": 1,
+        }),
     )
 
     def tweets(self):
         url = "{}/i/status/{}".format(self.root, self.user)
-        return (self.request(url).text,)
+        html = text.extr(self.request(url).text, 'class="main-tweet', '''\
+    </div>
+    </div></div></div>''')
+        html, quote = self._extract_quote(html)
+        tweet = self._tweet_from_html(html)
+        if quote and self.config("quoted", False):
+            quoted = self._tweet_from_quote(quote)
+            quoted["user"] = tweet["user"]
+            return (tweet, quoted)
+        return (tweet,)
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 59c5f15..1f520c3 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -103,7 +103,10 @@ class PatreonExtractor(Extractor):
         """Return all relevant post objects"""
 
     def _pagination(self, url):
-        headers = {"Referer": self.root}
+        headers = {
+            "Referer"     : self.root + "/",
+            "Content-Type": "application/vnd.api+json",
+        }
 
         while url:
             url = text.ensure_http_scheme(url)
@@ -199,23 +202,36 @@ class PatreonExtractor(Extractor):
         return (
             "https://www.patreon.com/api/" + endpoint +
 
-            "?include=user,images,attachments,user_defined_tags,campaign,poll."
-            "choices,poll.current_user_responses.user,poll.current_user_respon"
-            "ses.choice,poll.current_user_responses.poll,access_rules.tier.nul"
-            "l"
-
-            "&fields[post]=change_visibility_at,comment_count,content,current_"
-            "user_can_delete,current_user_can_view,current_user_has_liked,embe"
-            "d,image,is_paid,like_count,min_cents_pledged_to_view,post_file,pu"
-            "blished_at,patron_count,patreon_url,post_type,pledge_url,thumbnai"
-            "l_url,teaser_text,title,upgrade_url,url,was_posted_by_campaign_ow"
-            "ner"
+            "?include=campaign,access_rules,attachments,audio,images,media,"
+            "native_video_insights,poll.choices,"
+            "poll.current_user_responses.user,"
+            "poll.current_user_responses.choice,"
+            "poll.current_user_responses.poll,"
+            "user,user_defined_tags,ti_checks"
+
+            "&fields[campaign]=currency,show_audio_post_download_links,"
+            "avatar_photo_url,avatar_photo_image_urls,earnings_visibility,"
+            "is_nsfw,is_monthly,name,url"
+
+            "&fields[post]=change_visibility_at,comment_count,commenter_count,"
+            "content,current_user_can_comment,current_user_can_delete,"
+            "current_user_can_view,current_user_has_liked,embed,image,"
+            "insights_last_updated_at,is_paid,like_count,meta_image_url,"
+            "min_cents_pledged_to_view,post_file,post_metadata,published_at,"
+            "patreon_url,post_type,pledge_url,preview_asset_type,thumbnail,"
+            "thumbnail_url,teaser_text,title,upgrade_url,url,"
+            "was_posted_by_campaign_owner,has_ti_violation,moderation_status,"
+            "post_level_suspension_removal_date,pls_one_liners_by_category,"
+            "video_preview,view_count"
+
+            "&fields[post_tag]=tag_type,value"
             "&fields[user]=image_url,full_name,url"
-            "&fields[campaign]=avatar_photo_url,earnings_visibility,is_nsfw,is"
-            "_monthly,name,url"
-            "&fields[access_rule]=access_rule_type,amount_cents" + query +
+            "&fields[access_rule]=access_rule_type,amount_cents"
+            "&fields[media]=id,image_urls,download_url,metadata,file_name"
+            "&fields[native_video_insights]=average_view_duration,"
+            "average_view_pct,has_preview,id,last_updated_at,num_views,"
+            "preview_views,video_duration" + query +
 
-            "&json-api-use-default-includes=false"
             "&json-api-version=1.0"
         )
@@ -234,6 +250,10 @@ class PatreonExtractor(Extractor):
             filetypes = filetypes.split(",")
         return [genmap[ft] for ft in filetypes]
 
+    def _extract_bootstrap(self, page):
+        return json.loads(text.extr(
+            page, "window.patreon.bootstrap,", "\n});") + "}")
+
 
 class PatreonCreatorExtractor(PatreonExtractor):
     """Extractor for a creator's works"""
@@ -282,10 +302,12 @@ class PatreonCreatorExtractor(PatreonExtractor):
             url = "{}/user/posts?u={}".format(self.root, creator_id)
         else:
             url = "{}/{}/posts".format(self.root, self.creator)
-        page = self.request(url, notfound="creator").text
 
-        campaign_id = text.extr(page, "/campaign/", "/")
-        if not campaign_id:
+        page = self.request(url, notfound="creator").text
+
+        try:
+            data = self._extract_bootstrap(page)
+            campaign_id = data["creator"]["data"]["id"]
+        except (KeyError, ValueError):
             raise exception.NotFoundError("creator")
 
         filters = "".join(
@@ -295,10 +317,10 @@ class PatreonCreatorExtractor(PatreonExtractor):
         )
 
         url = self._build_url("posts", (
-            "&sort=" + query.get("sort", "-published_at") +
-            "&filter[is_draft]=false"
+            "&filter[campaign_id]=" + campaign_id +
             "&filter[contains_exclusive_posts]=true"
-            "&filter[campaign_id]=" + campaign_id + filters
+            "&filter[is_draft]=false" + filters +
+            "&sort=" + query.get("sort", "-published_at")
         ))
         return self._pagination(url)
@@ -313,6 +335,7 @@ class PatreonUserExtractor(PatreonExtractor):
         url = self._build_url("stream", (
             "&page[cursor]=null"
             "&filter[is_following]=true"
+            "&json-api-use-default-includes=false"
         ))
         return self._pagination(url)
@@ -347,8 +370,7 @@ class PatreonPostExtractor(PatreonExtractor):
     def posts(self):
         url = "{}/posts/{}".format(self.root, self.slug)
         page = self.request(url, notfound="post").text
-        data = text.extract(page, "window.patreon.bootstrap,", "\n});")[0]
-        post = json.loads(data + "}")["post"]
+        post = self._extract_bootstrap(page)["post"]
         included = self._transform(post["included"])
         return (self._process(post["data"], included),)
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index fc092f1..9cd95bb 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -37,7 +37,7 @@ class PixivExtractor(Extractor):
             transform_tags = None
         elif tags == "translated":
             def transform_tags(work):
-                work["tags"] = list(set(
+                work["tags"] = list(dict.fromkeys(
                     tag["translated_name"] or tag["name"]
                     for tag in work["tags"]))
         else:
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 954a84f..0ec8478 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -69,6 +69,11 @@ class RedditExtractor(Extractor):
                     submission["_ytdl_extra"] = {
                         "title": submission["title"],
                     }
+                    try:
+                        url = (submission["secure_media"]
+                               ["reddit_video"]["dash_url"])
+                    except (KeyError, TypeError):
+                        pass
                     yield Message.Url, "ytdl:" + url, submission
 
                 elif not submission["is_self"]:
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 3dbadaa..22d4a6e 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -417,7 +417,11 @@ class TwitterTimelineExtractor(TwitterExtractor):
             "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
         }),
         # suspended account (#2216)
-        ("https://twitter.com/realDonaldTrump", {
+        ("https://twitter.com/OptionalTypo", {
+            "exception": exception.NotFoundError,
+        }),
+        # suspended account user ID
+        ("https://twitter.com/id:772949683521978368", {
             "exception": exception.NotFoundError,
         }),
         ("https://mobile.twitter.com/supernaturepics?p=i"),
@@ -1149,25 +1153,21 @@ class TwitterAPI():
         return self._call(endpoint, params)["data"]["user"]["result"]
 
     def _user_id_by_screen_name(self, screen_name):
-        if screen_name.startswith("id:"):
-            user_id = screen_name[3:]
-            user = self.user_by_rest_id(user_id)
-
-        else:
-            user = ()
-            try:
+        user = ()
+        try:
+            if screen_name.startswith("id:"):
+                user = self.user_by_rest_id(screen_name[3:])
+            else:
                 user = self.user_by_screen_name(screen_name)
-                user_id = user["rest_id"]
-            except KeyError:
-                if "unavailable_message" in user:
-                    raise exception.NotFoundError("{} ({})".format(
-                        user["unavailable_message"].get("text"),
-                        user.get("reason")), False)
-                else:
-                    raise exception.NotFoundError("user")
-
-        self.extractor._assign_user(user)
-        return user_id
+            self.extractor._assign_user(user)
+            return user["rest_id"]
+        except KeyError:
+            if "unavailable_message" in user:
+                raise exception.NotFoundError("{} ({})".format(
+                    user["unavailable_message"].get("text"),
+                    user.get("reason")), False)
+            else:
+                raise exception.NotFoundError("user")
 
     @cache(maxage=3600)
     def _guest_token(self):
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index 55cee14..ab05c48 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -11,7 +11,6 @@
 from .common import Extractor, Message
 from .. import text, exception
 from ..cache import cache
-import itertools
 import random
 import json
 
@@ -53,20 +52,20 @@ class WeiboExtractor(Extractor):
 
         for status in self.statuses():
 
+            files = []
             if self.retweets and "retweeted_status" in status:
                 if original_retweets:
                     status = status["retweeted_status"]
-                    files = self._files_from_status(status)
+                    self._extract_status(status, files)
                 else:
-                    files = itertools.chain(
-                        self._files_from_status(status),
-                        self._files_from_status(status["retweeted_status"]),
-                    )
+                    self._extract_status(status, files)
+                    self._extract_status(status["retweeted_status"], files)
             else:
-                files = self._files_from_status(status)
+                self._extract_status(status, files)
 
             status["date"] = text.parse_datetime(
                 status["created_at"], "%a %b %d %H:%M:%S %z %Y")
+            status["count"] = len(files)
             yield Message.Directory, status
 
             for num, file in enumerate(files, 1):
@@ -78,7 +77,9 @@ class WeiboExtractor(Extractor):
                 file["num"] = num
                 yield Message.Url, file["url"], file
 
-    def _files_from_status(self, status):
+    def _extract_status(self, status, files):
+        append = files.append
+
         pic_ids = status.get("pic_ids")
         if pic_ids:
             pics = status["pic_infos"]
@@ -87,18 +88,18 @@ class WeiboExtractor(Extractor):
                 pic_type = pic.get("type")
 
                 if pic_type == "gif" and self.videos:
-                    yield {"url": pic["video"]}
+                    append({"url": pic["video"]})
 
                 elif pic_type == "livephoto" and self.livephoto:
-                    yield pic["largest"].copy()
+                    append(pic["largest"].copy())
 
                     file = {"url": pic["video"]}
                     file["filehame"], _, file["extension"] = \
                         pic["video"].rpartition("%2F")[2].rpartition(".")
-                    yield file
+                    append(file)
 
                 else:
-                    yield pic["largest"].copy()
+                    append(pic["largest"].copy())
 
         if "page_info" in status and self.videos:
             try:
@@ -106,8 +107,12 @@ class WeiboExtractor(Extractor):
                           key=lambda m: m["meta"]["quality_index"])
             except KeyError:
                 pass
+            except ValueError:
+                info = status["page_info"]["media_info"]
+                append({"url": (info.get("stream_url_hd") or
+                                info["stream_url"])})
             else:
-                yield media["play_info"].copy()
+                append(media["play_info"].copy())
 
     def _status_by_id(self, status_id):
         url = "{}/ajax/statuses/show?id={}".format(self.root, status_id)
@@ -344,7 +349,10 @@ class WeiboStatusExtractor(WeiboExtractor):
     test = (
         ("https://m.weibo.cn/detail/4323047042991618", {
             "pattern": r"https?://wx\d+.sinaimg.cn/large/\w+.jpg",
-            "keyword": {"status": {"date": "dt:2018-12-30 13:56:36"}},
+            "keyword": {"status": {
+                "count": 1,
+                "date": "dt:2018-12-30 13:56:36",
+            }},
         }),
         ("https://m.weibo.cn/detail/4339748116375525", {
             "pattern": r"https?://f.us.sinaimg.cn/\w+\.mp4\?label=mp4_1080p",
@@ -375,6 +383,13 @@ class WeiboStatusExtractor(WeiboExtractor):
         ("https://weibo.com/2909128931/4409545658754086", {
             "count": 9,
        }),
+        # empty 'playback_list' (#3301)
+        ("https://weibo.com/1501933722/4142890299009993", {
+            "pattern": r"https://f\.us\.sinaimg\.cn/004zstGKlx07dAHg4ZVu010f01"
+                       r"000OOl0k01\.mp4\?label=mp4_hd&template=template_7&ori"
+                       r"=0&ps=1CwnkDw1GXwCQx.+&KID=unistore,video",
+            "count": 1,
+        }),
         ("https://m.weibo.cn/status/4339748116375525"),
         ("https://m.weibo.cn/5746766133/4339748116375525"),
     )
@@ -387,6 +402,6 @@ class WeiboStatusExtractor(WeiboExtractor):
         return (status,)
 
 
-@cache(maxage=356*86400)
+@cache(maxage=365*86400)
 def _cookie_cache():
     return None
diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py
index dd32b8a..ca05fa5 100644
--- a/gallery_dl/formatter.py
+++ b/gallery_dl/formatter.py
@@ -347,6 +347,20 @@ def _parse_offset(format_spec, default):
     return off
 
 
+def _parse_sort(format_spec, default):
+    args, _, format_spec = format_spec.partition(_SEPARATOR)
+    fmt = _build_format_func(format_spec, default)
+
+    if "d" in args or "r" in args:
+        def sort_desc(obj):
+            return fmt(sorted(obj, reverse=True))
+        return sort_desc
+    else:
+        def sort_asc(obj):
+            return fmt(sorted(obj))
+        return sort_asc
+
+
 def _default_format(format_spec, default):
     def wrap(obj):
         return format(obj, format_spec)
@@ -395,4 +409,5 @@ _FORMAT_SPECIFIERS = {
     "J": _parse_join,
     "O": _parse_offset,
     "R": _parse_replace,
+    "S": _parse_sort,
 }
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index 1f65438..e1a6767 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -13,7 +13,7 @@ import logging
 import functools
 import collections
 from . import extractor, downloader, postprocessor
-from . import config, text, util, path, formatter, output, exception
+from . import config, text, util, path, formatter, output, exception, version
 from .extractor.message import Message
 from .output import stdout_write
 
@@ -55,6 +55,8 @@ class Job():
 
         self.metadata_url = extr.config("url-metadata")
         self.metadata_http = extr.config("http-metadata")
+
+        version_info = extr.config("version-metadata")
         metadata_path = extr.config("path-metadata")
 
         # user-supplied metadata
@@ -63,6 +65,12 @@ class Job():
             self.kwdict.update(kwdict)
         if metadata_path:
             self.kwdict[metadata_path] = path_proxy
+        if version_info:
+            self.kwdict[version_info] = {
+                "version"         : version.__version__,
+                "is_executable"   : getattr(sys, "frozen", False),
+                "current_git_head": util.git_head()
+            }
 
         # predicates
         self.pred_url = self._prepare_predicates("image", True)
diff --git a/gallery_dl/path.py b/gallery_dl/path.py
index e901fb9..7d599ee 100644
--- a/gallery_dl/path.py
+++ b/gallery_dl/path.py
@@ -219,7 +219,7 @@ class PathFormat():
             self.path = self.path[:-1]
             self.temppath = self.realpath = self.realpath[:-1]
         elif not self.temppath:
-            self.build_path()
+            self.path = self.directory + "?"
         return True
 
     def build_filename(self, kwdict):
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 31dbc63..d289009 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.24.0"
+__version__ = "1.24.1"
