Diffstat (limited to 'gallery_dl')
-rw-r--r--  gallery_dl/config.py                 |  17
-rw-r--r--  gallery_dl/downloader/http.py        |   3
-rw-r--r--  gallery_dl/downloader/ytdl.py        |  22
-rw-r--r--  gallery_dl/extractor/__init__.py     |   2
-rw-r--r--  gallery_dl/extractor/aryion.py       |   2
-rw-r--r--  gallery_dl/extractor/batoto.py       |  52
-rw-r--r--  gallery_dl/extractor/booru.py        |   3
-rw-r--r--  gallery_dl/extractor/civitai.py      |  11
-rw-r--r--  gallery_dl/extractor/common.py       |   3
-rw-r--r--  gallery_dl/extractor/gelbooru.py     |   5
-rw-r--r--  gallery_dl/extractor/gelbooru_v02.py |  21
-rw-r--r--  gallery_dl/extractor/instagram.py    |  32
-rw-r--r--  gallery_dl/extractor/newgrounds.py   |   1
-rw-r--r--  gallery_dl/extractor/oauth.py        |  20
-rw-r--r--  gallery_dl/extractor/pixiv.py        |  18
-rw-r--r--  gallery_dl/extractor/shimmie2.py     |  61
-rw-r--r--  gallery_dl/extractor/sizebooru.py    | 162
-rw-r--r--  gallery_dl/extractor/tumblr.py       |   2
-rw-r--r--  gallery_dl/extractor/twitter.py      | 410
-rw-r--r--  gallery_dl/extractor/vichan.py       |   4
-rw-r--r--  gallery_dl/extractor/zerochan.py     |   4
-rw-r--r--  gallery_dl/extractor/zzup.py         |  63
-rw-r--r--  gallery_dl/version.py                |   2

23 files changed, 602 insertions(+), 318 deletions(-)
diff --git a/gallery_dl/config.py b/gallery_dl/config.py
index 1873634..33a3b95 100644
--- a/gallery_dl/config.py
+++ b/gallery_dl/config.py
@@ -169,6 +169,7 @@ def remap_categories():
     cmap = (
         ("coomerparty" , "coomer"),
         ("kemonoparty" , "kemono"),
+        ("giantessbooru", "sizebooru"),
         ("koharu"      , "schalenetwork"),
         ("naver"       , "naver-blog"),
         ("chzzk"       , "naver-chzzk"),
@@ -185,13 +186,13 @@ def remap_categories():
             opts[new] = opts[old]


-def load(files=None, strict=False, loads=util.json_loads):
+def load(files=None, strict=False, loads=util.json_loads, conf=_config):
     """Load JSON configuration files"""
     for pathfmt in files or _default_configs:
         path = util.expand_path(pathfmt)
         try:
             with open(path, encoding="utf-8") as fp:
-                conf = loads(fp.read())
+                config = loads(fp.read())
         except OSError as exc:
             if strict:
                 log.error(exc)
@@ -202,17 +203,17 @@ def load(files=None, strict=False, loads=util.json_loads):
             if strict:
                 raise SystemExit(2)
         else:
-            if not _config:
-                _config.update(conf)
+            if not conf:
+                conf.update(config)
             else:
-                util.combine_dict(_config, conf)
+                util.combine_dict(conf, config)
             _files.append(pathfmt)

-            if "subconfigs" in conf:
-                if subconfigs := conf["subconfigs"]:
+            if "subconfigs" in config:
+                if subconfigs := config["subconfigs"]:
                     if isinstance(subconfigs, str):
                         subconfigs = (subconfigs,)
-                    load(subconfigs, strict, loads)
+                    load(subconfigs, strict, loads, conf)


 def clear():
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index 4595483..111fd9b 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -432,6 +432,9 @@ class HttpDownloader(DownloaderBase):
                 if not SIGNATURE_CHECKS[pathfmt.extension](file_header):
                     for ext, check in SIGNATURE_CHECKS.items():
                         if check(file_header):
+                            self.log.debug(
+                                "Adjusting filename extension of '%s' to '%s'",
+                                pathfmt.filename, ext)
                             pathfmt.set_extension(ext)
                             pathfmt.build_path()
                             return True
diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py
index 9659782..9ef8816 100644
--- a/gallery_dl/downloader/ytdl.py
+++ b/gallery_dl/downloader/ytdl.py
@@ -27,6 +27,7 @@ class YoutubeDLDownloader(DownloaderBase):
             "socket_timeout": self.config("timeout", extractor._timeout),
             "nocheckcertificate": not self.config("verify", extractor._verify),
             "proxy": self.proxies.get("http") if self.proxies else None,
+            "ignoreerrors": True,
         }

         self.ytdl_instance = None
@@ -168,15 +169,26 @@ class YoutubeDLDownloader(DownloaderBase):
         return True

     def _download_playlist(self, ytdl_instance, pathfmt, info_dict):
-        pathfmt.set_extension("%(playlist_index)s.%(ext)s")
-        pathfmt.build_path()
-        self._set_outtmpl(ytdl_instance, pathfmt.realpath)
+        pathfmt.kwdict["extension"] = pathfmt.prefix
+        filename = pathfmt.build_filename(pathfmt.kwdict)
+        pathfmt.kwdict["extension"] = pathfmt.extension
+        path = pathfmt.realdirectory + filename
+        path = path.replace("%", "%%") + "%(playlist_index)s.%(ext)s"
+        self._set_outtmpl(ytdl_instance, path)

+        status = False
         for entry in info_dict["entries"]:
+            if not entry:
+                continue
             if self.rate_dyn is not None:
                 ytdl_instance.params["ratelimit"] = self.rate_dyn()
-            ytdl_instance.process_info(entry)
-        return True
+            try:
+                ytdl_instance.process_info(entry)
+                status = True
+            except Exception as exc:
+                self.log.debug("", exc_info=exc)
+                self.log.error("%s: %s", exc.__class__.__name__, exc)
+        return status

     def _extract_info(self, ytdl, url):
         return ytdl.extract_info(url, download=False)
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 70e79fe..aabaa93 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -170,6 +170,7 @@ modules = [
     "sexcom",
     "shimmie2",
     "simplyhentai",
+    "sizebooru",
     "skeb",
     "slickpic",
     "slideshare",
@@ -217,7 +218,6 @@ modules = [
     "xvideos",
     "yiffverse",
     "zerochan",
-    "zzup",
     "booru",
     "moebooru",
     "foolfuuka",
diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py
index 8a7cb04..38b8ee4 100644
--- a/gallery_dl/extractor/aryion.py
+++ b/gallery_dl/extractor/aryion.py
@@ -95,7 +95,7 @@ class AryionExtractor(Extractor):
                 cnt += 1
                 yield post_id

-            if cnt < 40:
+            if cnt < 40 and ">Next >><" not in page:
                 return
             params["p"] += 1

diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py
index 50e0c5d..a7d1b78 100644
--- a/gallery_dl/extractor/batoto.py
+++ b/gallery_dl/extractor/batoto.py
@@ -8,6 +8,7 @@

 from .common import Extractor, ChapterExtractor, MangaExtractor
 from .. import text, util
+from ..cache import memcache

 BASE_PATTERN = (r"(?:https?://)?("
                 r"(?:ba|d|f|h|j|m|w)to\.to|"
@@ -113,8 +114,7 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
             minor = ""

         return {
-            "manga"       : text.unescape(manga),
-            "manga_id"    : text.parse_int(manga_id),
+            **_manga_info(self, manga_id),
             "chapter_url" : extr(self.chapter_id + "-ch_", '"'),
             "title"       : text.unescape(text.remove_html(extr(
                 "selected>", "</option")).partition(" : ")[2]),
@@ -151,17 +151,11 @@ class BatotoMangaExtractor(BatotoBase, MangaExtractor):

     def chapters(self, page):
         extr = text.extract_from(page)
-
         if warning := extr(' class="alert alert-warning">', "</div>"):
             self.log.warning("'%s'", text.remove_html(warning))
-
-        data = {
-            "manga_id": text.parse_int(self.manga_id),
-            "manga"   : text.unescape(extr(
-                "<title>", "<").rpartition(" - ")[0]),
-        }
-
         extr('<div data-hk="0-0-0-0"', "")
+        data = _manga_info(self, self.manga_id, page)
+
         results = []
         while True:
             href = extr('<a href="/title/', '"')
@@ -179,3 +173,41 @@ class BatotoMangaExtractor(BatotoBase, MangaExtractor):
             url = f"{self.root}/title/{href}"
             results.append((url, data.copy()))
         return results
+
+
+@memcache(keyarg=1)
+def _manga_info(self, manga_id, page=None):
+    if page is None:
+        url = f"{self.root}/title/{manga_id}"
+        page = self.request(url).text
+
+    props = text.extract(page, 'props="', '"', page.find(' prefix="r20" '))[0]
+    data = util.json_loads(text.unescape(props))["data"][1]
+
+    return {
+        "manga"     : data["name"][1],
+        "manga_id"  : text.parse_int(manga_id),
+        "manga_slug": data["slug"][1],
+        "manga_date": text.parse_timestamp(
+            data["dateCreate"][1] // 1000),
+        "manga_date_updated": text.parse_timestamp(
+            data["dateUpdate"][1] / 1000),
+        "author"    : json_list(data["authors"]),
+        "artist"    : json_list(data["artists"]),
+        "genre"     : json_list(data["genres"]),
+        "lang"      : data["tranLang"][1],
+        "lang_orig" : data["origLang"][1],
+        "status"    : data["originalStatus"][1],
+        "published" : data["originalPubFrom"][1],
+        "description": data["summary"][1]["code"][1],
+        "cover"     : data["urlCoverOri"][1],
+        "uploader"  : data["userId"][1],
+        "score"     : data["stat_score_avg"][1],
+    }
+
+
+def json_list(value):
+    return [
+        item[1].replace("_", " ")
+        for item in util.json_loads(value[1].replace('\\"', '"'))
+    ]
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index 3b97e9a..ae455bf 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -52,7 +52,8 @@ class BooruExtractor(BaseExtractor):
                     if notes:
                         self._notes(post, html)

-                text.nameext_from_url(url, post)
+                if "extension" not in post:
+                    text.nameext_from_url(url, post)
                 post.update(data)
                 self._prepare(post)

diff --git a/gallery_dl/extractor/civitai.py b/gallery_dl/extractor/civitai.py
index 00400ba..d5cf996 100644
--- a/gallery_dl/extractor/civitai.py
+++ b/gallery_dl/extractor/civitai.py
@@ -912,9 +912,16 @@ class CivitaiSearchAPI():
     def __init__(self, extractor):
         self.extractor = extractor
         self.root = "https://search-new.civitai.com"
+
+        if auth := extractor.config("token"):
+            if " " not in auth:
+                auth = f"Bearer {auth}"
+        else:
+            auth = ("Bearer 8c46eb2508e21db1e9828a97968d"
+                    "91ab1ca1caa5f70a00e88a2ba1e286603b61")
+
         self.headers = {
-            "Authorization": "Bearer 8c46eb2508e21db1e9828a97968d91ab1ca1caa5f"
-                             "70a00e88a2ba1e286603b61",
+            "Authorization": auth,
             "Content-Type": "application/json",
             "X-Meilisearch-Client": "Meilisearch instant-meilisearch (v0.13.5)"
                                     " ; Meilisearch JavaScript (v0.34.0)",
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 1ee54de..719fc62 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -143,7 +143,7 @@ class Extractor():
         return values

     def request(self, url, method="GET", session=None, fatal=True,
-                retries=None, retry_codes=None, interval=True,
+                retries=None, retry_codes=None, expected=(), interval=True,
                 encoding=None, notfound=None, **kwargs):
         if session is None:
             session = self.session
@@ -202,6 +202,7 @@ class Extractor():
                     self._dump_response(response)
                 if (
                     code < 400 or
+                    code in expected or
                     code < 500 and (
                         not fatal and code != 429 or fatal is None) or
                     fatal is ...
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index b152885..f32059e 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -35,9 +35,8 @@ class GelbooruBase():
             data = self.request_json(url, params=params)
         except exception.HttpError as exc:
             if exc.status == 401:
-                raise exception.AuthorizationError(
-                    f"'api-key' and 'user-id' required "
-                    f"({exc.status}: {exc.response.reason})")
+                raise exception.AuthRequired(
+                    "'api-key' & 'user-id'", "the API")
             raise

         if not key:
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index c12a7a2..33db4e4 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -16,17 +16,33 @@
 import collections


 class GelbooruV02Extractor(booru.BooruExtractor):
     basecategory = "gelbooru_v02"

+    def __init__(self, match):
+        booru.BooruExtractor.__init__(self, match)
+        self.request_interval = self.config_instance("request-interval", 0.0)
+        self.root_api = self.config_instance("root-api") or self.root
+
     def _init(self):
         self.api_key = self.config("api-key")
         self.user_id = self.config("user-id")
-        self.root_api = self.config_instance("root-api") or self.root

         if self.category == "rule34":
             self._file_url = self._file_url_rule34

     def _api_request(self, params):
+        params["api_key"] = self.api_key
+        params["user_id"] = self.user_id
+
         url = self.root_api + "/index.php?page=dapi&s=post&q=index"
-        return self.request_xml(url, params=params)
+        root = self.request_xml(url, params=params)
+
+        if root.tag == "error":
+            msg = root.text
+            if msg.lower().startswith("missing authentication"):
+                raise exception.AuthRequired(
+                    "'api-key' & 'user-id'", "the API", msg)
+            raise exception.AbortExtraction(f"'{msg}'")
+
+        return root

     def _pagination(self, params):
         params["pid"] = self.page_start
@@ -148,6 +164,7 @@ BASE_PATTERN = GelbooruV02Extractor.update({
     "rule34": {
         "root": "https://rule34.xxx",
         "root-api": "https://api.rule34.xxx",
+        "request-interval": 1.0,
         "pattern": r"(?:www\.)?rule34\.xxx",
     },
     "safebooru": {
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index b5450d5..fa60f91 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -39,6 +39,7 @@ class InstagramExtractor(Extractor):
         self.www_claim = "0"
         self.csrf_token = util.generate_token()
         self._find_tags = util.re(r"#\w+").findall
+        self._warn_video_ua = True
         self._logged_in = True
         self._cursor = None
         self._user = None
@@ -166,6 +167,7 @@ class InstagramExtractor(Extractor):
         else:
             post_url = f"{self.root}/stories/highlights/{reel_id}/"
         data = {
+            "user"   : post.get("user"),
             "expires": text.parse_timestamp(expires),
             "post_id": reel_id,
             "post_shortcode": shortcode_from_id(reel_id),
@@ -223,8 +225,7 @@ class InstagramExtractor(Extractor):

         for num, item in enumerate(items, 1):
             try:
-                candidates = item["image_versions2"]["candidates"]
-                image = candidates[0]
+                image = item["image_versions2"]["candidates"][0]
             except Exception:
                 self.log.warning("Missing media in post %s",
                                  data["post_shortcode"])
@@ -235,17 +236,22 @@ class InstagramExtractor(Extractor):
                     video_versions,
                     key=lambda x: (x["width"], x["height"], x["type"]),
                 )
+                manifest = item.get("video_dash_manifest")
                 media = video
+
+                if self._warn_video_ua:
+                    self._warn_video_ua = False
+                    pattern = text.re(
+                        r"AppleWebKit/537\.36 \(KHTML, like Gecko\) "
+                        r"Chrome/\d+\.\d+\.\d+\.\d+ Safari/537\.36$")
+                    if not pattern.search(self.session.headers["User-Agent"]):
+                        self.log.warning("Potentially lowered video quality "
+                                         "due to non-Chrome User-Agent")
             else:
-                video = None
+                video = manifest = None
                 media = image

-            if len(candidates) <= 3 and not post.get("__gdl_gen"):
-                self.log.warning(
-                    "%s: Image candidate list possibly incomplete "
-                    "(%s items). Consider refreshing your cookies.",
-                    data["post_shortcode"], len(candidates))
-            elif image["width"] < item.get("original_width", 0) or \
+            if image["width"] < item.get("original_width", 0) or \
                     image["height"] < item.get("original_height", 0):
                 self.log.warning(
                     "%s: Available image resolutions lower than the "
@@ -268,9 +274,14 @@ class InstagramExtractor(Extractor):
                 "video_url" : video["url"] if video else None,
                 "width"     : media["width"],
                 "height"    : media["height"],
-                "_ytdl_manifest_data": item.get("video_dash_manifest"),
             }

+            if manifest is not None:
+                media["_ytdl_manifest_data"] = manifest
+            if "owner" in item:
+                media["owner2"] = item["owner"]
+            if "reshared_story_media_author" in item:
+                media["author"] = item["reshared_story_media_author"]
             if "expiring_at" in item:
                 media["expires"] = text.parse_timestamp(post["expiring_at"])

@@ -711,7 +722,6 @@ class InstagramAvatarExtractor(InstagramExtractor):
             "caption"   : None,
             "like_count": 0,
             "image_versions2": {"candidates": (avatar,)},
-            "__gdl_gen" : True,
         },)

diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index c42453f..ffb4cad 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -206,6 +206,7 @@ class NewgroundsExtractor(Extractor):
             data["tags"].sort()

         data["user"] = self.user or data["artist"][0]
+        data["slug"] = post_url[post_url.rfind("/")+1:]
         data["post_url"] = post_url
         return data

diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index 2d9a061..ff192c2 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -60,9 +60,23 @@ class OAuthBase(Extractor):
             pass
         server.close()

-        data = self.client.recv(1024).decode()
-        path = data.split(" ", 2)[1]
-        return text.parse_query(path.partition("?")[2])
+        data = None
+        try:
+            data = self.client.recv(1024).decode()
+            path = data.split(" ", 2)[1]
+            return text.parse_query(path.partition("?")[2])
+        except Exception as exc:
+            if data is None:
+                msg = "Failed to receive"
+            elif not data:
+                exc = ""
+                msg = "Received empty"
+            else:
+                self.log.warning("Response: %r", data)
+                msg = "Received invalid"
+            if exc:
+                exc = f" ({exc.__class__.__name__}: {exc})"
+            raise exception.AbortExtraction(f"{msg} OAuth response{exc}")

     def send(self, msg):
         """Send 'msg' to the socket opened in 'recv()'"""
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index d34130d..a72042c 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -1251,9 +1251,9 @@ class PixivAppAPI():
             "/v1/user/bookmark-tags/illust", params, "bookmark_tags")

     @memcache(keyarg=1)
-    def user_detail(self, user_id):
+    def user_detail(self, user_id, fatal=True):
         params = {"user_id": user_id}
-        return self._call("/v1/user/detail", params)
+        return self._call("/v1/user/detail", params, fatal=fatal)

     def user_following(self, user_id, restrict="public"):
         params = {"user_id": user_id, "restrict": restrict}
@@ -1261,7 +1261,7 @@ class PixivAppAPI():

     def user_illusts(self, user_id):
         params = {"user_id": user_id}
-        return self._pagination("/v1/user/illusts", params, user_data="user")
+        return self._pagination("/v1/user/illusts", params, key_user="user")

     def user_novels(self, user_id):
         params = {"user_id": user_id}
@@ -1271,7 +1271,7 @@ class PixivAppAPI():
         params = {"illust_id": illust_id}
         return self._call("/v1/ugoira/metadata", params)["ugoira_metadata"]

-    def _call(self, endpoint, params=None, parse=None):
+    def _call(self, endpoint, params=None, parse=None, fatal=True):
         url = "https://app-api.pixiv.net" + endpoint

         while True:
@@ -1283,7 +1283,7 @@ class PixivAppAPI():
             else:
                 data = response.json()

-            if "error" not in data:
+            if "error" not in data or not fatal:
                 return data

             self.log.debug(data)
@@ -1302,14 +1302,16 @@ class PixivAppAPI():
         raise exception.AbortExtraction(f"API request failed: {msg}")

     def _pagination(self, endpoint, params,
-                    key_items="illusts", key_data=None, user_data=None):
+                    key_items="illusts", key_data=None, key_user=None):
         data = self._call(endpoint, params)

         if key_data is not None:
             self.data = data.get(key_data)
-        if user_data is not None:
-            if not data[user_data].get("id"):
+        if key_user is not None and not data[key_user].get("id"):
+            user = self.user_detail(self.extractor.user_id, fatal=False)
+            if user.get("error"):
                 raise exception.NotFoundError("user")
+            return

         while True:
             yield from data[key_items]
diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py
index 9afa706..b988646 100644
--- a/gallery_dl/extractor/shimmie2.py
+++ b/gallery_dl/extractor/shimmie2.py
@@ -26,9 +26,6 @@ class Shimmie2Extractor(BaseExtractor):
         if file_url := self.config_instance("file_url"):
             self.file_url_fmt = file_url

-        if self.category == "giantessbooru":
-            self.posts = self._posts_giantessbooru
-
     def items(self):
         data = self.metadata()

@@ -67,11 +64,6 @@ class Shimmie2Extractor(BaseExtractor):


 BASE_PATTERN = Shimmie2Extractor.update({
-    "giantessbooru": {
-        "root": "https://sizechangebooru.com",
-        "pattern": r"(?:sizechange|giantess)booru\.com",
-        "cookies": {"agreed": "true"},
-    },
     "cavemanon": {
         "root": "https://booru.cavemanon.xyz",
         "pattern": r"booru\.cavemanon\.xyz",
@@ -85,6 +77,11 @@ BASE_PATTERN = Shimmie2Extractor.update({
         "root": "https://vidya.pics",
         "pattern": r"vidya\.pics",
     },
+    "nozrip": {
+        "root": "https://noz.rip/booru",
+        "base": "https://noz.rip",
+        "pattern": r"noz\.rip/booru",
+    },
 }) + r"/(?:index\.php\?q=/?)?"

@@ -154,36 +151,6 @@ class Shimmie2TagExtractor(Shimmie2Extractor):
             if not extr(f"/{pnum}'>{pnum}<", ">"):
                 return

-    def _posts_giantessbooru(self):
-        pnum = text.parse_int(self.groups[-1], 1)
-        file_url_fmt = (self.root + "/index.php?q=/image/{}.jpg").format
-
-        while True:
-            url = f"{self.root}/index.php?q=/post/list/{self.tags}/{pnum}"
-            extr = text.extract_from(self.request(url).text)
-
-            while True:
-                pid = extr("href='./index.php?q=/post/view/", "&")
-                if not pid:
-                    break
-
-                tags, dimensions, size = extr("title='", "'").split(" // ")
-                width, _, height = dimensions.partition("x")
-
-                yield {
-                    "file_url": file_url_fmt(pid),
-                    "id"      : pid,
-                    "md5"     : "",
-                    "tags"    : tags,
-                    "width"   : width,
-                    "height"  : height,
-                    "size"    : text.parse_bytes(size[:-1]),
-                }
-
-            pnum += 1
-            if not extr(f"/{pnum}'>{pnum}<", ">"):
-                return
-

 class Shimmie2PostExtractor(Shimmie2Extractor):
     """Extractor for single shimmie2 posts"""
@@ -196,13 +163,14 @@ class Shimmie2PostExtractor(Shimmie2Extractor):
         url = f"{self.root}/post/view/{post_id}"
         page = self.request(url).text
         extr = text.extract_from(page)
+        base = self.config_instance("base", self.root)
         qt = self._quote_type(page)

         post = {
             "id"      : post_id,
             "tags"    : extr(": ", "<").partition(" - ")[0].rstrip(")"),
             "md5"     : extr("/_thumbs/", "/"),
-            "file_url": self.root + (
+            "file_url": base + (
                 extr(f"id={qt}main_image{qt} src={qt}", qt) or
                 extr("<source src="+qt, qt)).lstrip("."),
             "width"   : extr("data-width=", " ").strip("\"'"),
@@ -215,18 +183,3 @@ class Shimmie2PostExtractor(Shimmie2Extractor):
             post["md5"] = text.extr(post["file_url"], "/_images/", "/")

         return (post,)
-
-    def _posts_giantessbooru(self):
-        post_id = self.groups[-1]
-        url = f"{self.root}/index.php?q=/post/view/{post_id}"
-        extr = text.extract_from(self.request(url).text)
-
-        return ({
-            "id"      : post_id,
-            "tags"    : extr(": ", "<").partition(" - ")[0].rstrip(")"),
-            "md5"     : "",
-            "file_url": self.root + extr("id='main_image' src='.", "'"),
-            "width"   : extr("orig_width =", ";"),
-            "height"  : 0,
-            "size"    : 0,
-        },)
diff --git a/gallery_dl/extractor/sizebooru.py b/gallery_dl/extractor/sizebooru.py
new file mode 100644
index 0000000..cad4b23
--- /dev/null
+++ b/gallery_dl/extractor/sizebooru.py
@@ -0,0 +1,162 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://sizebooru.com/"""
+
+from .booru import BooruExtractor
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?sizebooru\.com"
+
+
+class SizebooruExtractor(BooruExtractor):
+    """Base class for sizebooru extractors"""
+    category = "sizebooru"
+    root = "https://sizebooru.com"
+    filename_fmt = "{id}.{extension}"
+    archive_fmt = "{id}"
+    page_start = 1
+    request_interval = (0.5, 1.5)
+
+    def _init(self):
+        if self.config("metadata", False):
+            self._prepare = self._prepare_metadata
+
+    def _file_url(self, post):
+        post["file_url"] = url = f"{self.root}/Picture/{post['id']}"
+        return url
+
+    def _prepare(self, post):
+        post_id = post["id"]
+        post["id"] = text.parse_int(post_id)
+        post["filename"] = post_id
+        if not post["extension"]:
+            post["extension"] = "jpg"
+
+    def _prepare_metadata(self, post):
+        post_id = post["id"]
+        url = f"{self.root}/Details/{post_id}"
+        extr = text.extract_from(self.request(url).text)
+
+        post.update({
+            "id"   : text.parse_int(post_id),
+            "date" : text.parse_datetime(
+                extr("<b>Posted Date:</b> ", "<"), "%m/%d/%Y"),
+            "date_approved": text.parse_datetime(
+                extr("<b>Approved Date:</b> ", "<"), "%m/%d/%Y"),
+            "approver" : text.remove_html(extr("<b>Approved By:</b>", "</")),
+            "uploader" : text.remove_html(extr("<b>Posted By:</b>", "</")),
+            "artist"   : None
+                if (artist := extr("<b>Artist:</b> ", "</")) == "N/A" else  # noqa: E131 E501
+                text.remove_html(artist),  # noqa: E131
+            "views"    : text.parse_int(extr("<b>Views:</b>", "<")),
+            "source"   : text.extr(extr(
+                "<b>Source Link:</b>", "</"), ' href="', '"') or None,
+            "tags"     : text.split_html(extr(
+                "<h6>Related Tags</h6>", "</ul>")),
+            "favorite" : text.split_html(extr(
+                "<h6>Favorited By</h6>", "</ul>")),
+        })
+
+        post["filename"], _, ext = extr('" alt="', '"').rpartition(".")
+        if not post["extension"]:
+            post["extension"] = ext.lower()
+
+        return post
+
+    def _pagination(self, url, callback=None):
+        params = {
+            "pageNo"  : self.page_start,
+            "pageSize": self.per_page,
+        }
+
+        page = self.request(url, params=params).text
+        if callback is not None:
+            callback(page)
+
+        while True:
+            thumb = None
+            for thumb in text.extract_iter(
+                    page, '<a href="/Details/', ';base64'):
+                yield {
+                    "id"       : thumb[:thumb.find('"')],
+                    "extension": thumb[thumb.rfind("/")+1:],
+                }
+
+            if "disabled" in text.extr(page, 'area-label="Next"', ">") or \
+                    thumb is None:
+                return
+            params["pageNo"] += 1
+            page = self.request(url, params=params).text
+
+
+class SizebooruPostExtractor(SizebooruExtractor):
+    """Extractor for sizebooru posts"""
+    subcategory = "post"
+    pattern = rf"{BASE_PATTERN}/Details/(\d+)"
+    example = "https://sizebooru.com/Details/12345"
+
+    def posts(self):
+        return ({"id": self.groups[0], "extension": None},)
+
+
+class SizebooruTagExtractor(SizebooruExtractor):
+    """Extractor for sizebooru tag searches"""
+    subcategory = "tag"
+    directory_fmt = ("{category}", "{search_tags}")
+    pattern = rf"{BASE_PATTERN}/Search/([^/?#]+)"
+    example = "https://sizebooru.com/Search/TAG"
+
+    def posts(self):
+        tag = self.groups[0]
+        self.kwdict["search_tags"] = text.unquote(tag)
+        return self._pagination(f"{self.root}/Search/{tag}")
+
+
+class SizebooruGalleryExtractor(SizebooruExtractor):
+    """Extractor for sizebooru galleries"""
+    subcategory = "gallery"
+    directory_fmt = ("{category}", "{gallery_name} ({gallery_id})")
+    pattern = rf"{BASE_PATTERN}/Galleries/List/(\d+)"
+    example = "https://sizebooru.com/Galleries/List/123"
+
+    def posts(self):
+        gid = self.groups[0]
+        self.kwdict["gallery_id"] = text.parse_int(gid)
+        return self._pagination(
+            f"{self.root}/Galleries/List/{gid}", self._extract_name)
+
+    def _extract_name(self, page):
+        self.kwdict["gallery_name"] = text.unescape(text.extr(
+            page, "<title>Gallery: ", " - Size Booru<"))
+
+
+class SizebooruUserExtractor(SizebooruExtractor):
+    """Extractor for a sizebooru user's uploads"""
+    subcategory = "user"
+    directory_fmt = ("{category}", "Uploads {user}")
+    pattern = rf"{BASE_PATTERN}/Profile/Uploads/([^/?#]+)"
+    example = "https://sizebooru.com/Profile/Uploads/USER"
+
+    def posts(self):
+        user = self.groups[0]
+        self.kwdict["user"] = text.unquote(user)
+        return self._pagination(f"{self.root}/Profile/Uploads/{user}")
+
+
+class SizebooruFavoriteExtractor(SizebooruExtractor):
+    """Extractor for a sizebooru user's favorites"""
+    subcategory = "favorite"
+    directory_fmt = ("{category}", "Favorites {user}")
+    pattern = rf"{BASE_PATTERN}/Profile/Favorites/([^/?#]+)"
+    example = "https://sizebooru.com/Profile/Favorites/USER"
+
+    def posts(self):
+        user = self.groups[0]
+        self.kwdict["user"] = text.unquote(user)
+        return self._pagination(f"{self.root}/Profile/Favorites/{user}")
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 46507c4..6eea76c 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -382,7 +382,7 @@ class TumblrSearchExtractor(TumblrExtractor):
     example = "https://www.tumblr.com/search/QUERY"

     def posts(self):
-        _, _, _, search, mode, post_type, query = self.groups
+        search, mode, post_type, query = self.groups
         params = text.parse_query(query)
         return self.api.search(text.unquote(search), params, mode, post_type)

diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 4303524..c928507 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -16,6 +16,7 @@
 import random

 BASE_PATTERN = (r"(?:https?://)?(?:www\.|mobile\.)?"
                 r"(?:(?:[fv]x)?twitter|(?:fix(?:up|v))?x)\.com")
+USER_PATTERN = rf"{BASE_PATTERN}/([^/?#]+)"


@@ -47,8 +48,9 @@ class TwitterExtractor(Extractor):

         self.cards_blacklist = self.config("cards-blacklist")

         if not self.config("transform", True):
-            self._transform_user = util.identity
-            self._transform_tweet = util.identity
+            self._transform_community = \
+                self._transform_tweet = \
+                self._transform_user = util.identity
         self._cursor = None
         self._user = None
@@ -412,6 +414,11 @@ class TwitterExtractor(Extractor):
         content = tget("full_text") or tget("text") or ""
         entities = legacy["entities"]

+        if "author_community_relationship" in tweet:
+            tdata["community"] = self._transform_community(
+                tweet["author_community_relationship"]
+                ["community_results"]["result"])
+
         if hashtags := entities.get("hashtags"):
             tdata["hashtags"] = [t["text"] for t in hashtags]

@@ -453,6 +460,36 @@ class TwitterExtractor(Extractor):

         return tdata

+    def _transform_community(self, com):
+        try:
+            cid = com.get("id_str") or com["rest_id"]
+        except KeyError:
+            return {}
+
+        try:
+            return self._user_cache[f"C#{cid}"]
+        except KeyError:
+            pass
+
+        self._user_cache[f"C#{cid}"] = cdata = {
+            "id": text.parse_int(cid),
+            "name": com["name"],
+            "description": com["description"],
+            "date": text.parse_timestamp(com["created_at"] // 1000),
+            "nsfw": com["is_nsfw"],
+            "role": com["role"],
+            "member_count": com["member_count"],
+            "rules": [rule["name"] for rule in com["rules"]],
+            "admin": (admin := com.get("admin_results")) and
+                admin["result"]["core"]["screen_name"],  # noqa: E131
+            "creator": (creator := com.get("creator_results")) and
+                creator["result"]["core"]["screen_name"],  # noqa: E131
+            "banner": (banner := com.get("custom_banner_media")) and
+                banner["media_info"]["original_img_url"],  # noqa: E131
+        }
+
+        return cdata
+
     def _transform_user(self, user):
         try:
             uid = user.get("rest_id") or user["id_str"]
@@ -465,35 +502,35 @@ class TwitterExtractor(Extractor):
         except KeyError:
             pass

-        if "legacy" in user:
-            user = user["legacy"]
+        core = user.get("core") or user
+        legacy = user.get("legacy") or user
+        lget = legacy.get

-        uget = user.get
-        if uget("withheld_scope"):
-            self.log.warning("'%s'", uget("description"))
+        if lget("withheld_scope"):
+            self.log.warning("'%s'", lget("description"))

-        entities = user["entities"]
+        entities = legacy["entities"]

         self._user_cache[uid] = udata = {
             "id"              : text.parse_int(uid),
-            "name"            : user["screen_name"],
-            "nick"            : user["name"],
-            "location"        : uget("location"),
+            "name"            : core["screen_name"],
+            "nick"            : core["name"],
+            "location"        : user["location"]["location"],
             "date"            : text.parse_datetime(
-                uget("created_at"), "%a %b %d %H:%M:%S %z %Y"),
-            "verified"        : uget("verified", False),
-            "protected"       : uget("protected", False),
-            "profile_banner"  : uget("profile_banner_url", ""),
-            "profile_image"   : uget(
-                "profile_image_url_https", "").replace("_normal.", "."),
-            "favourites_count": uget("favourites_count"),
-            "followers_count" : uget("followers_count"),
-            "friends_count"   : uget("friends_count"),
-            "listed_count"    : uget("listed_count"),
-            "media_count"     : uget("media_count"),
-            "statuses_count"  : uget("statuses_count"),
+                core["created_at"], "%a %b %d %H:%M:%S %z %Y"),
+            "verified"        : user["verification"]["verified"],
+            "protected"       : user["privacy"]["protected"],
+            "profile_banner"  : lget("profile_banner_url", ""),
+            "profile_image"   : user["avatar"]["image_url"].replace(
+                "_normal.", "."),
+            "favourites_count": lget("favourites_count"),
+            "followers_count" : lget("followers_count"),
+            "friends_count"   : lget("friends_count"),
+            "listed_count"    : lget("listed_count"),
+            "media_count"     : lget("media_count"),
+            "statuses_count"  : lget("statuses_count"),
         }

-        descr = user["description"]
+        descr = legacy["description"]
         if urls := entities["description"].get("urls"):
             for url in urls:
                 try:
@@ -604,34 +641,92 @@ class TwitterExtractor(Extractor):
         return self.cookies_update(_login_impl(self, username, password))


+class TwitterHomeExtractor(TwitterExtractor):
+    """Extractor for Twitter home timelines"""
+    subcategory = "home"
+    pattern = (BASE_PATTERN +
+               r"/(?:home(?:/fo(?:llowing|r[-_ ]?you()))?|i/timeline)/?$")
+    example = "https://x.com/home"
+
+    def tweets(self):
+        if self.groups[0] is None:
+            return self.api.home_latest_timeline()
+        return self.api.home_timeline()
+
+
+class TwitterSearchExtractor(TwitterExtractor):
+    """Extractor for Twitter search results"""
+    subcategory = "search"
+    pattern = BASE_PATTERN + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)"
+    example = "https://x.com/search?q=QUERY"
+
+    def metadata(self):
+        return {"search": text.unquote(self.user)}
+
+    def tweets(self):
+        query = text.unquote(self.user.replace("+", " "))
+
+        user = None
+        for item in query.split():
+            item = item.strip("()")
+            if item.startswith("from:"):
+                if user:
+                    user = None
+                    break
+                else:
+                    user = item[5:]
+
+        if user is not None:
+            try:
+                self._assign_user(self.api.user_by_screen_name(user))
+            except KeyError:
+                pass
+
+        return self.api.search_timeline(query)
+
+
+class TwitterHashtagExtractor(TwitterExtractor):
+    """Extractor for Twitter hashtags"""
+    subcategory = "hashtag"
+    pattern = BASE_PATTERN + r"/hashtag/([^/?#]+)"
+    example = "https://x.com/hashtag/NAME"
+
+    def items(self):
+        url = f"{self.root}/search?q=%23{self.user}"
+        data = {"_extractor": TwitterSearchExtractor}
+        yield Message.Queue, url, data
+
+
 class TwitterUserExtractor(Dispatch, TwitterExtractor):
     """Extractor for a Twitter user"""
-    pattern = (BASE_PATTERN + r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])"
+    pattern = (BASE_PATTERN + r"/(?:"
+               r"([^/?#]+)/?(?:$|\?|#)"
               r"|i(?:/user/|ntent/user\?user_id=)(\d+))")
     example = "https://x.com/USER"

     def items(self):
         user, user_id = self.groups
         if user_id is not None:
-            user = "id:" + user_id
+            user = f"id:{user_id}"

         base = f"{self.root}/{user}/"
         return self._dispatch_extractors((
-            (TwitterInfoExtractor      , base + "info"),
-            (TwitterAvatarExtractor    , base + "photo"),
-            (TwitterBackgroundExtractor, base + "header_photo"),
-            (TwitterTimelineExtractor  , base + "timeline"),
-            (TwitterTweetsExtractor    , base + "tweets"),
-            (TwitterMediaExtractor     , base + "media"),
-            (TwitterRepliesExtractor   , base + "with_replies"),
-            (TwitterLikesExtractor     , base + "likes"),
+            (TwitterInfoExtractor      , f"{base}info"),
+            (TwitterAvatarExtractor    , f"{base}photo"),
+            (TwitterBackgroundExtractor, f"{base}header_photo"),
+            (TwitterTimelineExtractor  , f"{base}timeline"),
+            (TwitterTweetsExtractor    , f"{base}tweets"),
+            (TwitterMediaExtractor     , f"{base}media"),
+            (TwitterRepliesExtractor   , f"{base}with_replies"),
+            (TwitterHighlightsExtractor, f"{base}highlights"),
+            (TwitterLikesExtractor     , f"{base}likes"),
         ), ("timeline",))


 class TwitterTimelineExtractor(TwitterExtractor):
     """Extractor for a Twitter user timeline"""
     subcategory = "timeline"
-    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/timeline(?!\w)"
+    pattern = rf"{USER_PATTERN}/timeline(?!\w)"
     example = "https://x.com/USER/timeline"

     def _init_cursor(self):
@@ -728,7 +823,7 @@
 class TwitterTweetsExtractor(TwitterExtractor):
     """Extractor for Tweets from a user's Tweets timeline"""
     subcategory = "tweets"
-    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/tweets(?!\w)"
+    pattern = rf"{USER_PATTERN}/tweets(?!\w)"
     example = "https://x.com/USER/tweets"

     def tweets(self):
@@ -738,17 +833,27 @@
 class TwitterRepliesExtractor(TwitterExtractor):
     """Extractor for Tweets from a user's timeline including replies"""
     subcategory = "replies"
-    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/with_replies(?!\w)"
+    pattern = rf"{USER_PATTERN}/with_replies(?!\w)"
     example = "https://x.com/USER/with_replies"

     def tweets(self):
         return self.api.user_tweets_and_replies(self.user)


+class TwitterHighlightsExtractor(TwitterExtractor):
+    """Extractor for Tweets from a user's highlights timeline"""
+    subcategory = "highlights"
+    pattern = rf"{USER_PATTERN}/highlights(?!\w)"
+    example = "https://x.com/USER/highlights"
+
+    def tweets(self):
+        return self.api.user_highlights(self.user)
+
+
 class TwitterMediaExtractor(TwitterExtractor):
     """Extractor for Tweets from a user's Media timeline"""
     subcategory = "media"
-    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/media(?!\w)"
+    pattern = rf"{USER_PATTERN}/media(?!\w)"
     example = "https://x.com/USER/media"

     def tweets(self):
@@ -758,7 +863,7 @@ class TwitterMediaExtractor(TwitterExtractor):
 class TwitterLikesExtractor(TwitterExtractor):
     """Extractor for liked tweets"""
     subcategory = "likes"
-    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/likes(?!\w)"
+    pattern = rf"{USER_PATTERN}/likes(?!\w)"
     example = "https://x.com/USER/likes"

     def metadata(self):
@@ -808,7 +913,7 @@ class TwitterListMembersExtractor(TwitterExtractor):
 class TwitterFollowingExtractor(TwitterExtractor):
     """Extractor for followed users"""
     subcategory = "following"
-    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/following(?!\w)"
+    pattern = rf"{USER_PATTERN}/following(?!\w)"
     example = "https://x.com/USER/following"

     def items(self):
@@ -819,7 +924,7 @@ class TwitterFollowingExtractor(TwitterExtractor):
 class TwitterFollowersExtractor(TwitterExtractor):
     """Extractor for a user's followers"""
     subcategory = "followers"
-    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/followers(?!\w)"
+    pattern = rf"{USER_PATTERN}/followers(?!\w)"
     example = "https://x.com/USER/followers"

     def items(self):
@@ -827,52 +932,12 @@ class TwitterFollowersExtractor(TwitterExtractor):
         return self._users_result(TwitterAPI(self).user_followers(self.user))


-class TwitterSearchExtractor(TwitterExtractor):
-    """Extractor for Twitter search results"""
-    subcategory = "search"
-    pattern = BASE_PATTERN + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)"
-    example = "https://x.com/search?q=QUERY"
-
-    def metadata(self):
-        return {"search": text.unquote(self.user)}
-
-    def tweets(self):
-        query = text.unquote(self.user.replace("+", " "))
-
-        user = None
-        for item in query.split():
-            item = item.strip("()")
-            if item.startswith("from:"):
-                if user:
-                    user = None
-                    break
-                else:
-                    user = item[5:]
-
-        if user is not None:
-            try:
-                self._assign_user(self.api.user_by_screen_name(user))
-            except KeyError:
-                pass
-
-        return self.api.search_timeline(query)
-
-
-class TwitterHashtagExtractor(TwitterExtractor):
-    """Extractor for Twitter hashtags"""
-    subcategory = "hashtag"
-    pattern = BASE_PATTERN + r"/hashtag/([^/?#]+)"
-    example = "https://x.com/hashtag/NAME"
-
-    def items(self):
-        url = f"{self.root}/search?q=%23{self.user}"
-        data = {"_extractor": TwitterSearchExtractor}
-        yield Message.Queue, url, data
-
-
 class TwitterCommunityExtractor(TwitterExtractor):
     """Extractor for a Twitter community"""
     subcategory = "community"
+    directory_fmt = ("{category}", "Communities",
+                     "{community[name]} ({community[id]})")
+    archive_fmt = "C_{community[id]}_{tweet_id}_{num}"
     pattern = BASE_PATTERN + r"/i/communities/(\d+)"
     example = "https://x.com/i/communities/12345"

@@ -885,6 +950,8 @@ class TwitterCommunityExtractor(TwitterExtractor):
 class TwitterCommunitiesExtractor(TwitterExtractor):
     """Extractor for followed Twitter communities"""
     subcategory = "communities"
+    directory_fmt = TwitterCommunityExtractor.directory_fmt
+    archive_fmt = TwitterCommunityExtractor.archive_fmt
     pattern = BASE_PATTERN + r"/([^/?#]+)/communities/?$"
     example = "https://x.com/i/communities"

@@ -1002,7 +1069,7 @@ class TwitterQuotesExtractor(TwitterExtractor):
 class TwitterInfoExtractor(TwitterExtractor):
     """Extractor for a user's profile data"""
     subcategory = "info"
-    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/info"
+    pattern = rf"{USER_PATTERN}/info"
     example = "https://x.com/USER/info"

     def items(self):
@@ -1021,13 +1088,13 @@ class TwitterAvatarExtractor(TwitterExtractor):
     subcategory = "avatar"
     filename_fmt = "avatar {date}.{extension}"
     archive_fmt = "AV_{user[id]}_{date}"
-    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/photo"
+    pattern = rf"{USER_PATTERN}/photo"
     example = "https://x.com/USER/photo"

     def tweets(self):
         self.api._user_id_by_screen_name(self.user)
         user = self._user_obj
-        url = user["legacy"]["profile_image_url_https"]
+        url = user["avatar"]["image_url"]

         if url == ("https://abs.twimg.com/sticky"
                    "/default_profile_images/default_profile_normal.png"):
@@ -1043,7 +1110,7 @@ class TwitterBackgroundExtractor(TwitterExtractor):
     subcategory = "background"
     filename_fmt = "background {date}.{extension}"
     archive_fmt = "BG_{user[id]}_{date}"
-    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/header_photo"
+    pattern = rf"{USER_PATTERN}/header_photo"
     example = "https://x.com/USER/header_photo"

     def tweets(self):
@@ -1169,9 +1236,10 @@ class TwitterAPI():
         }
         self.features = {
             "hidden_profile_subscriptions_enabled": True,
+            "payments_enabled": False,
+            "rweb_xchat_enabled": False,
             "profile_label_improvements_pcf_label_in_post_enabled": True,
             "rweb_tipjar_consumption_enabled": True,
-            "responsive_web_graphql_exclude_directive_enabled": True,
             "verified_phone_label_enabled": False,
             "highlights_tweets_tab_ui_enabled": True,
             "responsive_web_twitter_article_notes_tab_enabled": True,
@@ -1179,26 +1247,26 @@ class TwitterAPI():
             "creator_subscriptions_tweet_preview_api_enabled": True,
             "responsive_web_graphql_"
             "skip_user_profile_image_extensions_enabled": False,
-            "responsive_web_graphql_"
-            "timeline_navigation_enabled": True,
+            "responsive_web_graphql_timeline_navigation_enabled": True,
         }
         self.features_pagination = {
             "rweb_video_screen_enabled": False,
+            "payments_enabled": False,
+            "rweb_xchat_enabled": False,
             "profile_label_improvements_pcf_label_in_post_enabled": True,
             "rweb_tipjar_consumption_enabled": True,
-            "responsive_web_graphql_exclude_directive_enabled": True,
             "verified_phone_label_enabled": False,
             "creator_subscriptions_tweet_preview_api_enabled": True,
-            "responsive_web_graphql_"
-            "timeline_navigation_enabled": True,
-            "responsive_web_graphql_"
-            "skip_user_profile_image_extensions_enabled": False,
+            "responsive_web_graphql"
+            "_timeline_navigation_enabled": True,
+            "responsive_web_graphql"
+            "_skip_user_profile_image_extensions_enabled": False,
             "premium_content_api_read_enabled": False,
             "communities_web_enable_tweet_community_results_fetch": True,
             "c9s_tweet_anatomy_moderator_badge_enabled": True,
             "responsive_web_grok_analyze_button_fetch_trends_enabled": False,
             "responsive_web_grok_analyze_post_followups_enabled": True,
-            "responsive_web_jetfuel_frame": False,
+            "responsive_web_jetfuel_frame": True,
             "responsive_web_grok_share_attachment_enabled": True,
             "articles_preview_enabled": True,
             "responsive_web_edit_tweet_api_enabled": True,
@@ -1212,22 +1280,27 @@ class TwitterAPI():
             "creator_subscriptions_quote_tweet_preview_enabled": False,
             "freedom_of_speech_not_reach_fetch_enabled": True,
             "standardized_nudges_misinfo": True,
-            "tweet_with_visibility_results_"
-            "prefer_gql_limited_actions_policy_enabled": True,
+            "tweet_with_visibility_results"
+            "_prefer_gql_limited_actions_policy_enabled": True,
             "longform_notetweets_rich_text_read_enabled": True,
             "longform_notetweets_inline_media_enabled": True,
             "responsive_web_grok_image_annotation_enabled": True,
+            "responsive_web_grok_imagine_annotation_enabled": True,
+            "responsive_web_grok"
+            "_community_note_auto_translation_is_enabled": False,
             "responsive_web_enhance_cards_enabled": False,
         }

     def tweet_result_by_rest_id(self, tweet_id):
-        endpoint = "/graphql/Vg2Akr5FzUmF0sTplA5k6g/TweetResultByRestId"
+        endpoint = "/graphql/qxWQxcMLiTPcavz9Qy5hwQ/TweetResultByRestId"
         variables = {
             "tweetId": tweet_id,
             "withCommunity": False,
             "includePromotedContent": False,
             "withVoice": False,
         }
+        features = self.features_pagination.copy()
+        del features["rweb_video_screen_enabled"]
         field_toggles = {
             "withArticleRichContentState": True,
             "withArticlePlainText": False,
@@ -1236,7 +1309,7 @@ class TwitterAPI():
         }
         params = {
             "variables"   : self._json_dumps(variables),
-            "features"    : self._json_dumps(self.features_pagination),
+            "features"    : self._json_dumps(features),
             "fieldToggles": self._json_dumps(field_toggles),
         }
         tweet = self._call(endpoint, params)["data"]["tweetResult"]["result"]
@@ -1245,16 +1318,16 @@ class TwitterAPI():

         if tweet.get("__typename") == "TweetUnavailable":
             reason = tweet.get("reason")
-            if reason == "NsfwLoggedOut":
-                raise exception.AuthorizationError("NSFW Tweet")
+            if reason in ("NsfwViewerHasNoStatedAge", "NsfwLoggedOut"):
+                raise exception.AuthRequired(message="NSFW Tweet")
             if reason == "Protected":
-                raise exception.AuthorizationError("Protected Tweet")
+                raise exception.AuthRequired(message="Protected Tweet")
             raise exception.AbortExtraction(f"Tweet unavailable ('{reason}')")

         return tweet

     def tweet_detail(self, tweet_id):
-        endpoint = "/graphql/b9Yw90FMr_zUb8DvA8r2ug/TweetDetail"
+        endpoint = "/graphql/iFEr5AcP121Og4wx9Yqo3w/TweetDetail"
         variables = {
             "focalTweetId": tweet_id,
             "referrer": "profile",
@@ -1278,7 +1351,7 @@ class TwitterAPI():
             field_toggles=field_toggles)

     def user_tweets(self, screen_name):
-        endpoint = "/graphql/M3Hpkrb8pjWkEuGdLeXMOA/UserTweets"
+        endpoint = "/graphql/E8Wq-_jFSaU7hxVcuOPR9g/UserTweets"
         variables = {
             "userId": self._user_id_by_screen_name(screen_name),
             "count": 100,
@@ -1293,7 +1366,7 @@ class TwitterAPI():
             endpoint, variables, field_toggles=field_toggles)

     def user_tweets_and_replies(self, screen_name):
-        endpoint = "/graphql/pz0IHaV_t7T4HJavqqqcIA/UserTweetsAndReplies"
+        endpoint = "/graphql/-O3QOHrVn1aOm_cF5wyTCQ/UserTweetsAndReplies"
         variables = {
             "userId": self._user_id_by_screen_name(screen_name),
             "count": 100,
@@ -1307,8 +1380,22 @@ class TwitterAPI():
         return self._pagination_tweets(
             endpoint, variables, field_toggles=field_toggles)

+    def user_highlights(self, screen_name):
+        endpoint = "/graphql/gmHw9geMTncZ7jeLLUUNOw/UserHighlightsTweets"
+        variables = {
+            "userId": self._user_id_by_screen_name(screen_name),
+            "count": 100,
+            "includePromotedContent": False,
+            "withVoice": True,
+        }
+        field_toggles = {
+            "withArticlePlainText": False,
+        }
+        return self._pagination_tweets(
+            endpoint, variables, field_toggles=field_toggles)
+
     def user_media(self, screen_name):
-        endpoint = "/graphql/8B9DqlaGvYyOvTCzzZWtNA/UserMedia"
+        endpoint = "/graphql/jCRhbOzdgOHp6u9H4g2tEg/UserMedia"
         variables = {
             "userId": self._user_id_by_screen_name(screen_name),
             "count": 100,
@@ -1324,7 +1411,7 @@ class TwitterAPI():
             endpoint, variables, field_toggles=field_toggles)

     def user_likes(self, screen_name):
-        endpoint = "/graphql/uxjTlmrTI61zreSIV1urbw/Likes"
+        endpoint = "/graphql/TGEKkJG_meudeaFcqaxM-Q/Likes"
         variables = {
             "userId": self._user_id_by_screen_name(screen_name),
             "count": 100,
@@ -1340,7 +1427,7 @@ class TwitterAPI():
             endpoint, variables, field_toggles=field_toggles)

     def user_bookmarks(self):
-        endpoint = "/graphql/ztCdjqsvvdL0dE8R5ME0hQ/Bookmarks"
+        endpoint = "/graphql/pLtjrO4ubNh996M_Cubwsg/Bookmarks"
         variables = {
             "count": 100,
             "includePromotedContent": False,
@@ -1348,29 +1435,35 @@ class TwitterAPI():
         return self._pagination_tweets(
             endpoint, variables, ("bookmark_timeline_v2", "timeline"), False)

-    def list_latest_tweets_timeline(self, list_id):
-        endpoint = "/graphql/LSefrrxhpeX8HITbKfWz9g/ListLatestTweetsTimeline"
-        variables = {
-            "listId": list_id,
-            "count": 100,
-        }
-        return self._pagination_tweets(
-            endpoint, variables, ("list", "tweets_timeline", "timeline"))
-
     def search_timeline(self, query, product="Latest"):
-        endpoint = "/graphql/fL2MBiqXPk5pSrOS5ACLdA/SearchTimeline"
+        endpoint = "/graphql/4fpceYZ6-YQCx_JSl_Cn_A/SearchTimeline"
         variables = {
             "rawQuery": query,
             "count": 100,
             "querySource": "typed_query",
             "product": product,
+            "withGrokTranslatedBio": False,
         }
         return self._pagination_tweets(
             endpoint, variables,
             ("search_by_raw_query", "search_timeline", "timeline"))

+    def community_query(self, community_id):
+        endpoint = "/graphql/2W09l7nD7ZbxGQHXvfB22w/CommunityQuery"
+        params = {
+            "variables": self._json_dumps({
+                "communityId": community_id,
+            }),
+            "features": self._json_dumps({
+                "c9s_list_members_action_api_enabled": False,
+                "c9s_superc9s_indication_enabled": False,
+            }),
+        }
+        return (self._call(endpoint, params)
+                ["data"]["communityResults"]["result"])
+
     def community_tweets_timeline(self, community_id):
-        endpoint = "/graphql/awszcpgwaIeqqNfmzjxUow/CommunityTweetsTimeline"
+        endpoint = "/graphql/Nyt-88UX4-pPCImZNUl9RQ/CommunityTweetsTimeline"
         variables = {
             "communityId": community_id,
             "count": 100,
@@ -1384,7 +1477,7 @@ class TwitterAPI():
             "timeline"))

     def community_media_timeline(self, community_id):
-        endpoint = "/graphql/HfMuDHto2j3NKUeiLjKWHA/CommunityMediaTimeline"
+        endpoint = "/graphql/ZniZ7AAK_VVu1xtSx1V-gQ/CommunityMediaTimeline"
         variables = {
             "communityId": community_id,
             "count": 100,
@@ -1396,7 +1489,7 @@ class TwitterAPI():
             "timeline"))

     def communities_main_page_timeline(self, screen_name):
-        endpoint = ("/graphql/NbdrKPY_h_nlvZUg7oqH5Q"
+        endpoint = ("/graphql/p048a9n3hTPppQyK7FQTFw"
                     "/CommunitiesMainPageTimeline")
         variables = {
             "count": 100,
@@ -1406,6 +1499,27 @@ class TwitterAPI():
             endpoint, variables,
             ("viewer", "communities_timeline", "timeline"))

+    def home_timeline(self):
+        endpoint = "/graphql/DXmgQYmIft1oLP6vMkJixw/HomeTimeline"
+        variables = {
+            "count": 100,
+            "includePromotedContent": False,
+            "latestControlAvailable": True,
+            "withCommunity": True,
+        }
+        return self._pagination_tweets(
+            endpoint, variables, ("home", "home_timeline_urt"))
+
+    def home_latest_timeline(self):
+        endpoint = "/graphql/SFxmNKWfN9ySJcXG_tjX8g/HomeLatestTimeline"
+        variables = {
+            "count": 100,
+            "includePromotedContent": False,
+            "latestControlAvailable": True,
+        }
+        return self._pagination_tweets(
+            endpoint, variables, ("home", "home_timeline_urt"))
+
     def live_event_timeline(self, event_id):
         endpoint = f"/2/live_event/timeline/{event_id}.json"
         params = self.params.copy()
@@ -1422,8 +1536,17 @@ class TwitterAPI():
         return (self._call(endpoint, params)
                 ["twitter_objects"]["live_events"][event_id])

+    def list_latest_tweets_timeline(self, list_id):
+        endpoint = "/graphql/06JtmwM8k_1cthpFZITVVA/ListLatestTweetsTimeline"
+        variables = {
+            "listId": list_id,
+            "count": 100,
+        }
+        return self._pagination_tweets(
+            endpoint, variables, ("list", "tweets_timeline", "timeline"))
+
     def list_members(self, list_id):
-        endpoint = "/graphql/v97svwb-qcBmzv6QruDuNg/ListMembers"
+        endpoint = "/graphql/naea_MSad4pOb-D6_oVv_g/ListMembers"
         variables = {
             "listId": list_id,
             "count": 100,
@@ -1432,35 +1555,38 @@ class TwitterAPI():
             endpoint, variables, ("list", "members_timeline", "timeline"))

     def user_followers(self, screen_name):
-        endpoint = "/graphql/jqZ0_HJBA6mnu18iTZYm9w/Followers"
+        endpoint = "/graphql/i6PPdIMm1MO7CpAqjau7sw/Followers"
         variables = {
             "userId": self._user_id_by_screen_name(screen_name),
             "count": 100,
             "includePromotedContent": False,
+            "withGrokTranslatedBio": False,
         }
         return self._pagination_users(endpoint, variables)

     def user_followers_verified(self, screen_name):
-        endpoint = "/graphql/GHg0X_FjrJoISwwLPWi1LQ/BlueVerifiedFollowers"
+        endpoint = "/graphql/fxEl9kp1Tgolqkq8_Lo3sg/BlueVerifiedFollowers"
         variables = {
             "userId": self._user_id_by_screen_name(screen_name),
             "count": 100,
             "includePromotedContent": False,
+            "withGrokTranslatedBio": False,
         }
         return self._pagination_users(endpoint, variables)

     def user_following(self, screen_name):
-        endpoint = "/graphql/4QHbs4wmzgtU91f-t96_Eg/Following"
+        endpoint = "/graphql/SaWqzw0TFAWMx1nXWjXoaQ/Following"
         variables = {
             "userId": self._user_id_by_screen_name(screen_name),
             "count": 100,
             "includePromotedContent": False,
+            "withGrokTranslatedBio": False,
         }
         return self._pagination_users(endpoint, variables)

     @memcache(keyarg=1)
     def user_by_rest_id(self, rest_id):
-        endpoint = "/graphql/5vdJ5sWkbSRDiiNZvwc2Yg/UserByRestId"
+        endpoint = "/graphql/8r5oa_2vD0WkhIAOkY4TTA/UserByRestId"
         features = self.features
         params = {
             "variables": self._json_dumps({
@@ -1472,7 +1598,7 @@ class TwitterAPI():

     @memcache(keyarg=1)
     def user_by_screen_name(self, screen_name):
-        endpoint = "/graphql/32pL5BWe9WKeSK1MoPvFQQ/UserByScreenName"
+        endpoint = "/graphql/ck5KkZ8t5cOmoLssopN99Q/UserByScreenName"
         features = self.features.copy()
         features["subscriptions_verification_info_"
                  "is_identity_verified_enabled"] = True
@@ -1481,6 +1607,7 @@ class TwitterAPI():
         params = {
             "variables": self._json_dumps({
                 "screen_name": screen_name,
+                "withGrokTranslatedBio": False,
             }),
             "features": self._json_dumps(features),
             "fieldToggles": self._json_dumps({
@@ -1618,7 +1745,8 @@ class TwitterAPI():
                 return data
             elif response.status_code in (403, 404) and \
                     not self.headers["x-twitter-auth-type"]:
-                raise exception.AuthorizationError("Login required")
+                raise exception.AuthRequired(
+                    "authenticated cookies", "timeline")
             elif response.status_code == 429:
                 self._handle_ratelimit(response)
                 continue
@@ -1870,19 +1998,16 @@ class TwitterAPI():
                     continue

             if "retweeted_status_result" in legacy:
-                retweet = legacy["retweeted_status_result"]["result"]
-                if "tweet" in retweet:
-                    retweet = retweet["tweet"]
-                if original_retweets:
-                    try:
+                try:
+                    retweet = legacy["retweeted_status_result"]["result"]
+                    if "tweet" in retweet:
+                        retweet = retweet["tweet"]
+                    if original_retweets:
                         retweet["legacy"]["retweeted_status_id_str"] = \
                             retweet["rest_id"]
                         retweet["_retweet_id_str"] = tweet["rest_id"]
                         tweet = retweet
-                    except KeyError:
-                        continue
-                else:
-                    try:
+                    else:
                         legacy["retweeted_status_id_str"] = \
                             retweet["rest_id"]
                         tweet["author"] = \
@@ -1904,8 +2029,11 @@ class TwitterAPI():
                             rtlegacy["withheld_scope"]

                         legacy["full_text"] = rtlegacy["full_text"]
-                    except KeyError:
-                        pass
+                except Exception as exc:
+                    extr.log.debug(
+                        "%s: %s: %s",
+                        tweet.get("rest_id"), exc.__class__.__name__, exc)
+                    continue

             yield tweet
diff --git a/gallery_dl/extractor/vichan.py b/gallery_dl/extractor/vichan.py
index f99b5de..cbb44ee 100644
--- a/gallery_dl/extractor/vichan.py
+++ b/gallery_dl/extractor/vichan.py
@@ -26,6 +26,10 @@ BASE_PATTERN = VichanExtractor.update({
         "root": None,
         "pattern": r"smuglo(?:\.li|li\.net)",
     },
+    "gurochan": {
+        "root": "https://boards.guro.cx",
+        "pattern": r"boards\.guro\.cx",
+    },
 })

diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index 3341594..fca8911 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -63,7 +63,7 @@ class ZerochanExtractor(BooruExtractor):

     def _parse_entry_html(self, entry_id):
         url = f"{self.root}/{entry_id}"
-        page = self.request(url).text
+        page = self.request(url, expected=(500,)).text

         try:
             jsonld = self._extract_jsonld(page)
@@ -191,7 +191,7 @@ class ZerochanTagExtractor(ZerochanExtractor):
         metadata = self.config("metadata")

         while True:
-            page = self.request(url, params=params).text
+            page = self.request(url, params=params, expected=(500,)).text
             thumbs = text.extr(page, '<ul id="thumbs', '</ul>')
             extr = text.extract_from(thumbs)

diff --git a/gallery_dl/extractor/zzup.py b/gallery_dl/extractor/zzup.py
deleted file mode 100644
index 7393931..0000000
--- a/gallery_dl/extractor/zzup.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://zzup.com/"""
-
-from .common import GalleryExtractor
-from .. import text
-
-
-class ZzupGalleryExtractor(GalleryExtractor):
-    category = "zzup"
-    directory_fmt = ("{category}", "{title}")
-    filename_fmt = "{num:>03}.{extension}"
-    archive_fmt = "{slug}_{num}"
-    root = "https://zzup.com"
-    pattern = (r"(?:https?://)?(up\.|w+\.)?zzup\.com(/(?:viewalbum|content)"
-               r"/[\w=]+/([^/?#]+)/[\w=]+)/(?:index|page-\d+)\.html")
-    example = "https://zzup.com/content/xyz=/12345_TITLE/123=/index.html"
-
-    def __init__(self, match):
-        subdomain, path, self.slug = match.groups()
-        if subdomain == "up.":
-            self.root = "https://up.zzup.com"
-            self.images = self.images_v2
-        url = f"{self.root}{path}/index.html"
-        GalleryExtractor.__init__(self, match, url)
-
-    def metadata(self, page):
-        return {
-            "slug" : self.slug,
-            "title": text.unescape(text.extr(
-                page, "<title>", "</title>"))[:-11],
-        }
-
-    def images(self, page):
-        path = text.extr(page, 'class="picbox"><a target="_blank" href="', '"')
-        count = text.parse_int(text.extr(path, "-pics-", "-mirror"))
-        page = self.request(self.root + path).text
-        url = self.root + text.extr(page, '\n<a href="', '"')
-        p1, _, p2 = url.partition("/image0")
-        p2 = p2[4:]
-        return [(f"{p1}/image{i:>05}{p2}", None) for i in range(1, count + 1)]
-
-    def images_v2(self, page):
-        base = f"{self.root}/showimage/"
-        results = []
-
-        while True:
-            for path in text.extract_iter(
-                    page, ' class="picbox"><a target="_blank" href="', '"'):
-                url = f"{base}{'/'.join(path.split('/')[2:-2])}/zzup.com.jpg"
-                results.append((url, None))
-
-            pos = page.find("glyphicon-arrow-right")
-            if pos < 0:
-                break
-            path = text.rextr(page, ' href="', '"', pos)
-            page = self.request(text.urljoin(self.page_url, path)).text
-
-        return results
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index e6913b0..8020352 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,5 +6,5 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-__version__ = "1.30.4"
+__version__ = "1.30.5"
 __variant__ = None
