Diffstat (limited to 'gallery_dl')

 -rw-r--r--   gallery_dl/exception.py               6
 -rw-r--r--   gallery_dl/extractor/blogger.py      16
 -rw-r--r--   gallery_dl/extractor/common.py       44
 -rw-r--r--   gallery_dl/extractor/gelbooru.py      4
 -rw-r--r--   gallery_dl/extractor/hitomi.py       26
 -rw-r--r--   gallery_dl/extractor/instagram.py    14
 -rw-r--r--   gallery_dl/extractor/kemonoparty.py   4
 -rw-r--r--   gallery_dl/extractor/mangadex.py      9
 -rw-r--r--   gallery_dl/extractor/newgrounds.py    8
 -rw-r--r--   gallery_dl/extractor/philomena.py     2
 -rw-r--r--   gallery_dl/extractor/reddit.py       10
 -rw-r--r--   gallery_dl/extractor/sexcom.py        3
 -rw-r--r--   gallery_dl/extractor/twitter.py     500
 -rw-r--r--   gallery_dl/option.py                  5
 -rw-r--r--   gallery_dl/version.py                 4

15 files changed, 491 insertions(+), 164 deletions(-)
diff --git a/gallery_dl/exception.py b/gallery_dl/exception.py
index 0433dc9..5120039 100644
--- a/gallery_dl/exception.py
+++ b/gallery_dl/exception.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2015-2021 Mike Fährmann
+# Copyright 2015-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -33,12 +33,12 @@ class GalleryDLException(Exception):
     msgfmt = None
     code = 1
 
-    def __init__(self, message=None):
+    def __init__(self, message=None, fmt=True):
         if not message:
             message = self.default
         elif isinstance(message, Exception):
             message = "{}: {}".format(message.__class__.__name__, message)
-        if self.msgfmt:
+        if self.msgfmt and fmt:
             message = self.msgfmt.format(message)
         Exception.__init__(self, message)
 
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index 9a86cc4..eef87f9 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2019-2020 Mike Fährmann
+# Copyright 2019-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -41,9 +41,11 @@ class BloggerExtractor(Extractor):
         blog["date"] = text.parse_datetime(blog["published"])
         del blog["selfLink"]
 
-        sub = re.compile(r"/(?:s\d+|w\d+-h\d+)/").sub
+        sub = re.compile(r"(/|=)(?:s\d+|w\d+-h\d+)(?=/|$)").sub
         findall_image = re.compile(
-            r'src="(https?://\d+\.bp\.blogspot\.com/[^"]+)').findall
+            r'src="(https?://(?:'
+            r'blogger\.googleusercontent\.com/img|'
+            r'\d+\.bp\.blogspot\.com)/[^"]+)').findall
         findall_video = re.compile(
             r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall
 
@@ -52,7 +54,7 @@ class BloggerExtractor(Extractor):
 
             files = findall_image(content)
             for idx, url in enumerate(files):
-                files[idx] = sub("/s0/", url).replace("http:", "https:", 1)
+                files[idx] = sub(r"\1s0", url).replace("http:", "https:", 1)
 
             if self.videos and 'id="BLOG_video-' in content:
                 page = self.request(post["url"]).text
@@ -137,6 +139,12 @@
         ("https://aaaninja.blogspot.com/2020/08/altera-boob-press-2.html", {
             "pattern": r"https://1.bp.blogspot.com/.+/s0/altera_.+png",
         }),
+        # new image domain (#2204)
+        (("https://randomthingsthroughmyletterbox.blogspot.com/2022/01"
+          "/bitter-flowers-by-gunnar-staalesen-blog.html"), {
+            "pattern": r"https://blogger.googleusercontent.com/img/a/.+=s0$",
+            "count": 8,
+        }),
     )
 
     def __init__(self, match):
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index afe4a16..52e5199 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -220,6 +220,14 @@ class Extractor():
         headers = session.headers
         headers.clear()
 
+        source_address = self.config("source-address")
+        if source_address:
+            if isinstance(source_address, str):
+                source_address = (source_address, 0)
+            else:
+                source_address = (source_address[0], source_address[1])
+            session.mount("http://", SourceAdapter(source_address))
+
         browser = self.config("browser") or self.browser
         if browser and isinstance(browser, str):
             browser, _, platform = browser.lower().partition(":")
@@ -235,10 +243,12 @@ class Extractor():
                 platform = "Macintosh; Intel Mac OS X 11.5"
 
             if browser == "chrome":
-                _emulate_browser_chrome(session, platform)
+                _emulate_browser_chrome(session, platform, source_address)
             else:
-                _emulate_browser_firefox(session, platform)
+                _emulate_browser_firefox(session, platform, source_address)
         else:
+            if source_address:
+                session.mount("https://", SourceAdapter(source_address))
             headers["User-Agent"] = self.config("user-agent", (
                 "Mozilla/5.0 (Windows NT 10.0; Win64; x64; "
                 "rv:91.0) Gecko/20100101 Firefox/91.0"))
@@ -605,26 +615,44 @@ class BaseExtractor(Extractor):
 )
 
 
+class SourceAdapter(HTTPAdapter):
+
+    def __init__(self, source_address):
+        self.source_address = source_address
+        HTTPAdapter.__init__(self)
+
+    def init_poolmanager(self, *args, **kwargs):
+        kwargs["source_address"] = self.source_address
+        return HTTPAdapter.init_poolmanager(self, *args, **kwargs)
+
+    def proxy_manager_for(self, *args, **kwargs):
+        kwargs["source_address"] = self.source_address
+        return HTTPAdapter.proxy_manager_for(self, *args, **kwargs)
+
+
 class HTTPSAdapter(HTTPAdapter):
 
-    def __init__(self, ciphers):
+    def __init__(self, ciphers, source_address=None):
         context = self.ssl_context = ssl.create_default_context()
         context.options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 |
                             ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)
         context.set_ecdh_curve("prime256v1")
         context.set_ciphers(ciphers)
+        self.source_address = source_address
         HTTPAdapter.__init__(self)
 
     def init_poolmanager(self, *args, **kwargs):
         kwargs["ssl_context"] = self.ssl_context
+        kwargs["source_address"] = self.source_address
         return HTTPAdapter.init_poolmanager(self, *args, **kwargs)
 
     def proxy_manager_for(self, *args, **kwargs):
         kwargs["ssl_context"] = self.ssl_context
+        kwargs["source_address"] = self.source_address
         return HTTPAdapter.proxy_manager_for(self, *args, **kwargs)
 
 
-def _emulate_browser_firefox(session, platform):
+def _emulate_browser_firefox(session, platform, source_address):
     headers = session.headers
     headers["User-Agent"] = ("Mozilla/5.0 (" + platform + "; rv:91.0) "
                              "Gecko/20100101 Firefox/91.0")
@@ -654,11 +682,12 @@ def _emulate_browser_firefox(session, platform):
         "DHE-RSA-AES256-SHA:"
         "AES128-SHA:"
         "AES256-SHA:"
-        "DES-CBC3-SHA"
+        "DES-CBC3-SHA",
+        source_address
     ))
 
 
-def _emulate_browser_chrome(session, platform):
+def _emulate_browser_chrome(session, platform, source_address):
     if platform.startswith("Macintosh"):
         platform = platform.replace(".", "_") + "_2"
 
@@ -690,7 +719,8 @@
         "AES256-GCM-SHA384:"
         "AES128-SHA:"
         "AES256-SHA:"
-        "DES-CBC3-SHA"
+        "DES-CBC3-SHA",
+        source_address
     ))
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index a6bda52..fd26192 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2014-2021 Mike Fährmann
+# Copyright 2014-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -59,7 +59,7 @@ class GelbooruBase():
     @staticmethod
     def _file_url(post):
         url = post["file_url"]
-        if url.startswith(("https://mp4.gelbooru.com/", "https://video-cdn")):
+        if url.endswith((".webm", ".mp4")):
             md5 = post["md5"]
             path = "/images/{}/{}/{}.webm".format(md5[0:2], md5[2:4], md5)
             post["_fallback"] = GelbooruBase._video_fallback(path)
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index ce6c7ce..e132bf9 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -159,6 +159,7 @@ class HitomiTagExtractor(Extractor):
     """Extractor for galleries from tag searches on hitomi.la"""
     category = "hitomi"
     subcategory = "tag"
+    root = "https://hitomi.la"
     pattern = (r"(?:https?://)?hitomi\.la/"
                r"(tag|artist|group|series|type|character)/"
                r"([^/?#]+)\.html")
@@ -183,12 +184,29 @@ class HitomiTagExtractor(Extractor):
         self.tag = tag
 
     def items(self):
-        url = "https://ltn.hitomi.la/{}/{}.nozomi".format(self.type, self.tag)
         data = {"_extractor": HitomiGalleryExtractor}
+        nozomi_url = "https://ltn.hitomi.la/{}/{}.nozomi".format(
+            self.type, self.tag)
+        headers = {
+            "Origin": self.root,
+            "Cache-Control": "max-age=0",
+        }
 
-        for gallery_id in decode_nozomi(self.request(url).content):
-            url = "https://hitomi.la/galleries/{}.html".format(gallery_id)
-            yield Message.Queue, url, data
+        offset = 0
+        while True:
+            headers["Referer"] = "{}/{}/{}.html?page={}".format(
+                self.root, self.type, self.tag, offset // 100 + 1)
+            headers["Range"] = "bytes={}-{}".format(offset, offset+99)
+            nozomi = self.request(nozomi_url, headers=headers).content
+
+            for gallery_id in decode_nozomi(nozomi):
+                gallery_url = "{}/galleries/{}.html".format(
+                    self.root, gallery_id)
+                yield Message.Queue, gallery_url, data
+
+            if len(nozomi) < 100:
+                return
+            offset += 100
 
 
 @memcache()
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 781bf01..20a4c1a 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -748,13 +748,19 @@ class InstagramHighlightsExtractor(InstagramExtractor):
         endpoint = "/v1/highlights/{}/highlights_tray/".format(user["id"])
         tray = self._request_api(endpoint)["tray"]
-
         reel_ids = [highlight["id"] for highlight in tray]
+
+        # Anything above 30 responds with statuscode 400.
+        # 30 can work, however, sometimes the API will respond with 560 or 500.
+        chunk_size = 5
 
         endpoint = "/v1/feed/reels_media/"
-        params = {"reel_ids": reel_ids}
-        reels = self._request_api(endpoint, params=params)["reels"]
-        return [reels[rid] for rid in reel_ids]
+        for offset in range(0, len(reel_ids), chunk_size):
+            chunk_ids = reel_ids[offset : offset+chunk_size]
+            params = {"reel_ids": chunk_ids}
+            reels = self._request_api(endpoint, params=params)["reels"]
+            for reel_id in chunk_ids:
+                yield reels[reel_id]
 
 
 class InstagramReelsExtractor(InstagramExtractor):
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index f1d7bcf..beb992c 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2021 Mike Fährmann
+# Copyright 2021-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -180,7 +180,7 @@ class KemonopartyExtractor(Extractor):
         for dm in text.extract_iter(page, "<article", "</article>"):
             dms.append({
                 "body": text.unescape(text.extract(
-                    dm, '<div class="dm-card__content">', '</div>',
+                    dm, '<pre>', '</pre></section>',
                 )[0].strip()),
                 "date": text.extract(dm, 'datetime="', '"')[0],
             })
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index ea5d4a8..152da4f 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2018-2021 Mike Fährmann
+# Copyright 2018-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -73,6 +73,7 @@ class MangadexExtractor(Extractor):
             "lang"    : lang,
             "language": util.code_to_language(lang),
             "count"   : cattributes["pages"],
+            "_external_url": cattributes.get("externalUrl"),
         }
 
         data["artist"] = [artist["attributes"]["name"]
@@ -112,6 +113,12 @@ class MangadexChapterExtractor(MangadexExtractor):
         chapter = self.api.chapter(self.uuid)
         data = self._transform(chapter)
 
+        if data.get("_external_url"):
+            raise exception.StopExtraction(
+                "Chapter %s%s is not available on MangaDex and can instead be "
+                "read on the official publisher's website at %s.",
+                data["chapter"], data["chapter_minor"], data["_external_url"])
+
         yield Message.Directory, data
 
         data["_http_headers"] = self._headers
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 8bcbc20..54e2040 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2018-2021 Mike Fährmann
+# Copyright 2018-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -529,6 +529,12 @@ class NewgroundsSearchExtractor(NewgroundsExtractor):
         self.query = text.parse_query(query)
 
     def posts(self):
+        suitabilities = self.query.get("suitabilities")
+        if suitabilities:
+            data = {"view_suitability_" + s: "on"
+                    for s in suitabilities.split(",")}
+            self.request(self.root + "/suitabilities",
+                         method="POST", data=data)
         return self._pagination("/search/conduct/" + self._path, self.query)
 
     def metadata(self):
diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
index 6377fb0..92b8113 100644
--- a/gallery_dl/extractor/philomena.py
+++ b/gallery_dl/extractor/philomena.py
@@ -172,7 +172,7 @@ class PhilomenaSearchExtractor(PhilomenaExtractor):
         PhilomenaExtractor.__init__(self, match)
         groups = match.groups()
         if groups[-1]:
-            q = groups[-1]
+            q = groups[-1].replace("+", " ")
             for old, new in (
                 ("-colon-"  , ":"),
                 ("-dash-"   , "-"),
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 55c963d..f7809de 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2017-2021 Mike Fährmann
+# Copyright 2017-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -19,7 +19,7 @@ class RedditExtractor(Extractor):
     directory_fmt = ("{category}", "{subreddit}")
     filename_fmt = "{id}{num:? //>02} {title[:220]}.{extension}"
     archive_fmt = "{filename}"
-    cookiedomain = None
+    cookiedomain = ".reddit.com"
 
     def items(self):
         self.api = RedditAPI(self)
@@ -301,6 +301,12 @@ class RedditAPI():
         else:
             self.refresh_token = token
 
+        if not self.refresh_token:
+            # allow downloading from quarantined subreddits (#2180)
+            extractor._cookiejar.set(
+                "_options", '%7B%22pref_quarantine_optin%22%3A%20true%7D',
+                domain=extractor.cookiedomain)
+
     def submission(self, submission_id):
         """Fetch the (submission, comments)=-tuple for a submission id"""
         endpoint = "/comments/" + submission_id + "/.json"
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
index 199b1ba..9f4bfc3 100644
--- a/gallery_dl/extractor/sexcom.py
+++ b/gallery_dl/extractor/sexcom.py
@@ -197,7 +197,7 @@ class SexcomSearchExtractor(SexcomExtractor):
     subcategory = "search"
     directory_fmt = ("{category}", "search", "{search[query]}")
     pattern = (r"(?:https?://)?(?:www\.)?sex\.com/((?:"
-               r"(pic|gif|video)s/([^/?#]+)|search/(pic|gif|video)s"
+               r"(pic|gif|video)s/([^/?#]*)|search/(pic|gif|video)s"
               r")/?(?:\?([^#]+))?)")
     test = (
         ("https://www.sex.com/search/pics?query=ecchi", {
@@ -208,6 +208,7 @@
             "range": "1-10",
             "count": 10,
         }),
+        ("https://www.sex.com/pics/?sort=popular&sub=all&page=1"),
     )
 
     def __init__(self, match):
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index a49f1f2..f924292 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2016-2021 Mike Fährmann
+# Copyright 2016-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -39,7 +39,7 @@ class TwitterExtractor(Extractor):
         self.pinned = self.config("pinned", False)
         self.quoted = self.config("quoted", False)
         self.videos = self.config("videos", True)
-        self.cards = self.config("cards", False)
+        self.cards = self.config("cards", True)
         self._user_cache = {}
         self._init_sizes()
 
@@ -56,32 +56,39 @@ class TwitterExtractor(Extractor):
     def items(self):
         self.login()
+        self.api = TwitterAPI(self)
         metadata = self.metadata()
 
         for tweet in self.tweets():
+            if "legacy" in tweet:
+                data = tweet["legacy"]
+            else:
+                data = tweet
+
-            if not self.retweets and "retweeted_status_id_str" in tweet:
-                self.log.debug("Skipping %s (retweet)", tweet["id_str"])
+            if not self.retweets and "retweeted_status_id_str" in data:
+                self.log.debug("Skipping %s (retweet)", data["id_str"])
                 continue
-            if not self.quoted and "quoted_by_id_str" in tweet:
-                self.log.debug("Skipping %s (quoted tweet)", tweet["id_str"])
+            if not self.quoted and "quoted_by_id_str" in data:
+                self.log.debug("Skipping %s (quoted tweet)", data["id_str"])
                 continue
-            if "in_reply_to_user_id_str" in tweet and (
+            if "in_reply_to_user_id_str" in data and (
                 not self.replies or (
                     self.replies == "self" and
-                    tweet["in_reply_to_user_id_str"] != tweet["user_id_str"]
+                    data["in_reply_to_user_id_str"] != data["user_id_str"]
                 )
             ):
-                self.log.debug("Skipping %s (reply)", tweet["id_str"])
+                self.log.debug("Skipping %s (reply)", data["id_str"])
                 continue
 
             files = []
-            if "extended_entities" in tweet:
-                self._extract_media(tweet, files)
+            if "extended_entities" in data:
+                self._extract_media(
+                    data, data["extended_entities"]["media"], files)
             if "card" in tweet and self.cards:
                 self._extract_card(tweet, files)
             if self.twitpic:
-                self._extract_twitpic(tweet, files)
+                self._extract_twitpic(data, files)
             if not files and not self.textonly:
                 continue
@@ -95,8 +102,8 @@
                 text.nameext_from_url(url, file)
                 yield Message.Url, url, file
 
-    def _extract_media(self, tweet, files):
-        for media in tweet["extended_entities"]["media"]:
+    def _extract_media(self, tweet, entities, files):
+        for media in entities:
             width = media["original_info"].get("width", 0)
             height = media["original_info"].get("height", 0)
@@ -142,8 +149,17 @@
     def _extract_card(self, tweet, files):
         card = tweet["card"]
-        if card["name"] in ("summary", "summary_large_image"):
+        if "legacy" in card:
+            card = card["legacy"]
+        name = card["name"]
+
+        if name in ("summary", "summary_large_image"):
             bvals = card["binding_values"]
+            if isinstance(bvals, list):
+                bvals = {
+                    bval["key"]: bval["value"]
+                    for bval in card["binding_values"]
+                }
             for prefix in ("photo_image_full_size_",
                            "summary_photo_image_",
                            "thumbnail_image_"):
@@ -154,8 +170,24 @@
                 if value and "url" in value:
                     files.append(value)
                     return
-        elif self.videos:
-            url = "ytdl:{}/i/web/status/{}".format(self.root, tweet["id_str"])
+        elif name == "unified_card":
+            bvals = card["binding_values"]
+            if isinstance(bvals, list):
+                for bval in card["binding_values"]:
+                    if bval["key"] == "unified_card":
+                        bval = bval["value"]["string_value"]
+                        break
+            else:
+                bval = bvals["unified_card"]["string_value"]
+            data = json.loads(bval)
+            if data.get("type") == "image_carousel_website":
+                self._extract_media(
+                    tweet, data["media_entities"].values(), files)
+                return
+
+        if self.cards == "ytdl":
+            tweet_id = tweet.get("rest_id") or tweet["id_str"]
+            url = "ytdl:{}/i/web/status/{}".format(self.root, tweet_id)
             files.append({"url": url})
 
     def _extract_twitpic(self, tweet, files):
@@ -171,6 +203,15 @@
             files.append({"url": url})
 
     def _transform_tweet(self, tweet):
+        if "core" in tweet:
+            user = self._transform_user(
+                tweet["core"]["user_results"]["result"])
+        else:
+            user = self._transform_user(tweet["user"])
+
+        if "legacy" in tweet:
+            tweet = tweet["legacy"]
+
         entities = tweet["entities"]
         tdata = {
             "tweet_id"  : text.parse_int(tweet["id_str"]),
@@ -182,7 +223,7 @@
                 tweet.get("in_reply_to_status_id_str")),
             "date"      : text.parse_datetime(
                 tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
-            "user"      : self._transform_user(tweet["user"]),
+            "user"      : user,
             "lang"      : tweet["lang"],
             "favorite_count": tweet["favorite_count"],
             "quote_count"   : tweet["quote_count"],
@@ -224,11 +265,13 @@
     def _transform_user(self, user):
         try:
-            return self._user_cache[user["id_str"]]
+            return self._user_cache[user.get("rest_id") or user["id_str"]]
         except KeyError:
             pass
 
-        uid = user["id_str"]
+        uid = user.get("rest_id") or user["id_str"]
+        if "legacy" in user:
+            user = user["legacy"]
         entities = user["entities"]
 
         self._user_cache[uid] = udata = {
@@ -340,6 +383,10 @@ class TwitterTimelineExtractor(TwitterExtractor):
             "range": "1-40",
             "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
         }),
+        # suspended account (#2216)
+        ("https://twitter.com/realDonaldTrump", {
+            "exception": exception.NotFoundError,
+        }),
         ("https://mobile.twitter.com/supernaturepics?p=i"),
         ("https://www.twitter.com/id:2976459548"),
         ("https://twitter.com/i/user/2976459548"),
@@ -353,7 +400,7 @@
             self.user = "id:" + user_id
 
     def tweets(self):
-        return TwitterAPI(self).timeline_profile(self.user)
+        return self.api.user_tweets(self.user)
 
 
 class TwitterRepliesExtractor(TwitterExtractor):
@@ -370,7 +417,7 @@
     )
 
     def tweets(self):
-        return TwitterAPI(self).timeline_profile(self.user, replies=True)
+        return self.api.user_tweets_and_replies(self.user)
 
 
 class TwitterMediaExtractor(TwitterExtractor):
@@ -387,7 +434,7 @@
     )
 
     def tweets(self):
-        return TwitterAPI(self).timeline_media(self.user)
+        return self.api.user_media(self.user)
 
 
 class TwitterLikesExtractor(TwitterExtractor):
@@ -400,7 +447,7 @@
         return {"user_likes": self.user}
 
     def tweets(self):
-        return TwitterAPI(self).timeline_favorites(self.user)
+        return self.api.user_likes(self.user)
 
 
 class TwitterBookmarkExtractor(TwitterExtractor):
@@ -410,7 +457,7 @@
     test = ("https://twitter.com/i/bookmarks",)
 
     def tweets(self):
-        return TwitterAPI(self).timeline_bookmark()
+        return self.api.user_bookmarks()
 
 
 class TwitterListExtractor(TwitterExtractor):
@@ -424,7 +471,7 @@
     })
 
     def tweets(self):
-        return TwitterAPI(self).timeline_list(self.user)
+        return self.api.list_latest_tweets_timeline(self.user)
 
 
 class TwitterListMembersExtractor(TwitterExtractor):
@@ -453,7 +500,7 @@ class TwitterFollowingExtractor(TwitterExtractor):
 
 
 class TwitterSearchExtractor(TwitterExtractor):
-    """Extractor for all images from a search timeline"""
+    """Extractor for Twitter search results"""
    subcategory = "search"
     pattern = BASE_PATTERN + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)"
     test = ("https://twitter.com/search?q=nature", {
@@ -466,7 +513,25 @@
         return {"search": text.unquote(self.user)}
 
     def tweets(self):
-        return TwitterAPI(self).search(text.unquote(self.user))
+        return self.api.search_adaptive(text.unquote(self.user))
+
+
+class TwitterEventExtractor(TwitterExtractor):
+    """Extractor for Tweets from a Twitter Event"""
+    subcategory = "event"
+    directory_fmt = ("{category}", "Events",
+                     "{event[id]} {event[short_title]}")
+    pattern = BASE_PATTERN + r"/i/events/(\d+)"
+    test = ("https://twitter.com/i/events/1484669206993903616", {
+        "range": "1-20",
+        "count": ">5",
+    })
+
+    def metadata(self):
+        return {"event": self.api.live_event(self.user)}
+
+    def tweets(self):
+        return self.api.live_event_timeline(self.user)
 
 
 class TwitterTweetExtractor(TwitterExtractor):
@@ -531,7 +596,7 @@ class TwitterTweetExtractor(TwitterExtractor):
         }),
         # TwitPic embeds (#579)
         ("https://twitter.com/i/web/status/112900228289540096", {
-            "options": (("twitpic", True),),
+            "options": (("twitpic", True), ("cards", False)),
             "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg",
             "count": 3,
         }),
@@ -545,6 +610,16 @@
             "options": (("cards", True),),
             "pattern": r"https://pbs.twimg.com/card_img/\d+/",
         }),
+        # unified_card with image_carousel_website
+        ("https://twitter.com/doax_vv_staff/status/1479438945662685184", {
+            "options": (("cards", True),),
+            "pattern": r"https://pbs\.twimg\.com/media/F.+=png",
+            "count": 6,
+        }),
+        # unified_card without type
+        ("https://twitter.com/i/web/status/1466183847628865544", {
+            "count": 0,
+        }),
         # original retweets (#1026)
         ("https://twitter.com/jessica_3978/status/1296304589591810048", {
             "options": (("retweets", "original"),),
@@ -565,6 +640,10 @@
             "options": (("retweets", True),),
             "count": 4,
         }),
+        # deleted quote tweet (#2225)
+        ("https://twitter.com/i/web/status/1460044411165888515", {
+            "count": 0,
+        }),
     )
 
     def __init__(self, match):
@@ -573,8 +652,19 @@
     def tweets(self):
         if self.config("conversations", False):
-            return TwitterAPI(self).conversation(self.tweet_id)
-        return TwitterAPI(self).tweet(self.tweet_id)
+            return self.api.tweet_detail(self.tweet_id)
+
+        tweets = []
+        tweet_id = self.tweet_id
+        for tweet in self.api.tweet_detail(tweet_id):
+            if tweet["rest_id"] == tweet_id or \
+                    tweet.get("_retweet_id_str") == tweet_id:
+                tweets.append(tweet)
+
+                tweet_id = tweet["legacy"].get("quoted_status_id_str")
+                if not tweet_id:
+                    break
+        return tweets
 
 
 class TwitterImageExtractor(Extractor):
@@ -634,6 +724,7 @@ class TwitterAPI():
             "include_mute_edge": "1",
             "include_can_dm": "1",
             "include_can_media_tag": "1",
+            "include_ext_has_nft_avatar": "1",
             "skip_status": "1",
             "cards_platform": "Web-12",
             "include_cards": "1",
@@ -645,12 +736,30 @@
             "include_user_entities": "true",
             "include_ext_media_color": "true",
             "include_ext_media_availability": "true",
+            "include_ext_sensitive_media_warning": "true",
             "send_error_codes": "true",
             "simple_quoted_tweet": "true",
             "count": "100",
             "cursor": None,
-            "ext": "mediaStats,highlightedLabel",
+            "ext": "mediaStats,highlightedLabel,hasNftAvatar,"
+                   "voiceInfo,superFollowMetadata",
+        }
+        self.variables = {
+            "includePromotedContent": False,
+            "withSuperFollowsUserFields": True,
+            "withBirdwatchPivots": False,
+            "withDownvotePerspective": False,
+            "withReactionsMetadata": False,
+            "withReactionsPerspective": False,
+            "withSuperFollowsTweetFields": True,
+            "withClientEventToken": False,
+            "withBirdwatchNotes": False,
+            "withVoice": True,
+            "withV2Timeline": False,
+            "__fs_interactive_text": False,
+            "__fs_dont_mention_me_view_api_enabled": False,
         }
+        self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
 
         cookies = extractor.session.cookies
         cookiedomain = extractor.cookiedomain
@@ -671,54 +780,70 @@ class TwitterAPI():
             cookies.set("gt", guest_token, domain=cookiedomain)
             self.headers["x-guest-token"] = guest_token
 
-    def tweet(self, tweet_id):
-        endpoint = "/2/timeline/conversation/{}.json".format(tweet_id)
-        tweets = []
-        for tweet in self._pagination(endpoint):
-            if tweet["id_str"] == tweet_id or \
-                    tweet.get("_retweet_id_str") == tweet_id:
-                tweets.append(tweet)
-                if "quoted_status_id_str" in tweet:
-                    tweet_id = tweet["quoted_status_id_str"]
-                else:
-                    break
-        return tweets
+    def tweet_detail(self, tweet_id):
+        endpoint = "/graphql/aD0-HB47XIOxiBl5kTkX5Q/TweetDetail"
+        variables = {
+            "focalTweetId": tweet_id,
+            "with_rux_injections": False,
+            "withCommunity": True,
+            "withQuickPromoteEligibilityTweetFields": True,
+            "withBirdwatchNotes": False,
+        }
+        return self._pagination_tweets(
+            endpoint, variables, ("threaded_conversation_with_injections",))
 
-    def conversation(self, conversation_id):
-        endpoint = "/2/timeline/conversation/{}.json".format(conversation_id)
-        return self._pagination(endpoint)
+    def user_tweets(self, screen_name):
+        endpoint = "/graphql/LNhjy8t3XpIrBYM-ms7sPQ/UserTweets"
+        variables = {
+            "userId": self._user_id_by_screen_name(screen_name),
+            "count": 100,
+            "withQuickPromoteEligibilityTweetFields": True,
+        }
+        return self._pagination_tweets(endpoint, variables)
 
-    def timeline_profile(self, screen_name, replies=False):
-        user_id = self._user_id_by_screen_name(screen_name)
-        endpoint = "/2/timeline/profile/{}.json".format(user_id)
-        params = self.params.copy()
-        params["include_tweet_replies"] = "true" if replies else "false"
-        return self._pagination(endpoint, params)
+    def user_tweets_and_replies(self, screen_name):
+        endpoint = "/graphql/Vg5aF036K40ST3FWvnvRGA/UserTweetsAndReplies"
+        variables = {
+            "userId": self._user_id_by_screen_name(screen_name),
+            "count": 100,
+            "withCommunity": True,
+        }
+        return self._pagination_tweets(endpoint, variables)
 
-    def timeline_media(self, screen_name):
-        user_id = self._user_id_by_screen_name(screen_name)
-        endpoint = "/2/timeline/media/{}.json".format(user_id)
-        return self._pagination(endpoint)
+    def user_media(self, screen_name):
+        endpoint = "/graphql/Hl6C7ac051l_QBe3HjGz_A/UserMedia"
+        variables = {
+            "userId": self._user_id_by_screen_name(screen_name),
+            "count": 100,
+        }
+        return self._pagination_tweets(endpoint, variables)
 
-    def timeline_favorites(self, screen_name):
-        user_id = self._user_id_by_screen_name(screen_name)
-        endpoint = "/2/timeline/favorites/{}.json".format(user_id)
-        params = self.params.copy()
-        params["sorted_by_time"] = "true"
-        return self._pagination(endpoint)
+    def user_likes(self, screen_name):
+        endpoint = "/graphql/smISlRVSnz-GaU_XpU_akw/Likes"
+        variables = {
+            "userId": self._user_id_by_screen_name(screen_name),
+            "count": 100,
+        }
+        return self._pagination_tweets(endpoint, variables)
 
-    def timeline_bookmark(self):
-        endpoint = "/2/timeline/bookmark.json"
-        return self._pagination(endpoint)
+    def user_bookmarks(self):
+        endpoint = "/graphql/yKNebSjZKbo2tOd-Qdc7Xg/Bookmarks"
+        variables = {
+            "count": 100,
+        }
+        return self._pagination_tweets(
+            endpoint, variables, ("bookmark_timeline", "timeline"))
 
-    def timeline_list(self, list_id):
-        endpoint = "/2/timeline/list.json"
-        params = self.params.copy()
-        params["list_id"] = list_id
-        params["ranking_mode"] = "reverse_chronological"
-        return self._pagination(endpoint, params)
+    def list_latest_tweets_timeline(self, list_id):
+        endpoint = "/graphql/RxUL5UHi4Msxt_P9O1729w/ListLatestTweetsTimeline"
+        variables = {
+            "listId": list_id,
+            "count": 100,
+        }
+        return self._pagination_tweets(
+            endpoint, variables, ("list", "tweets_timeline", "timeline"))
 
-    def search(self, query):
+    def search_adaptive(self, query):
         endpoint = "/2/search/adaptive.json"
         params = self.params.copy()
         params["q"] = query
@@ -726,55 +851,77 @@ class TwitterAPI():
         params["tweet_search_mode"] = "live"
         params["query_source"] = "typed_query"
         params["pc"] = "1"
         params["spelling_corrections"] = "1"
-        return self._pagination(endpoint, params)
+        return self._pagination_legacy(endpoint, params)
+
+    def live_event_timeline(self, event_id):
+        endpoint = "/2/live_event/timeline/{}.json".format(event_id)
+        params = self.params.copy()
+        params["timeline_id"] = "recap"
+        params["urt"] = "true"
+        params["get_annotations"] = "true"
+        return self._pagination_legacy(endpoint, params)
+
+    def live_event(self, event_id):
+        endpoint = "/1.1/live_event/1/{}/timeline.json".format(event_id)
+        params = self.params.copy()
+        params["count"] = "0"
+        params["urt"] = "true"
+        return (self._call(endpoint, params)
+                ["twitter_objects"]["live_events"][event_id])
 
     def list_by_rest_id(self, list_id):
-        endpoint = "/graphql/18MAHTcDU-TdJSjWWmoH7w/ListByRestId"
-        params = {"variables": '{"listId":"' + list_id + '"'
-                               ',"withUserResult":false}'}
+        endpoint = "/graphql/BWEhzAk7k8TwbU4lKH2dpw/ListByRestId"
+        params = {"variables": self._json_dumps({
+            "listId": list_id,
+            "withSuperFollowsUserFields": True,
+        })}
         try:
             return self._call(endpoint, params)["data"]["list"]
         except KeyError:
             raise exception.NotFoundError("list")
 
     def list_members(self, list_id):
-        endpoint = "/graphql/tA7h9hy4U0Yc9COfIOh3qQ/ListMembers"
+        endpoint = "/graphql/kk9RQtSa2sc-4_9figZVBw/ListMembers"
         variables = {
             "listId": list_id,
-            "count" : 100,
-            "withTweetResult": False,
-            "withUserResult" : False,
+            "count": 100,
+            "withSafetyModeUserFields": True,
         }
-        return self._pagination_graphql(
-            endpoint, variables, "list", "members_timeline")
+        return self._pagination_users(
+            endpoint, variables, ("list", "members_timeline", "timeline"))
 
     def user_following(self, screen_name):
-        endpoint = "/graphql/Q_QTiPvoXwsA13eoA7okIQ/Following"
+        endpoint = "/graphql/kz464_e4MAOXc3bGOA9kow/Following"
         variables = {
             "userId": self._user_id_by_screen_name(screen_name),
-            "count" : 100,
-            "withTweetResult": False,
-            "withUserResult" : False,
-            "withTweetQuoteCount"   : False,
-            "withHighlightedLabel"  : False,
-            "includePromotedContent": False,
+            "count": 100,
         }
-        return self._pagination_graphql(
-            endpoint, variables, "user", "following_timeline")
+        return self._pagination_users(endpoint, variables)
 
     def user_by_screen_name(self, screen_name):
-        endpoint = "/graphql/hc-pka9A7gyS3xODIafnrQ/UserByScreenName"
-        params = {"variables": '{"screen_name":"' + screen_name + '"'
-                               ',"withHighlightedLabel":true}'}
-        try:
-            return self._call(endpoint, params)["data"]["user"]
-        except KeyError:
-            raise exception.NotFoundError("user")
+        endpoint = "/graphql/7mjxD3-C6BxitPMVQ6w0-Q/UserByScreenName"
+        params = {"variables": self._json_dumps({
+            "screen_name": screen_name,
+            "withSafetyModeUserFields": True,
+            "withSuperFollowsUserFields": True,
+        })}
+        return self._call(endpoint, params)["data"]["user"]["result"]
 
     def _user_id_by_screen_name(self, screen_name):
         if screen_name.startswith("id:"):
             return screen_name[3:]
-        return self.user_by_screen_name(screen_name)["rest_id"]
+
+        user = ()
+        try:
+            user = self.user_by_screen_name(screen_name)
+            return user["rest_id"]
+        except KeyError:
+            if "unavailable_message" in user:
+                raise exception.NotFoundError("{} ({})".format(
+                    user["unavailable_message"].get("text"),
+                    user.get("reason")), False)
+            else:
+                raise exception.NotFoundError("user")
 
     @cache(maxage=3600)
     def _guest_token(self):
@@ -782,7 +929,7 @@
         endpoint = "/1.1/guest/activate.json"
         return str(self._call(endpoint, None, root, "POST")["guest_token"])
 
-    def _call(self, endpoint, params, root=None, method="GET"):
+    def _call(self, endpoint, params, root=None, method="GET", warning=True):
         if root is None:
             root = self.root
 
@@ -799,24 +946,16 @@
         data = response.json()
         if "errors" in data:
             try:
-                errors, warnings = [], []
-                for error in data["errors"]:
-                    if error.get("kind") == "NonFatal":
-                        warnings.append(error["message"])
-                    else:
-                        errors.append(error["message"])
-                errors = ", ".join(errors)
+                errors = ", ".join(e["message"] for e in data["errors"])
             except Exception:
                 errors = data["errors"]
-            if warnings:
-                self.extractor.log.warning(", ".join(warnings))
-            if errors and response.status_code < 400:
-                raise exception.StopExtraction(errors)
         else:
             errors = ""
 
         if response.status_code < 400:
             # success
+            if errors and warning:
+                self.extractor.log.warning(errors)
             return data
 
         if response.status_code == 429:
@@ -846,11 +985,8 @@
         raise exception.StopExtraction(
             "%s %s (%s)", response.status_code, response.reason, errors)
 
-    def _pagination(self, endpoint, params=None):
-        if params is None:
-            params = self.params.copy()
+    def _pagination_legacy(self, endpoint, params):
         original_retweets = (self.extractor.retweets == "original")
-        pinned_tweet = self.extractor.pinned
 
         while True:
             cursor = tweet = None
@@ -863,12 +999,6 @@
             tweets = data["globalObjects"]["tweets"]
             users = data["globalObjects"]["users"]
 
-            if pinned_tweet:
-                if "pinEntry" in instr[-1]:
-                    tweet_ids.append(instr[-1]["pinEntry"]["entry"]["content"]
-                                     ["item"]["content"]["tweet"]["id"])
-                pinned_tweet = False
-
             # collect tweet IDs and cursor value
             for entry in instr[0]["addEntries"]["entries"]:
                 entry_startswith = entry["entryId"].startswith
@@ -884,7 +1014,7 @@
                 elif entry_startswith(("cursor-bottom-", "sq-cursor-bottom")):
                     cursor = entry["content"]["operation"]["cursor"]
-                    if not cursor.get("stopOnEmptyResponse"):
+                    if not cursor.get("stopOnEmptyResponse", True):
                         # keep going even if there are no tweets
                         tweet = True
                     cursor = cursor["value"]
@@ -939,23 +1069,133 @@
                 return
             params["cursor"] = cursor
 
-    def _pagination_graphql(self, endpoint, variables, key, timeline):
+    def _pagination_tweets(self, endpoint, variables, path=None):
+        variables.update(self.variables)
+        original_retweets = (self.extractor.retweets == "original")
+        pinned_tweet = self.extractor.pinned
+
+        while True:
+            params = {"variables": self._json_dumps(variables)}
+            data = self._call(endpoint, params)["data"]
+
+            try:
+                if path is None:
+                    instructions = (data["user"]["result"]["timeline"]
+                                    ["timeline"]["instructions"])
+                else:
+                    for key in path:
+                        data = data[key]
+                    instructions = data["instructions"]
+
+                entries = instructions[0]["entries"]
+            except (KeyError, IndexError):
+                return
+
+            tweets = []
+            tweet = cursor = None
+
+            if pinned_tweet:
+                pinned_tweet = False
+                if instructions[-1]["type"] == "TimelinePinEntry":
+                    tweets.append(instructions[-1]["entry"])
+
+            for entry in entries:
+                esw = entry["entryId"].startswith
+
+                if esw("tweet-"):
+                    tweets.append(entry)
+                elif esw("homeConversation-"):
+                    tweets.extend(entry["content"]["items"])
+                elif esw("conversationthread-"):
+                    tweets.extend(entry["content"]["items"])
+                elif esw("cursor-bottom-"):
+                    cursor = entry["content"]
+                    if not cursor.get("stopOnEmptyResponse", True):
+                        # keep going even if there are no tweets
+                        tweet = True
+                    cursor = cursor.get("value")
+
+            for entry in tweets:
+                try:
+                    tweet = ((entry.get("content") or entry["item"])
+                             ["itemContent"]["tweet_results"]["result"])
+                    legacy = tweet["legacy"]
+                except KeyError:
+                    self.extractor.log.debug(
+                        "Skipping %s (deleted)",
+                        (entry.get("entryId") or "").rpartition("-")[2])
+                    continue
+
+                if "retweeted_status_result" in legacy:
+                    retweet = legacy["retweeted_status_result"]["result"]
+                    if original_retweets:
+                        try:
+                            retweet["legacy"]["retweeted_status_id_str"] = \
+                                retweet["rest_id"]
+                            retweet["_retweet_id_str"] = tweet["rest_id"]
+                            tweet = retweet
+                        except KeyError:
+                            continue
+                    else:
+                        try:
+                            legacy["retweeted_status_id_str"] = \
+                                retweet["rest_id"]
+                            legacy["author"] = \
+                                retweet["core"]["user_results"]["result"]
+                            if "extended_entities" in retweet["legacy"] and \
+                                    "extended_entities" not in legacy:
+                                legacy["extended_entities"] = \
+                                    retweet["legacy"]["extended_entities"]
+                        except KeyError:
+                            pass
+
+                yield tweet
+
+                if "quoted_status_result" in tweet:
+                    try:
+                        quoted = tweet["quoted_status_result"]["result"]
+                        quoted["legacy"]["author"] = \
+                            quoted["core"]["user_results"]["result"]
+                        quoted["core"] = tweet["core"]
+                        quoted["legacy"]["quoted_by_id_str"] = tweet["rest_id"]
+                        yield quoted
+                    except KeyError:
+                        self.extractor.log.debug(
+                            "Skipping quote of %s (deleted)",
+                            tweet.get("rest_id"))
+                        continue
+
+            if not tweet or not cursor:
+                return
+            variables["cursor"] = cursor
+
+    def _pagination_users(self, endpoint, variables, path=None):
+        variables.update(self.variables)
 
         while True:
             cursor = entry = stop = None
-            params = {"variables": json.dumps(variables)}
-            data = self._call(endpoint, params)
+            params = {"variables": self._json_dumps(variables)}
+            data = self._call(endpoint, params)["data"]
 
             try:
-                instructions = \
-                    data["data"][key][timeline]["timeline"]["instructions"]
+                if path is None:
+                    instructions = (data["user"]["result"]["timeline"]
+                                    ["timeline"]["instructions"])
+                else:
+                    for key in path:
+                        data = data[key]
+                    instructions = data["instructions"]
             except KeyError:
-                raise exception.AuthorizationError()
+                return
 
             for instr in instructions:
                 if instr["type"] == "TimelineAddEntries":
                     for entry in instr["entries"]:
                         if entry["entryId"].startswith("user-"):
-                            yield entry["content"]["itemContent"]["user"]
+                            user = (entry["content"]["itemContent"]
+                                    ["user_results"]["result"])
+                            if "rest_id" in user:
+                                yield user
                         elif entry["entryId"].startswith("cursor-bottom-"):
                             cursor = entry["content"]["value"]
                 elif instr["type"] == "TimelineTerminateTimeline":
diff --git a/gallery_dl/option.py b/gallery_dl/option.py
index cdfe9a1..e1ada09 100644
--- a/gallery_dl/option.py
+++ b/gallery_dl/option.py
@@ -124,6 +124,11 @@ def build_parser():
         help="Use the specified proxy",
     )
     general.add_argument(
+        "--source-address",
+        dest="source-address", metavar="IP", action=ConfigAction,
+        help="Client-side IP address to bind to",
+    )
+    general.add_argument(
         "--clear-cache",
         dest="clear_cache", metavar="MODULE",
         help="Delete cached login sessions, cookies, etc. for MODULE "
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 8fa7c22..1a399fa 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2016-2021 Mike Fährmann
+# Copyright 2016-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.20.1"
+__version__ = "1.20.3"
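Notes

The SourceAdapter added to common.py is what backs the new --source-address option in option.py: requests' HTTPAdapter forwards a "source_address" pool keyword to urllib3, which binds every new socket to that local address. A minimal self-contained sketch of the same technique follows; the session setup and the 192.0.2.1 address are illustrative assumptions, not part of the commit:

    import requests
    from requests.adapters import HTTPAdapter


    class SourceAdapter(HTTPAdapter):
        """Bind all connections of a session to a fixed local address."""

        def __init__(self, source_address):
            self.source_address = source_address
            HTTPAdapter.__init__(self)

        def init_poolmanager(self, *args, **kwargs):
            # urllib3 hands source_address down to socket.create_connection()
            kwargs["source_address"] = self.source_address
            return HTTPAdapter.init_poolmanager(self, *args, **kwargs)

        def proxy_manager_for(self, *args, **kwargs):
            kwargs["source_address"] = self.source_address
            return HTTPAdapter.proxy_manager_for(self, *args, **kwargs)


    session = requests.Session()
    adapter = SourceAdapter(("192.0.2.1", 0))  # port 0 lets the OS choose
    session.mount("http://", adapter)
    session.mount("https://", adapter)

As in the commit, a plain string from the configuration is normalized to an (address, 0) pair before the adapter is mounted.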
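The hitomi.la change replaces one full download of a .nozomi tag index with 100-byte HTTP Range requests: each index entry is a 4-byte gallery ID, so every chunk yields at most 25 IDs, and a response shorter than 100 bytes marks the end of the index. A sketch of that loop, assuming decode_nozomi() unpacks big-endian 32-bit integers (as the helper of the same name in gallery_dl/extractor/hitomi.py does) and that the server honors Range headers:

    import struct
    import requests


    def decode_nozomi(data):
        # assumption: equivalent to gallery_dl's decode_nozomi();
        # nozomi indexes are plain arrays of big-endian uint32 IDs
        return struct.unpack(">{}I".format(len(data) // 4), data)


    def gallery_ids(nozomi_url):
        offset = 0
        while True:
            headers = {"Range": "bytes={}-{}".format(offset, offset + 99)}
            chunk = requests.get(nozomi_url, headers=headers).content
            yield from decode_nozomi(chunk)
            if len(chunk) < 100:  # short read: no further pages
                return
            offset += 100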
