diff options
Diffstat (limited to 'gallery_dl/extractor/twitter.py')
| -rw-r--r-- | gallery_dl/extractor/twitter.py | 582 |
1 files changed, 303 insertions, 279 deletions
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 4c7b757..7cabb8c 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -10,9 +10,9 @@ from .common import Extractor, Message from .. import text, exception -from ..cache import cache, memcache -import json -import re +from ..cache import cache +import hashlib +import time class TwitterExtractor(Extractor): @@ -24,23 +24,15 @@ class TwitterExtractor(Extractor): cookiedomain = ".twitter.com" root = "https://twitter.com" sizes = (":orig", ":large", ":medium", ":small") - user_agent = ("Mozilla/5.0 (Windows NT 6.1; WOW64; " - "Trident/7.0; rv:11.0) like Gecko") def __init__(self, match): Extractor.__init__(self, match) self.user = match.group(1) - self._user_dict = None - self.logged_in = False self.retweets = self.config("retweets", True) self.replies = self.config("replies", True) self.twitpic = self.config("twitpic", False) - self.content = self.config("content", False) self.videos = self.config("videos", True) - - if self.content: - self._emoji_sub = re.compile( - r'<img class="Emoji [^>]+ alt="([^"]+)"[^>]*>').sub + self._user_cache = {} def items(self): self.login() @@ -48,235 +40,194 @@ class TwitterExtractor(Extractor): yield Message.Version, 1 for tweet in self.tweets(): - data = self._data_from_tweet(tweet) - if not data or \ - not self.retweets and data["retweet_id"] or \ - not self.replies and data["reply"]: + + if not self.retweets and "retweeted_status_id_str" in tweet or \ + not self.replies and "in_reply_to_user_id_str" in tweet: continue - data.update(metadata) - if self.videos and "-videoContainer" in tweet: - yield Message.Directory, data + if self.twitpic: + self._extract_twitpic(tweet) + if "extended_entities" not in tweet: + continue - if self.videos == "ytdl": - data["extension"] = None - url = "ytdl:{}/i/web/status/{}".format( - self.root, data["tweet_id"]) - else: - url = self._video_from_tweet(data["tweet_id"]) - if not url: - continue - text.nameext_from_url(url, data) - if data["extension"] == "m3u8": - url = "ytdl:" + url - data["extension"] = "mp4" - data["_ytdl_extra"] = {"protocol": "m3u8_native"} - data["num"] = 1 - yield Message.Url, url, data - - elif "data-image-url=" in tweet: - yield Message.Directory, data - - images = text.extract_iter( - tweet, 'data-image-url="', '"') - for data["num"], url in enumerate(images, 1): - text.nameext_from_url(url, data) + tdata = self._transform_tweet(tweet) + tdata.update(metadata) + + yield Message.Directory, tdata + for tdata["num"], media in enumerate( + tweet["extended_entities"]["media"], 1): + + tdata["width"] = media["original_info"].get("width", 0) + tdata["height"] = media["original_info"].get("height", 0) + + if "video_info" in media and self.videos: + + if self.videos == "ytdl": + url = "ytdl:{}/i/web/status/{}".format( + self.root, tweet["id_str"]) + tdata["extension"] = None + yield Message.Url, url, tdata + + else: + video_info = media["video_info"] + variant = max( + video_info["variants"], + key=lambda v: v.get("bitrate", 0), + ) + tdata["duration"] = video_info.get( + "duration_millis", 0) / 1000 + tdata["bitrate"] = variant.get("bitrate", 0) + + url = variant["url"] + text.nameext_from_url(url, tdata) + yield Message.Url, url, tdata + + elif "media_url_https" in media: + url = media["media_url_https"] urls = [url + size for size in self.sizes] - yield Message.Urllist, urls, data - - if self.twitpic and "//twitpic.com/" in tweet: - urls = [ - url for url in text.extract_iter( - tweet, 'data-expanded-url="', '"') - if "//twitpic.com/" in url - ] - - if "num" not in data: - if urls: - yield Message.Directory, data - data["num"] = 0 - - for data["num"], url in enumerate(urls, data["num"]+1): - response = self.request(url, fatal=False) - if response.status_code >= 400: - continue - url = text.extract( - response.text, 'name="twitter:image" value="', '"')[0] - yield Message.Url, url, text.nameext_from_url(url, data) + text.nameext_from_url(url, tdata) + yield Message.Urllist, urls, tdata + + else: + url = media["media_url"] + text.nameext_from_url(url, tdata) + yield Message.Url, url, tdata + + def _extract_twitpic(self, tweet): + twitpics = [] + for url in tweet["entities"].get("urls", ()): + url = url["expanded_url"] + if "//twitpic.com/" in url: + response = self.request(url, fatal=False) + if response.status_code >= 400: + continue + url = text.extract( + response.text, 'name="twitter:image" value="', '"')[0] + twitpics.append({ + "original_info": {}, + "media_url" : url, + }) + if twitpics: + if "extended_entities" in tweet: + tweet["extended_entities"]["media"].extend(twitpics) + else: + tweet["extended_entities"] = {"media": twitpics} + + def _transform_tweet(self, tweet): + entities = tweet["entities"] + tdata = { + "tweet_id" : text.parse_int(tweet["id_str"]), + "retweet_id" : text.parse_int( + tweet.get("retweeted_status_id_str")), + "quote_id" : text.parse_int( + tweet.get("quoted_status_id_str")), + "reply_id" : text.parse_int( + tweet.get("in_reply_to_status_id_str")), + "date" : text.parse_datetime( + tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"), + "user" : self._transform_user(tweet["user"]), + "lang" : tweet["lang"], + "content" : tweet["full_text"], + "favorite_count": tweet["favorite_count"], + "quote_count" : tweet["quote_count"], + "reply_count" : tweet["reply_count"], + "retweet_count" : tweet["retweet_count"], + } + + hashtags = entities.get("hashtags") + if hashtags: + tdata["hashtags"] = [t["text"] for t in hashtags] + + mentions = entities.get("user_mentions") + if mentions: + tdata["mentions"] = [{ + "id": text.parse_int(u["id_str"]), + "name": u["screen_name"], + "nick": u["name"], + } for u in mentions] + + if "in_reply_to_screen_name" in tweet: + tdata["reply_to"] = tweet["in_reply_to_screen_name"] + + if "full_text_quoted" in tweet: + tdata["content_quoted"] = tweet["full_text_quoted"] + + if "author" in tweet: + tdata["author"] = self._transform_user(tweet["author"]) + + return tdata + + def _transform_user(self, user): + uid = user["id_str"] + cache = self._user_cache + + if uid not in cache: + cache[uid] = { + "id" : text.parse_int(uid), + "name" : user["screen_name"], + "nick" : user["name"], + "description" : user["description"], + "location" : user["location"], + "date" : text.parse_datetime( + user["created_at"], "%a %b %d %H:%M:%S %z %Y"), + "verified" : user.get("verified", False), + "profile_banner" : user.get("profile_banner_url", ""), + "profile_image" : user.get( + "profile_image_url_https", "").replace("_normal.", "."), + "favourites_count": user["favourites_count"], + "followers_count" : user["followers_count"], + "friends_count" : user["friends_count"], + "listed_count" : user["listed_count"], + "media_count" : user["media_count"], + "statuses_count" : user["statuses_count"], + } + return cache[uid] def metadata(self): """Return general metadata""" return {} def tweets(self): - """Yield HTML content of all relevant tweets""" + """Yield all relevant tweet objects""" def login(self): username, password = self._get_auth_info() if username: self._update_cookies(self._login_impl(username, password)) - self.logged_in = True @cache(maxage=360*24*3600, keyarg=1) def _login_impl(self, username, password): self.log.info("Logging in as %s", username) - headers = {"User-Agent": self.user_agent} - page = self.request(self.root + "/login", headers=headers).text + url = "https://mobile.twitter.com/i/nojs_router" + params = {"path": "/login"} + headers = {"Referer": self.root + "/", "Origin": self.root} + page = self.request( + url, method="POST", params=params, headers=headers, data={}).text + pos = page.index('name="authenticity_token"') - token = text.extract(page, 'value="', '"', pos-80)[0] + token = text.extract(page, 'value="', '"', pos)[0] - url = self.root + "/sessions" + url = "https://mobile.twitter.com/sessions" data = { + "authenticity_token" : token, "session[username_or_email]": username, "session[password]" : password, - "authenticity_token" : token, - "ui_metrics" : '{"rf":{},"s":""}', - "scribe_log" : "", - "redirect_after_login" : "", "remember_me" : "1", + "wfa" : "1", + "commit" : "+Log+in+", + "ui_metrics" : "", } - response = self.request(url, method="POST", headers=headers, data=data) - if "/error" in response.url: - raise exception.AuthenticationError() - - return { + response = self.request(url, method="POST", data=data) + cookies = { cookie.name: cookie.value for cookie in self.session.cookies - if cookie.domain and "twitter.com" in cookie.domain - } - - def _data_from_tweet(self, tweet): - extr = text.extract_from(tweet) - data = { - "tweet_id" : text.parse_int(extr('data-tweet-id="' , '"')), - "reply" : bool(extr('data-is-reply-to="' , '"')), - "retweet_id": text.parse_int(extr('data-retweet-id="', '"')), - "retweeter" : extr('data-retweeter="' , '"'), - "author" : { - "name" : extr('data-screen-name="', '"'), - "nick" : text.unescape(extr('data-name="' , '"')), - "id" : text.parse_int(extr('data-user-id="' , '"')), - }, - } - - if not self._user_dict: - if data["retweet_id"]: - for user in json.loads(text.unescape(extr( - 'data-reply-to-users-json="', '"'))): - if user["screen_name"] == data["retweeter"]: - break - else: - self.log.warning("Unable to extract user info") - return None - self._user_dict = { - "name": user["screen_name"], - "nick": text.unescape(user["name"]), - "id" : text.parse_int(user["id_str"]), - } - else: - self._user_dict = data["author"] - - data["user"] = self._user_dict - data["date"] = text.parse_timestamp(extr('data-time="', '"')) - - if self.content: - content = extr('<div class="js-tweet-text-container">', '\n</div>') - if '<img class="Emoji ' in content: - content = self._emoji_sub(r"\1", content) - content = text.unescape(text.remove_html(content, "", "")) - cl, _, cr = content.rpartition("pic.twitter.com/") - data["content"] = cl if cl and len(cr) < 16 else content - - if extr('<div class="QuoteTweet', '>'): - data["retweet_id"] = text.parse_int(extr('data-item-id="', '"')) - data["retweeter"] = data["user"]["name"] - data["author"] = { - "name" : extr('data-screen-name="', '"'), - "id" : text.parse_int(extr('data-user-id="' , '"')), - "nick" : text.unescape(extr( - 'QuoteTweet-fullname', '<').partition('>')[2]), - } - - return data - - def _video_from_tweet(self, tweet_id): - url = "https://api.twitter.com/1.1/videos/tweet/config/{}.json".format( - tweet_id) - cookies = None - headers = { - "Origin" : self.root, - "Referer" : "{}/i/web/status/{}".format(self.root, tweet_id), - "x-csrf-token" : self.session.cookies.get("ct0"), - "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekM" - "xqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28N" - "HfOPqkca3qaAxGfsyKCs0wRbw", - } - - if self.logged_in: - headers["x-twitter-auth-type"] = "OAuth2Session" - else: - token = _guest_token(self, headers) - cookies = {"gt": token} - headers["x-guest-token"] = token - - response = self.request( - url, cookies=cookies, headers=headers, fatal=None) - - if response.status_code == 429 or \ - response.headers.get("x-rate-limit-remaining") == "0": - if self.logged_in: - self.wait(until=response.headers.get("x-rate-limit-reset")) - else: - _guest_token.invalidate() - return self._video_from_tweet(tweet_id) - - elif response.status_code >= 400: - self.log.warning("Unable to fetch video data for %s ('%s %s')", - tweet_id, response.status_code, response.reason) - return None - - return response.json()["track"]["playbackUrl"] - - def _tweets_from_api(self, url, max_position=None): - params = { - "include_available_features": "1", - "include_entities": "1", - "max_position": max_position, - "reset_error_state": "false", - "lang": "en", + if cookie.domain == self.cookiedomain } - headers = { - "X-Requested-With": "XMLHttpRequest", - "X-Twitter-Active-User": "yes", - "Referer": self.root + "/", - } - - while True: - data = self.request(url, params=params, headers=headers).json() - if "inner" in data: - data = data["inner"] - - for tweet in text.extract_iter( - data["items_html"], '<div class="tweet ', '\n</li>'): - yield tweet - if data.get("min_position") is None: - if data["has_more_items"] and "min_position" not in data: - pass - else: - return - - if "min_position" in data: - position = data["min_position"] - if position == max_position or position is None: - return - else: - position = text.parse_int(text.extract( - tweet, 'data-tweet-id="', '"')[0]) - if max_position and position >= max_position: - return - params["max_position"] = max_position = position + if "/error" in response.url or "auth_token" not in cookies: + raise exception.AuthenticationError() + return cookies class TwitterTimelineExtractor(TwitterExtractor): @@ -288,15 +239,12 @@ class TwitterTimelineExtractor(TwitterExtractor): ("https://twitter.com/supernaturepics", { "range": "1-40", "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4", - "keyword": "4a3d28cc9f7a39e27333d56f3fe19e6e07ee979e", }), ("https://mobile.twitter.com/supernaturepics?p=i"), ) def tweets(self): - url = "{}/i/profiles/show/{}/timeline/tweets".format( - self.root, self.user) - return self._tweets_from_api(url) + return TwitterAPI(self).timeline_profile(self.user) class TwitterMediaExtractor(TwitterExtractor): @@ -313,9 +261,7 @@ class TwitterMediaExtractor(TwitterExtractor): ) def tweets(self): - url = "{}/i/profiles/show/{}/media_timeline".format( - self.root, self.user) - return self._tweets_from_api(url) + return TwitterAPI(self).timeline_media(self.user) class TwitterSearchExtractor(TwitterExtractor): @@ -330,12 +276,10 @@ class TwitterSearchExtractor(TwitterExtractor): }) def metadata(self): - return {"search": self.user} + return {"search": text.unquote(self.user)} def tweets(self): - url = "{}/i/search/timeline?f=tweets&q={}".format( - self.root, self.user) - return self._tweets_from_api(url, "-1") + return TwitterAPI(self).search(self.user) class TwitterTweetExtractor(TwitterExtractor): @@ -346,22 +290,19 @@ class TwitterTweetExtractor(TwitterExtractor): test = ( ("https://twitter.com/supernaturepics/status/604341487988576256", { "url": "0e801d2f98142dd87c3630ded9e4be4a4d63b580", - "keyword": "76e018cf3f4c8b82d3bdd425e01e28078c98373b", "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab", }), # 4 images ("https://twitter.com/perrypumas/status/894001459754180609", { "url": "c8a262a9698cb733fb27870f5a8f75faf77d79f6", - "keyword": "c9251b1fd79d547b0c6b4577f06c937d0e9b63d2", }), # video ("https://twitter.com/perrypumas/status/1065692031626829824", { "options": (("videos", True),), - "pattern": r"ytdl:https://video.twimg.com/ext_tw_video/.*.m3u8", + "pattern": r"https://video.twimg.com/ext_tw_video/.+\.mp4\?tag=5", }), # content with emoji, newlines, hashtags (#338) ("https://twitter.com/playpokemon/status/1263832915173048321", { - "options": (("content", True),), "keyword": {"content": ( r"re:Gear up for #PokemonSwordShieldEX with special Mystery " "Gifts! \n\nYou’ll be able to receive four Galarian form " @@ -386,10 +327,6 @@ class TwitterTweetExtractor(TwitterExtractor): # quoted tweet (#526) ("https://twitter.com/Pistachio/status/1222690391817932803", { "pattern": r"https://pbs\.twimg\.com/media/EPfMfDUU8AAnByO\.jpg", - "keyword": { - "author": {"name": "Afro_Herper", "id": 786047748508221440}, - "user" : {"name": "Pistachio" , "id": 3533231}, - }, }), # TwitPic embeds (#579) ("https://twitter.com/i/web/status/112900228289540096", { @@ -404,18 +341,7 @@ class TwitterTweetExtractor(TwitterExtractor): self.tweet_id = match.group(2) def tweets(self): - url = "{}/i/web/status/{}".format(self.root, self.tweet_id) - cookies = {"app_shell_visited": "1"} - headers = {"User-Agent": self.user_agent, "Referer": url} - - response = self.request(url, cookies=cookies, headers=headers) - if response.history and response.url == self.root + "/": - raise exception.AuthorizationError() - page = response.text - - end = page.index('class="js-tweet-stats-container') - beg = page.rindex('<div class="tweet ', 0, end) - return (page[beg:end],) + return TwitterAPI(self).tweet(self.tweet_id) class TwitterBookmarkExtractor(TwitterExtractor): @@ -424,15 +350,26 @@ class TwitterBookmarkExtractor(TwitterExtractor): pattern = r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com/i/bookmarks()" test = ("https://twitter.com/i/bookmarks",) - def items(self): - self.login() - if not self.logged_in: - raise exception.AuthorizationError("Login required") - for cookie in self.session.cookies: - cookie.expires = None + def tweets(self): + return TwitterAPI(self).bookmarks() - url = "https://api.twitter.com/2/timeline/bookmark.json" - params = { + +class TwitterAPI(): + + def __init__(self, extractor): + self.extractor = extractor + self.headers = { + "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR" + "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu" + "4FA33AGWWjCpTnA", + "x-guest-token": None, + "x-twitter-client-language": "en", + "x-twitter-active-user": "yes", + "x-csrf-token": None, + "Origin": "https://twitter.com", + "Referer": "https://twitter.com/", + } + self.params = { "include_profile_interstitial_type": "1", "include_blocking": "1", "include_blocked_by": "1", @@ -453,47 +390,134 @@ class TwitterBookmarkExtractor(TwitterExtractor): "include_ext_media_color": "true", "include_ext_media_availability": "true", "send_error_codes": "true", - "simple_quoted_tweets": "true", + "simple_quoted_tweet": "true", + # "count": "20", "count": "100", "cursor": None, - "ext": "mediaStats%2CcameraMoment", + "ext": "mediaStats,highlightedLabel,cameraMoment", + "include_quote_count": "true", } - headers = { - "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR" - "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu" - "4FA33AGWWjCpTnA", - "Origin": self.root, - "Referer": self.root + "/i/bookmarks", - "x-csrf-token": self.session.cookies.get("ct0"), - "x-twitter-active-user": "yes", - "x-twitter-auth-type": "OAuth2Session", - "x-twitter-client-language": "en", + + cookies = self.extractor.session.cookies + + # CSRF + csrf = hashlib.md5(str(time.time()).encode()).hexdigest() + self.headers["x-csrf-token"] = csrf + cookies.set("ct0", csrf, domain=".twitter.com") + + if cookies.get("auth_token", domain=".twitter.com"): + self.headers["x-twitter-auth-type"] = "OAuth2Session" + else: + # guest token + guest_token = _guest_token(self.extractor, self.headers) + self.headers["x-guest-token"] = guest_token + cookies.set("gt", guest_token, domain=".twitter.com") + + def tweet(self, tweet_id): + endpoint = "2/timeline/conversation/{}.json".format(tweet_id) + for tweet in self._pagination(endpoint): + if tweet["id_str"] == tweet_id: + return (tweet,) + return () + + def timeline_profile(self, screen_name): + user = self.user_by_screen_name(screen_name) + endpoint = "2/timeline/profile/{}.json".format(user["rest_id"]) + return self._pagination(endpoint) + + def timeline_media(self, screen_name): + user = self.user_by_screen_name(screen_name) + endpoint = "2/timeline/media/{}.json".format(user["rest_id"]) + return self._pagination(endpoint) + + def search(self, query): + endpoint = "2/search/adaptive.json" + params = self.params.copy() + params["q"] = text.unquote(query) + return self._pagination( + endpoint, params, "sq-I-t-", "sq-cursor-bottom") + + def bookmarks(self): + endpoint = "2/timeline/bookmark.json" + return self._pagination(endpoint) + + def user_by_screen_name(self, screen_name): + endpoint = "graphql/-xfUfZsnR_zqjFd-IfrN5A/UserByScreenName" + params = { + "variables": '{"screen_name":"' + screen_name + '"' + ',"withHighlightedLabel":true}' } + return self._call(endpoint, params)["data"]["user"] + + def _call(self, endpoint, params): + url = "https://api.twitter.com/" + endpoint + response = self.extractor.request( + url, params=params, headers=self.headers, fatal=None) + if response.status_code < 400: + return response.json() + if response.status_code == 429: + self.extractor.wait(until=response.headers["x-rate-limit-reset"]) + return self._call(endpoint, params) + raise exception.StopExtraction( + "%s %s (%s)", response.status_code, response.reason, response.text) + + def _pagination(self, endpoint, params=None, + entry_tweet="tweet-", entry_cursor="cursor-bottom-"): + if params is None: + params = self.params.copy() while True: - response = self.request( - url, params=params, headers=headers, fatal=False) - if response.status_code >= 400: - raise exception.StopExtraction(response.text) - data = response.json() - tweets = data["globalObjects"]["tweets"] + cursor = tweet = None + data = self._call(endpoint, params) - if not tweets: + instr = data["timeline"]["instructions"] + if not instr: return - for tweet_id, tweet_data in tweets.items(): - tweet_url = "{}/i/web/status/{}".format(self.root, tweet_id) - tweet_data["_extractor"] = TwitterTweetExtractor - yield Message.Queue, tweet_url, tweet_data + tweets = data["globalObjects"]["tweets"] + users = data["globalObjects"]["users"] + + for entry in instr[0]["addEntries"]["entries"]: - inst = data["timeline"]["instructions"][0] - for entry in inst["addEntries"]["entries"]: - if entry["entryId"].startswith("cursor-bottom-"): - params["cursor"] = \ - entry["content"]["operation"]["cursor"]["value"] - break + if entry["entryId"].startswith(entry_tweet): + tid = entry["content"]["item"]["content"]["tweet"]["id"] + if tid not in tweets: + self.extractor.log.debug( + "Skipping unavailable Tweet %s", tid) + continue + tweet = tweets[tid] + tweet["user"] = users[tweet["user_id_str"]] + + if "quoted_status_id_str" in tweet: + quoted = tweets.get(tweet["quoted_status_id_str"]) + if quoted: + tweet["full_text_quoted"] = quoted["full_text"] + if "extended_entities" in quoted: + tweet["extended_entities"] = \ + quoted["extended_entities"] + elif "retweeted_status_id_str" in tweet: + retweet = tweets.get(tweet["retweeted_status_id_str"]) + if retweet: + tweet["author"] = users[retweet["user_id_str"]] + + yield tweet + + elif entry["entryId"].startswith(entry_cursor): + cursor = entry["content"]["operation"]["cursor"] + if not cursor.get("stopOnEmptyResponse"): + # keep going even if there are no tweets + tweet = True + cursor = cursor["value"] + + if "replaceEntry" in instr[-1] : + cursor = (instr[-1]["replaceEntry"]["entry"] + ["content"]["operation"]["cursor"]["value"]) + + if not cursor or not tweet: + return + params["cursor"] = cursor -@memcache() +@cache(maxage=3600) def _guest_token(extr, headers): return extr.request( "https://api.twitter.com/1.1/guest/activate.json", |
