diff options
Diffstat (limited to 'gallery_dl/extractor/twitter.py')
| -rw-r--r-- | gallery_dl/extractor/twitter.py | 116 |
1 files changed, 97 insertions, 19 deletions
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 8105ede..dfafc1f 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -10,7 +10,7 @@ from .common import Extractor, Message from .. import text, exception -from ..cache import cache +from ..cache import cache, memcache import re @@ -26,6 +26,7 @@ class TwitterExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.user = match.group(1) + self.logged_in = False self.retweets = self.config("retweets", True) self.content = self.config("content", False) self.videos = self.config("videos", False) @@ -53,10 +54,20 @@ class TwitterExtractor(Extractor): yield Message.Urllist, urls, data if self.videos and "-videoContainer" in tweet: + if self.videos == "ytdl": + data["extension"] = None + url = "ytdl:{}/{}/status/{}".format( + self.root, data["user"], data["tweet_id"]) + else: + url = self._video_from_tweet(data["tweet_id"]) + ext = text.ext_from_url(url) + if ext == "m3u8": + url = "ytdl:" + url + data["extension"] = "mp4" + data["_ytdl_extra"] = {"protocol": "m3u8_native"} + else: + data["extension"] = ext data["num"] = 1 - data["extension"] = None - url = "ytdl:{}/{}/status/{}".format( - self.root, data["user"], data["tweet_id"]) yield Message.Url, url, data def metadata(self): @@ -70,6 +81,7 @@ class TwitterExtractor(Extractor): username, password = self._get_auth_info() if username: self._update_cookies(self._login_impl(username, password)) + self.logged_in = True @cache(maxage=360*24*3600, keyarg=1) def _login_impl(self, username, password): @@ -115,17 +127,48 @@ class TwitterExtractor(Extractor): data["content"] = cl if cl and len(cr) < 16 else content return data - def _tweets_from_api(self, url): + def _video_from_tweet(self, tweet_id): + url = "https://api.twitter.com/1.1/videos/tweet/config/{}.json".format( + tweet_id) + cookies = None + headers = { + "Origin" : self.root, + "Referer" : "{}/i/web/status/{}".format(self.root, tweet_id), + "x-csrf-token" : self.session.cookies.get("ct0"), + "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekM" + "xqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28N" + "HfOPqkca3qaAxGfsyKCs0wRbw", + } + + if self.logged_in: + headers["x-twitter-auth-type"] = "OAuth2Session" + else: + token = self._guest_token(headers) + cookies = {"gt": token} + headers["x-guest-token"] = token + + data = self.request(url, cookies=cookies, headers=headers).json() + return data["track"]["playbackUrl"] + + @memcache() + def _guest_token(self, headers): + return self.request( + "https://api.twitter.com/1.1/guest/activate.json", + method="POST", headers=headers, + ).json().get("guest_token") + + def _tweets_from_api(self, url, max_position=None): params = { "include_available_features": "1", "include_entities": "1", + "max_position": max_position, "reset_error_state": "false", "lang": "en", } headers = { "X-Requested-With": "XMLHttpRequest", "X-Twitter-Active-User": "yes", - "Referer": "{}/{}".format(self.root, self.user) + "Referer": self.root + "/", } while True: @@ -140,18 +183,23 @@ class TwitterExtractor(Extractor): if not data["has_more_items"]: return - position = text.parse_int(text.extract( - tweet, 'data-tweet-id="', '"')[0]) - if "max_position" in params and position >= params["max_position"]: - return - params["max_position"] = position + if "min_position" in data: + position = data["min_position"] + if position == max_position: + return + else: + position = text.parse_int(text.extract( + tweet, 'data-tweet-id="', '"')[0]) + if max_position and position >= max_position: + return + params["max_position"] = max_position = position class TwitterTimelineExtractor(TwitterExtractor): """Extractor for all images from a user's timeline""" subcategory = "timeline" pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com" - r"/([^/?&#]+)/?(?:$|[?#])") + r"/(?!search)([^/?&#]+)/?(?:$|[?#])") test = ( ("https://twitter.com/supernaturepics", { "range": "1-40", @@ -171,7 +219,7 @@ class TwitterMediaExtractor(TwitterExtractor): """Extractor for all images from a user's Media Tweets""" subcategory = "media" pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com" - r"/([^/?&#]+)/media(?!\w)") + r"/(?!search)([^/?&#]+)/media(?!\w)") test = ( ("https://twitter.com/supernaturepics/media", { "range": "1-40", @@ -186,6 +234,26 @@ class TwitterMediaExtractor(TwitterExtractor): return self._tweets_from_api(url) +class TwitterSearchExtractor(TwitterExtractor): + """Extractor for all images from a search timeline""" + subcategory = "search" + directory_fmt = ("{category}", "Search", "{search}") + pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com" + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)") + test = ("https://twitter.com/search?q=nature", { + "range": "1-40", + "count": 40, + }) + + def metadata(self): + return {"search": self.user} + + def tweets(self): + url = "{}/i/search/timeline?f=tweets&q={}".format( + self.root, self.user) + return self._tweets_from_api(url, "-1") + + class TwitterTweetExtractor(TwitterExtractor): """Extractor for images from individual tweets""" subcategory = "tweet" @@ -205,17 +273,17 @@ class TwitterTweetExtractor(TwitterExtractor): # video ("https://twitter.com/perrypumas/status/1065692031626829824", { "options": (("videos", True),), - "pattern": r"ytdl:https://twitter.com/perrypumas/status/\d+", + "pattern": r"ytdl:https://video.twimg.com/ext_tw_video/.*.m3u8", }), # content with emoji, newlines, hashtags (#338) ("https://twitter.com/yumi_san0112/status/1151144618936823808", { "options": (("content", True),), - "keyword": "b13b6c4cd0b0c15b2ea7685479e7fedde3c47b9e", + "keyword": "b133464b73aec33871521ab021a3166204194285", }), # Reply to another tweet (#403) ("https://twitter.com/tyson_hesse/status/1103767554424598528", { - "options": (("videos", True),), - "pattern": r"ytdl:https://twitter.com/.*/1103767554424598528$", + "options": (("videos", "ytdl"),), + "pattern": r"ytdl:https://twitter.com/.+/1103767554424598528", }), # /i/web/ URL ("https://twitter.com/i/web/status/1155074198240292865", { @@ -231,9 +299,19 @@ class TwitterTweetExtractor(TwitterExtractor): return {"user": self.user, "tweet_id": self.tweet_id} def tweets(self): - self.session.cookies.clear() url = "{}/i/web/status/{}".format(self.root, self.tweet_id) - page = self.request(url).text + cookies = {"app_shell_visited": "1"} + headers = { + "Referer" : url, + "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; " + "Trident/7.0; rv:11.0) like Gecko", + } + + response = self.request(url, cookies=cookies, headers=headers) + if response.history and response.url == self.root + "/": + raise exception.AuthorizationError() + page = response.text + end = page.index('class="js-tweet-stats-container') beg = page.rindex('<div class="tweet ', 0, end) return (page[beg:end],) |
