diff options
Diffstat (limited to 'gallery_dl/extractor/twitter.py')
| -rw-r--r-- | gallery_dl/extractor/twitter.py | 118 |
1 files changed, 102 insertions, 16 deletions
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index dc558c0..2a04463 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2019 Mike Fährmann +# Copyright 2016-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://twitter.com/""" +"""Extractors for https://twitter.com/""" from .common import Extractor, Message from .. import text, exception @@ -21,8 +21,11 @@ class TwitterExtractor(Extractor): directory_fmt = ("{category}", "{user[name]}") filename_fmt = "{tweet_id}_{num}.{extension}" archive_fmt = "{tweet_id}_{retweet_id}_{num}" + cookiedomain = ".twitter.com" root = "https://twitter.com" sizes = (":orig", ":large", ":medium", ":small") + user_agent = ("Mozilla/5.0 (Windows NT 6.1; WOW64; " + "Trident/7.0; rv:11.0) like Gecko") def __init__(self, match): Extractor.__init__(self, match) @@ -32,7 +35,7 @@ class TwitterExtractor(Extractor): self.retweets = self.config("retweets", True) self.twitpic = self.config("twitpic", False) self.content = self.config("content", False) - self.videos = self.config("videos", False) + self.videos = self.config("videos", True) if self.content: self._emoji_sub = re.compile( @@ -117,7 +120,8 @@ class TwitterExtractor(Extractor): def _login_impl(self, username, password): self.log.info("Logging in as %s", username) - page = self.request(self.root + "/login").text + headers = {"User-Agent": self.user_agent} + page = self.request(self.root + "/login", headers=headers).text pos = page.index('name="authenticity_token"') token = text.extract(page, 'value="', '"', pos-80)[0] @@ -131,11 +135,15 @@ class TwitterExtractor(Extractor): "redirect_after_login" : "", "remember_me" : "1", } - response = self.request(url, method="POST", data=data) - + response = self.request(url, method="POST", headers=headers, data=data) if "/error" in response.url: raise exception.AuthenticationError() - return self.session.cookies + + return { + cookie.name: cookie.value + for cookie in self.session.cookies + if cookie.domain and "twitter.com" in cookie.domain + } def _data_from_tweet(self, tweet): extr = text.extract_from(tweet) @@ -353,7 +361,11 @@ class TwitterTweetExtractor(TwitterExtractor): # content with emoji, newlines, hashtags (#338) ("https://twitter.com/yumi_san0112/status/1151144618936823808", { "options": (("content", True),), - "keyword": "0b7a3d05607b480c1412dfd85f8606478313e7bf", + "keyword": {"content": ( + "re:晴、お誕生日おめでとう🎉!\n実は下の名前が同じなので結構親近感ある" + "アイドルです✨\n今年の晴ちゃんめちゃくちゃ可愛い路線攻めてるから、そろ" + "そろまたかっこいい晴が見たいですねw\n#結城晴生誕祭2019\n#結城晴生誕祭" + )}, }), # Reply to another tweet (#403) ("https://twitter.com/tyson_hesse/status/1103767554424598528", { @@ -365,9 +377,12 @@ class TwitterTweetExtractor(TwitterExtractor): "pattern": r"https://pbs.twimg.com/media/EAel0vUUYAAZ4Bq.jpg:orig", }), # quoted tweet (#526) - ("https://twitter.com/Meiyu_miu/status/1070693241413021696", { - "count": 4, - "keyword": "0c627af2b8cdccc7e0da8fd221155c4a4a3141a8", + ("https://twitter.com/Pistachio/status/1222690391817932803", { + "pattern": r"https://pbs\.twimg\.com/media/EPfMfDUU8AAnByO\.jpg", + "keyword": { + "author": {"name": "Afro_Herper", "id": 786047748508221440}, + "user" : {"name": "Pistachio" , "id": 3533231}, + }, }), # TwitPic embeds (#579) ("https://twitter.com/i/web/status/112900228289540096", { @@ -384,11 +399,7 @@ class TwitterTweetExtractor(TwitterExtractor): def tweets(self): url = "{}/i/web/status/{}".format(self.root, self.tweet_id) cookies = {"app_shell_visited": "1"} - headers = { - "Referer" : url, - "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; " - "Trident/7.0; rv:11.0) like Gecko", - } + headers = {"User-Agent": self.user_agent, "Referer": url} response = self.request(url, cookies=cookies, headers=headers) if response.history and response.url == self.root + "/": @@ -400,6 +411,81 @@ class TwitterTweetExtractor(TwitterExtractor): return (page[beg:end],) +class TwitterBookmarkExtractor(TwitterExtractor): + """Extractor for bookmarked tweets""" + subcategory = "bookmark" + pattern = r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com/i/bookmarks()" + test = ("https://twitter.com/i/bookmarks",) + + def items(self): + self.login() + if not self.logged_in: + raise exception.AuthorizationError("Login required") + for cookie in self.session.cookies: + cookie.expires = None + + url = "https://api.twitter.com/2/timeline/bookmark.json" + params = { + "include_profile_interstitial_type": "1", + "include_blocking": "1", + "include_blocked_by": "1", + "include_followed_by": "1", + "include_want_retweets": "1", + "include_mute_edge": "1", + "include_can_dm": "1", + "include_can_media_tag": "1", + "skip_status": "1", + "cards_platform": "Web-12", + "include_cards": "1", + "include_composer_source": "true", + "include_ext_alt_text": "true", + "include_reply_count": "1", + "tweet_mode": "extended", + "include_entities": "true", + "include_user_entities": "true", + "include_ext_media_color": "true", + "include_ext_media_availability": "true", + "send_error_codes": "true", + "simple_quoted_tweets": "true", + "count": "100", + "cursor": None, + "ext": "mediaStats%2CcameraMoment", + } + headers = { + "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR" + "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu" + "4FA33AGWWjCpTnA", + "Origin": self.root, + "Referer": self.root + "/i/bookmarks", + "x-csrf-token": self.session.cookies.get("ct0"), + "x-twitter-active-user": "yes", + "x-twitter-auth-type": "Auth2Session", + "x-twitter-client-language": "en", + } + + while True: + response = self.request( + url, params=params, headers=headers, fatal=False) + if response.status_code >= 400: + raise exception.StopExtraction(response.text) + data = response.json() + tweets = data["globalObjects"]["tweets"] + + if not tweets: + return + for tweet_id, tweet_data in tweets.items(): + tweet_url = "{}/i/web/status/{}".format(self.root, tweet_id) + tweet_data["_extractor"] = TwitterTweetExtractor + yield Message.Queue, tweet_url, tweet_data + + inst = data["timeline"]["instructions"][0] + for entry in inst["addEntries"]["entries"]: + if entry["entryId"].startswith("cursor-bottom-"): + params["cursor"] = \ + entry["content"]["operation"]["cursor"]["value"] + break + + @memcache() def _guest_token(extr, headers): return extr.request( |
