| author | 2020-10-25 17:59:22 -0400 |
|---|---|
| committer | 2020-10-25 17:59:22 -0400 |
| commit | 5dc7d6f5902ddaee5223d041d5c10060f0c72430 (patch) |
| tree | 6ddd103a86ea7bbb0d695f5fdfa55e43f04756ca /gallery_dl/extractor/twitter.py |
| parent | e0c914765184ebbf99cffdecfe8cdbe10f42486e (diff) |
New upstream version 1.15.2 (upstream/1.15.2)
Diffstat (limited to 'gallery_dl/extractor/twitter.py')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | gallery_dl/extractor/twitter.py | 145 |

1 file changed, 83 insertions(+), 62 deletions(-)
```diff
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index c98a300..06973b2 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -9,10 +9,8 @@
 """Extractors for https://twitter.com/"""
 
 from .common import Extractor, Message
-from .. import text, exception
+from .. import text, util, exception
 from ..cache import cache
-import hashlib
-import time
 
 
 BASE_PATTERN = (
@@ -29,7 +27,6 @@ class TwitterExtractor(Extractor):
     archive_fmt = "{tweet_id}_{retweet_id}_{num}"
     cookiedomain = ".twitter.com"
     root = "https://twitter.com"
-    sizes = (":orig", ":large", ":medium", ":small")
 
     def __init__(self, match):
         Extractor.__init__(self, match)
@@ -39,6 +36,7 @@ class TwitterExtractor(Extractor):
         self.twitpic = self.config("twitpic", False)
         self.quoted = self.config("quoted", True)
         self.videos = self.config("videos", True)
+        self.cards = self.config("cards", False)
         self._user_cache = {}
 
     def items(self):
@@ -58,56 +56,82 @@ class TwitterExtractor(Extractor):
                 self.log.debug("Skipping %s (quoted tweet)", tweet["id_str"])
                 continue
 
+            files = []
+            if "extended_entities" in tweet:
+                self._extract_media(tweet, files)
+            if "card" in tweet and self.cards:
+                self._extract_card(tweet, files)
             if self.twitpic:
-                self._extract_twitpic(tweet)
-            if "extended_entities" not in tweet:
+                self._extract_twitpic(tweet, files)
+            if not files:
                 continue
 
             tdata = self._transform_tweet(tweet)
             tdata.update(metadata)
-
             yield Message.Directory, tdata
-            for tdata["num"], media in enumerate(
-                    tweet["extended_entities"]["media"], 1):
-
-                tdata["width"] = media["original_info"].get("width", 0)
-                tdata["height"] = media["original_info"].get("height", 0)
-
-                if "video_info" in media:
-
-                    if self.videos == "ytdl":
-                        url = "ytdl:{}/i/web/status/{}".format(
-                            self.root, tweet["id_str"])
-                        tdata["extension"] = None
-                        yield Message.Url, url, tdata
-
-                    elif self.videos:
-                        video_info = media["video_info"]
-                        variant = max(
-                            video_info["variants"],
-                            key=lambda v: v.get("bitrate", 0),
-                        )
-                        tdata["duration"] = video_info.get(
-                            "duration_millis", 0) / 1000
-                        tdata["bitrate"] = variant.get("bitrate", 0)
-
-                        url = variant["url"]
-                        text.nameext_from_url(url, tdata)
-                        yield Message.Url, url, tdata
-
-                elif "media_url_https" in media:
-                    url = media["media_url_https"]
-                    urls = [url + size for size in self.sizes]
-                    text.nameext_from_url(url, tdata)
-                    yield Message.Urllist, urls, tdata
-
-                else:
-                    url = media["media_url"]
-                    text.nameext_from_url(url, tdata)
-                    yield Message.Url, url, tdata
+            for tdata["num"], file in enumerate(files, 1):
+                file.update(tdata)
+                url = file.pop("url")
+                if "extension" not in file:
+                    text.nameext_from_url(url, file)
+                yield Message.Url, url, file
+
+    def _extract_media(self, tweet, files):
+        for media in tweet["extended_entities"]["media"]:
+            width = media["original_info"].get("width", 0),
+            height = media["original_info"].get("height", 0),
+
+            if "video_info" in media:
+                if self.videos == "ytdl":
+                    files.append({
+                        "url": "ytdl:{}/i/web/status/{}".format(
+                            self.root, tweet["id_str"]),
+                        "width"    : width,
+                        "height"   : height,
+                        "extension": None,
+                    })
+                elif self.videos:
+                    video_info = media["video_info"]
+                    variant = max(
+                        video_info["variants"],
+                        key=lambda v: v.get("bitrate", 0),
+                    )
+                    files.append({
+                        "url"     : variant["url"],
+                        "width"   : width,
+                        "height"  : height,
+                        "bitrate" : variant.get("bitrate", 0),
+                        "duration": video_info.get(
+                            "duration_millis", 0) / 1000,
+                    })
+            elif "media_url_https" in media:
+                url = media["media_url_https"]
+                files.append(text.nameext_from_url(url, {
+                    "url"      : url + ":orig",
+                    "_fallback": [url+":large", url+":medium", url+":small"],
+                    "width"    : width,
+                    "height"   : height,
+                }))
+            else:
+                files.append({"url": media["media_url"]})
+
+    def _extract_card(self, tweet, files):
+        card = tweet["card"]
+        if card["name"] in ("summary", "summary_large_image"):
+            bvals = card["binding_values"]
+            for prefix in ("photo_image_full_size_",
+                           "summary_photo_image_",
+                           "thumbnail_image_"):
+                for size in ("original", "x_large", "large", "small"):
+                    key = prefix + size
+                    if key in bvals:
+                        files.append(bvals[key]["image_value"])
+                        return
+        else:
+            url = "ytdl:{}/i/web/status/{}".format(self.root, tweet["id_str"])
+            files.append({"url": url})
 
-    def _extract_twitpic(self, tweet):
-        twitpics = []
+    def _extract_twitpic(self, tweet, files):
         for url in tweet["entities"].get("urls", ()):
             url = url["expanded_url"]
             if "//twitpic.com/" in url and "/photos/" not in url:
@@ -117,15 +141,7 @@ class TwitterExtractor(Extractor):
                 url = text.extract(
                     response.text, 'name="twitter:image" value="', '"')[0]
                 if url:
-                    twitpics.append({
-                        "original_info": {},
-                        "media_url"    : url,
-                    })
-        if twitpics:
-            if "extended_entities" in tweet:
-                tweet["extended_entities"]["media"].extend(twitpics)
-            else:
-                tweet["extended_entities"] = {"media": twitpics}
+                    files.append({"url": url})
 
     def _transform_tweet(self, tweet):
         entities = tweet["entities"]
@@ -247,7 +263,7 @@ class TwitterTimelineExtractor(TwitterExtractor):
     """Extractor for all images from a user's timeline"""
     subcategory = "timeline"
     pattern = BASE_PATTERN + \
-        r"/(?!search)(?:([^/?&#]+)/?(?:$|[?#])|intent/user\?user_id=(\d+))"
+        r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])|intent/user\?user_id=(\d+))"
     test = (
         ("https://twitter.com/supernaturepics", {
             "range": "1-40",
@@ -271,7 +287,7 @@
 class TwitterMediaExtractor(TwitterExtractor):
     """Extractor for all images from a user's Media Tweets"""
     subcategory = "media"
-    pattern = BASE_PATTERN + r"/(?!search)([^/?&#]+)/media(?!\w)"
+    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/media(?!\w)"
     test = (
         ("https://twitter.com/supernaturepics/media", {
             "range": "1-40",
@@ -288,7 +304,7 @@
 class TwitterLikesExtractor(TwitterExtractor):
     """Extractor for liked tweets"""
     subcategory = "likes"
-    pattern = BASE_PATTERN + r"/(?!search)([^/?&#]+)/likes(?!\w)"
+    pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/likes(?!\w)"
     test = ("https://twitter.com/supernaturepics/likes",)
 
     def tweets(self):
@@ -326,7 +342,7 @@
 class TwitterTweetExtractor(TwitterExtractor):
     """Extractor for images from individual tweets"""
     subcategory = "tweet"
-    pattern = BASE_PATTERN + r"/([^/?&#]+|i/web)/status/(\d+)"
+    pattern = BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)"
     test = (
         ("https://twitter.com/supernaturepics/status/604341487988576256", {
            "url": "0e801d2f98142dd87c3630ded9e4be4a4d63b580",
@@ -375,11 +391,16 @@ class TwitterTweetExtractor(TwitterExtractor):
             "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg",
             "count": 3,
         }),
-        # Nitter tweet
+        # Nitter tweet (#890)
         ("https://nitter.net/ed1conf/status/1163841619336007680", {
             "url": "0f6a841e23948e4320af7ae41125e0c5b3cadc98",
            "content": "f29501e44d88437fe460f5c927b7543fda0f6e34",
         }),
+        # Twitter card (#1005)
+        ("https://twitter.com/billboard/status/1306599586602135555", {
+            "options": (("cards", True),),
+            "pattern": r"https://pbs.twimg.com/card_img/1317274761030856707/",
+        }),
         # original retweets (#1026)
         ("https://twitter.com/jessica_3978/status/1296304589591810048", {
             "options": (("retweets", "original"),),
@@ -446,7 +467,7 @@ class TwitterAPI():
         cookies = self.extractor.session.cookies
 
         # CSRF
-        csrf = hashlib.md5(str(time.time()).encode()).hexdigest()
+        csrf = util.generate_csrf_token()
         self.headers["x-csrf-token"] = csrf
         cookies.set("ct0", csrf, domain=".twitter.com")
 
```
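The main refactor above moves media handling out of `items()`: `_extract_media()`, `_extract_card()`, and `_extract_twitpic()` all append plain dicts to a shared `files` list, and `items()` just numbers and yields them. For photos, the old class-level `sizes` tuple and the `Message.Urllist` yield are replaced by a single `:orig` URL plus a `_fallback` list of smaller variants. A standalone sketch of that photo-entry construction, using only the standard library in place of gallery_dl's `text.nameext_from_url()` helper and a made-up media URL:

```python
from urllib.parse import urlsplit

def photo_file(media, width=0, height=0):
    """Build a file dict the way _extract_media() does for photos:
    ask for the ':orig' variant first, keep smaller sizes as fallbacks."""
    url = media["media_url_https"]
    name = urlsplit(url).path.rpartition("/")[2]
    filename, _, extension = name.rpartition(".")
    return {
        "url"      : url + ":orig",
        "_fallback": [url + ":large", url + ":medium", url + ":small"],
        "width"    : width,
        "height"   : height,
        "filename" : filename,   # what text.nameext_from_url() would fill in
        "extension": extension,
    }

# Hypothetical 'media' entity trimmed down to the one key the sketch needs.
media = {"media_url_https": "https://pbs.twimg.com/media/EXAMPLE12345.jpg"}
print(photo_file(media, 1024, 768)["url"])  # .../EXAMPLE12345.jpg:orig
```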
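The new `_extract_card()` (enabled via the `cards` option, off by default) only resolves "summary" and "summary_large_image" cards itself: it scans the card's `binding_values` from the full-size photo down to the thumbnail and keeps the first image it finds, while any other card type falls back to a `ytdl:` URL for the whole tweet. A condensed sketch of that priority lookup, run against a hypothetical, trimmed-down `binding_values` payload:

```python
PREFIXES = ("photo_image_full_size_", "summary_photo_image_", "thumbnail_image_")
SIZES = ("original", "x_large", "large", "small")

def best_card_image(binding_values):
    """Return the first (i.e. largest) available image_value dict, or None."""
    for prefix in PREFIXES:
        for size in SIZES:
            value = binding_values.get(prefix + size)
            if value is not None:
                return value["image_value"]
    return None

# Hypothetical payload; real card responses carry many more binding values.
bvals = {
    "summary_photo_image_large": {
        "image_value": {"url": "https://pbs.twimg.com/card_img/123/example"},
    },
}
print(best_card_image(bvals)["url"])
```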
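Finally, the CSRF token sent both as the `x-csrf-token` header and the `ct0` cookie is no longer derived from `hashlib.md5(str(time.time()))`; it now comes from a shared `util.generate_csrf_token()` helper. That helper's implementation is not part of this diff; below is a minimal stand-in, assuming it only needs to return a random 32-character hex string of the same shape as the old md5 hexdigest:

```python
import secrets

def generate_csrf_token():
    # Assumed behavior, not gallery_dl's actual implementation:
    # 16 random bytes rendered as 32 hex characters.
    return secrets.token_hex(16)

csrf = generate_csrf_token()
headers = {"x-csrf-token": csrf}
cookies = {"ct0": csrf}  # the extractor sets this for the .twitter.com domain
print(len(csrf))  # 32, same length as an md5 hexdigest
```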
