diff options
Diffstat (limited to 'gallery_dl/extractor/twitter.py')
| -rw-r--r-- | gallery_dl/extractor/twitter.py | 27 |
1 files changed, 23 insertions, 4 deletions
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index ad4dc46..ccba640 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -11,6 +11,7 @@ from .common import Extractor, Message from .. import text, exception from ..cache import cache +import re class TwitterExtractor(Extractor): @@ -26,8 +27,13 @@ class TwitterExtractor(Extractor): Extractor.__init__(self, match) self.user = match.group(1) self.retweets = self.config("retweets", True) + self.content = self.config("content", False) self.videos = self.config("videos", False) + if self.content: + self._emoji_sub = re.compile( + r'<img class="Emoji [^>]+ alt="([^"]+)"[^>]*>').sub + def items(self): self.login() yield Message.Version, 1 @@ -35,6 +41,7 @@ class TwitterExtractor(Extractor): for tweet in self.tweets(): data = self._data_from_tweet(tweet) + if not self.retweets and data["retweet_id"]: continue @@ -87,10 +94,9 @@ class TwitterExtractor(Extractor): raise exception.AuthenticationError() return self.session.cookies - @staticmethod - def _data_from_tweet(tweet): + def _data_from_tweet(self, tweet): extr = text.extract_from(tweet) - return { + data = { "tweet_id" : text.parse_int(extr('data-tweet-id="' , '"')), "retweet_id": text.parse_int(extr('data-retweet-id="', '"')), "retweeter" : extr('data-retweeter="' , '"'), @@ -99,6 +105,14 @@ class TwitterExtractor(Extractor): "user_id" : text.parse_int(extr('data-user-id="' , '"')), "date" : text.parse_timestamp(extr('data-time="', '"')), } + if self.content: + content = extr('<div class="js-tweet-text-container">', '\n</div>') + if '<img class="Emoji ' in content: + content = self._emoji_sub(r"\1", content) + content = text.unescape(text.remove_html(content, "", "")) + cl, _, cr = content.rpartition("pic.twitter.com/") + data["content"] = cl if cl and len(cr) < 16 else content + return data def _tweets_from_api(self, url): params = { @@ -186,6 +200,11 @@ class TwitterTweetExtractor(TwitterExtractor): "options": (("videos", True),), "pattern": r"ytdl:https://twitter.com/perrypumas/status/\d+", }), + # content with emoji, newlines, hashtags (#338) + ("https://twitter.com/yumi_san0112/status/1151144618936823808", { + "options": (("content", True),), + "keyword": "b13b6c4cd0b0c15b2ea7685479e7fedde3c47b9e", + }), ) def __init__(self, match): @@ -199,4 +218,4 @@ class TwitterTweetExtractor(TwitterExtractor): url = "{}/{}/status/{}".format(self.root, self.user, self.tweet_id) page = self.request(url).text return (text.extract( - page, '<div class="tweet ', '<ul class="stats')[0],) + page, '<div class="tweet ', 'class="js-tweet-stats-container')[0],) |
