diff options
| author | 2023-01-11 04:09:13 -0500 | |
|---|---|---|
| committer | 2023-01-11 04:09:13 -0500 | |
| commit | fe385c3ff784ba3d19454a35446502c0ec295893 (patch) | |
| tree | 897982793ef2a0c0f349044bf4cf803ccd483e6e /gallery_dl/extractor/twitter.py | |
| parent | ebdfcd3cd3f76534a590ba08933ff7ea54813316 (diff) | |
New upstream version 1.24.3. (tag: upstream/1.24.3)
Diffstat (limited to 'gallery_dl/extractor/twitter.py')
| -rw-r--r-- | gallery_dl/extractor/twitter.py | 74 |
1 files changed, 46 insertions, 28 deletions
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 22aa78e..c2d8247 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2022 Mike Fährmann +# Copyright 2016-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -39,6 +39,7 @@ class TwitterExtractor(Extractor): self.videos = self.config("videos", True) self.cards = self.config("cards", False) self.cards_blacklist = self.config("cards-blacklist") + self.syndication = self.config("syndication") self._user = self._user_obj = None self._user_cache = {} self._init_sizes() @@ -75,11 +76,6 @@ class TwitterExtractor(Extractor): else: data = tweet - if seen_tweets is not None: - if data["id_str"] in seen_tweets: - continue - seen_tweets.add(data["id_str"]) - if not self.retweets and "retweeted_status_id_str" in data: self.log.debug("Skipping %s (retweet)", data["id_str"]) continue @@ -97,6 +93,13 @@ class TwitterExtractor(Extractor): self.log.debug("Skipping %s (reply)", data["id_str"]) continue + if seen_tweets is not None: + if data["id_str"] in seen_tweets: + self.log.debug( + "Skipping %s (previously seen)", data["id_str"]) + continue + seen_tweets.add(data["id_str"]) + files = [] if "extended_entities" in data: self._extract_media( @@ -220,14 +223,16 @@ class TwitterExtractor(Extractor): def _extract_twitpic(self, tweet, files): for url in tweet["entities"].get("urls", ()): url = url["expanded_url"] - if "//twitpic.com/" in url and "/photos/" not in url: - response = self.request(url, fatal=False) - if response.status_code >= 400: - continue - url = text.extr( - response.text, 'name="twitter:image" value="', '"') - if url: - files.append({"url": url}) + if "//twitpic.com/" not in url or "/photos/" in url: + continue + if url.startswith("http:"): + url = "https" + url[4:] + response = 
self.request(url, fatal=False) + if response.status_code >= 400: + continue + url = text.extr(response.text, 'name="twitter:image" value="', '"') + if url: + files.append({"url": url}) def _transform_tweet(self, tweet): if "author" in tweet: @@ -299,6 +304,9 @@ class TwitterExtractor(Extractor): if "legacy" in user: user = user["legacy"] + elif "statuses_count" not in user and self.syndication == "extended": + # try to fetch extended user data + user = self.api.user_by_screen_name(user["screen_name"])["legacy"] uget = user.get entities = user["entities"] @@ -361,18 +369,22 @@ class TwitterExtractor(Extractor): def _expand_tweets(self, tweets): seen = set() for tweet in tweets: - - if "legacy" in tweet: - cid = tweet["legacy"]["conversation_id_str"] - else: - cid = tweet["conversation_id_str"] - - if cid not in seen: - seen.add(cid) - try: - yield from self.api.tweet_detail(cid) - except Exception: - yield tweet + obj = tweet["legacy"] if "legacy" in tweet else tweet + cid = obj.get("conversation_id_str") + if not cid: + tid = obj["id_str"] + self.log.warning( + "Unable to expand %s (no 'conversation_id')", tid) + continue + if cid in seen: + self.log.debug( + "Skipping expansion of %s (previously seen)", cid) + continue + seen.add(cid) + try: + yield from self.api.tweet_detail(cid) + except Exception: + yield tweet def _make_tweet(self, user, id_str, url, timestamp): return { @@ -772,7 +784,7 @@ class TwitterTweetExtractor(TwitterExtractor): # age-restricted (#2354) ("https://twitter.com/mightbecursed/status/1492954264909479936", { "options": (("syndication", True),), - "keywords": {"date": "dt:2022-02-13 20:10:09"}, + "keyword": {"date": "dt:2022-02-13 20:10:09"}, "count": 1, }), # media alt texts / descriptions (#2617) @@ -991,7 +1003,7 @@ class TwitterAPI(): } self._nsfw_warning = True - self._syndication = extractor.config("syndication") + self._syndication = self.extractor.syndication self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode cookies = 
extractor.session.cookies @@ -1516,6 +1528,12 @@ class TwitterAPI(): else: retweet_id = None + # assume 'conversation_id' is the same as 'id' when the tweet + # is not a reply + if "conversation_id_str" not in tweet and \ + "in_reply_to_status_id_str" not in tweet: + tweet["conversation_id_str"] = tweet["id_str"] + tweet["created_at"] = text.parse_datetime( tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ").strftime( "%a %b %d %H:%M:%S +0000 %Y") |
