New upstream version 1.24.3. (tag: upstream/1.24.3)

author: Unit 193 <unit193@unit193.net> 2023-01-11 04:09:13 -0500
committer: Unit 193 <unit193@unit193.net> 2023-01-11 04:09:13 -0500
commit: fe385c3ff784ba3d19454a35446502c0ec295893 (patch)
tree: 897982793ef2a0c0f349044bf4cf803ccd483e6e /gallery_dl/extractor/twitter.py
parent: ebdfcd3cd3f76534a590ba08933ff7ea54813316 (diff)
1 file changed, 46 insertions, 28 deletions
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 22aa78e..c2d8247 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2016-2022 Mike Fährmann
+# Copyright 2016-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -39,6 +39,7 @@ class TwitterExtractor(Extractor):
         self.videos = self.config("videos", True)
         self.cards = self.config("cards", False)
         self.cards_blacklist = self.config("cards-blacklist")
+        self.syndication = self.config("syndication")
         self._user = self._user_obj = None
         self._user_cache = {}
         self._init_sizes()
@@ -75,11 +76,6 @@ class TwitterExtractor(Extractor):
             else:
                 data = tweet
 
-            if seen_tweets is not None:
-                if data["id_str"] in seen_tweets:
-                    continue
-                seen_tweets.add(data["id_str"])
-
             if not self.retweets and "retweeted_status_id_str" in data:
                 self.log.debug("Skipping %s (retweet)", data["id_str"])
                 continue
@@ -97,6 +93,13 @@ class TwitterExtractor(Extractor):
                 self.log.debug("Skipping %s (reply)", data["id_str"])
                 continue
 
+            if seen_tweets is not None:
+                if data["id_str"] in seen_tweets:
+                    self.log.debug(
+                        "Skipping %s (previously seen)", data["id_str"])
+                    continue
+                seen_tweets.add(data["id_str"])
+
             files = []
             if "extended_entities" in data:
                 self._extract_media(
@@ -220,14 +223,16 @@ class TwitterExtractor(Extractor):
     def _extract_twitpic(self, tweet, files):
         for url in tweet["entities"].get("urls", ()):
             url = url["expanded_url"]
-            if "//twitpic.com/" in url and "/photos/" not in url:
-                response = self.request(url, fatal=False)
-                if response.status_code >= 400:
-                    continue
-                url = text.extr(
-                    response.text, 'name="twitter:image" value="', '"')
-                if url:
-                    files.append({"url": url})
+            if "//twitpic.com/" not in url or "/photos/" in url:
+                continue
+            if url.startswith("http:"):
+                url = "https" + url[4:]
+            response = self.request(url, fatal=False)
+            if response.status_code >= 400:
+                continue
+            url = text.extr(response.text, 'name="twitter:image" value="', '"')
+            if url:
+                files.append({"url": url})
 
     def _transform_tweet(self, tweet):
         if "author" in tweet:
@@ -299,6 +304,9 @@ class TwitterExtractor(Extractor):
 
         if "legacy" in user:
             user = user["legacy"]
+        elif "statuses_count" not in user and self.syndication == "extended":
+            # try to fetch extended user data
+            user = self.api.user_by_screen_name(user["screen_name"])["legacy"]
 
         uget = user.get
         entities = user["entities"]
@@ -361,18 +369,22 @@ class TwitterExtractor(Extractor):
     def _expand_tweets(self, tweets):
         seen = set()
         for tweet in tweets:
-
-            if "legacy" in tweet:
-                cid = tweet["legacy"]["conversation_id_str"]
-            else:
-                cid = tweet["conversation_id_str"]
-
-            if cid not in seen:
-                seen.add(cid)
-                try:
-                    yield from self.api.tweet_detail(cid)
-                except Exception:
-                    yield tweet
+            obj = tweet["legacy"] if "legacy" in tweet else tweet
+            cid = obj.get("conversation_id_str")
+            if not cid:
+                tid = obj["id_str"]
+                self.log.warning(
+                    "Unable to expand %s (no 'conversation_id')", tid)
+                continue
+            if cid in seen:
+                self.log.debug(
+                    "Skipping expansion of %s (previously seen)", cid)
+                continue
+            seen.add(cid)
+            try:
+                yield from self.api.tweet_detail(cid)
+            except Exception:
+                yield tweet
 
     def _make_tweet(self, user, id_str, url, timestamp):
         return {
@@ -772,7 +784,7 @@ class TwitterTweetExtractor(TwitterExtractor):
         # age-restricted (#2354)
         ("https://twitter.com/mightbecursed/status/1492954264909479936", {
             "options": (("syndication", True),),
-            "keywords": {"date": "dt:2022-02-13 20:10:09"},
+            "keyword": {"date": "dt:2022-02-13 20:10:09"},
             "count": 1,
         }),
         # media alt texts / descriptions (#2617)
@@ -991,7 +1003,7 @@ class TwitterAPI():
         }
 
         self._nsfw_warning = True
-        self._syndication = extractor.config("syndication")
+        self._syndication = self.extractor.syndication
         self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
 
         cookies = extractor.session.cookies
@@ -1516,6 +1528,12 @@ class TwitterAPI():
         else:
             retweet_id = None
 
+        # assume 'conversation_id' is the same as 'id' when the tweet
+        # is not a reply
+        if "conversation_id_str" not in tweet and \
+                "in_reply_to_status_id_str" not in tweet:
+            tweet["conversation_id_str"] = tweet["id_str"]
+
         tweet["created_at"] = text.parse_datetime(
             tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ").strftime(
             "%a %b %d %H:%M:%S +0000 %Y")
author	Unit 193 <unit193@unit193.net>	2023-01-11 04:09:13 -0500
committer	Unit 193 <unit193@unit193.net>	2023-01-11 04:09:13 -0500
commit	fe385c3ff784ba3d19454a35446502c0ec295893 (patch)
tree	897982793ef2a0c0f349044bf4cf803ccd483e6e /gallery_dl/extractor/twitter.py
parent	ebdfcd3cd3f76534a590ba08933ff7ea54813316 (diff)