diff options
Diffstat (limited to 'gallery_dl/extractor/twitter.py')
| -rw-r--r-- | gallery_dl/extractor/twitter.py | 252 |
1 files changed, 170 insertions, 82 deletions
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index bf125a6..546e8e1 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -76,7 +76,7 @@ class TwitterExtractor(Extractor): seen_tweets = set() if self.config("unique", True) else None if self.twitpic: - self._find_twitpic = util.re( + self._find_twitpic = text.re( r"https?(://twitpic\.com/(?!photos/)\w+)").findall tweets = self.tweets() @@ -124,12 +124,11 @@ class TwitterExtractor(Extractor): tdata = self._transform_tweet(tweet) tdata.update(metadata) tdata["count"] = len(files) - yield Message.Directory, tdata + yield Message.Directory, "", tdata - del tdata["source_id"] - del tdata["sensitive_flags"] - if "source_user" in tdata: - del tdata["source_user"] + tdata.pop("source_id", None) + tdata.pop("source_user", None) + tdata.pop("sensitive_flags", None) for tdata["num"], file in enumerate(files, 1): file.update(tdata) @@ -146,7 +145,7 @@ class TwitterExtractor(Extractor): self._extract_media( data, data["extended_entities"]["media"], files) except Exception as exc: - self.log.debug("", exc_info=exc) + self.log.traceback(exc) self.log.warning( "%s: Error while extracting media files (%s: %s)", data["id_str"], exc.__class__.__name__, exc) @@ -155,7 +154,7 @@ class TwitterExtractor(Extractor): try: self._extract_card(tweet, files) except Exception as exc: - self.log.debug("", exc_info=exc) + self.log.traceback(exc) self.log.warning( "%s: Error while extracting Card files (%s: %s)", data["id_str"], exc.__class__.__name__, exc) @@ -164,7 +163,7 @@ class TwitterExtractor(Extractor): try: self._extract_twitpic(data, files) except Exception as exc: - self.log.debug("", exc_info=exc) + self.log.traceback(exc) self.log.warning( "%s: Error while extracting TwitPic files (%s: %s)", data["id_str"], exc.__class__.__name__, exc) @@ -347,32 +346,36 @@ class TwitterExtractor(Extractor): files.append({"url": url}) def _transform_tweet(self, tweet): + if "legacy" in tweet: + legacy = tweet["legacy"] + else: + legacy = tweet + tweet_id = int(legacy["id_str"]) + if "author" in tweet: author = tweet["author"] elif "core" in tweet: - author = tweet["core"]["user_results"]["result"] + try: + author = tweet["core"]["user_results"]["result"] + except KeyError: + self.log.warning("%s: Missing 'author' data", tweet_id) + author = util.NONE else: author = tweet["user"] author = self._transform_user(author) - if "legacy" in tweet: - legacy = tweet["legacy"] - else: - legacy = tweet - tget = legacy.get - - tweet_id = int(legacy["id_str"]) if tweet_id >= 300000000000000: - date = text.parse_timestamp( + date = self.parse_timestamp( ((tweet_id >> 22) + 1288834974657) // 1000) else: try: - date = text.parse_datetime( + date = self.parse_datetime( legacy["created_at"], "%a %b %d %H:%M:%S %z %Y") except Exception: date = util.NONE source = tweet.get("source") + tget = legacy.get tdata = { "tweet_id" : tweet_id, "retweet_id" : text.parse_int( @@ -439,6 +442,8 @@ class TwitterExtractor(Extractor): txt, _, tco = content.rpartition(" ") tdata["content"] = txt if tco.startswith("https://t.co/") else content + if "pinned" in tweet: + tdata["pinned"] = True if "birdwatch_pivot" in tweet: try: tdata["birdwatch"] = \ @@ -455,7 +460,7 @@ class TwitterExtractor(Extractor): tdata, legacy["extended_entities"]["media"][0]) if tdata["retweet_id"]: tdata["content"] = f"RT @{author['name']}: {tdata['content']}" - tdata["date_original"] = text.parse_timestamp( + tdata["date_original"] = self.parse_timestamp( ((tdata["retweet_id"] >> 22) + 1288834974657) // 1000) return tdata @@ -492,7 +497,7 @@ class TwitterExtractor(Extractor): "id": text.parse_int(cid), "name": com.get("name"), "description": com.get("description"), - "date": text.parse_timestamp(com.get("created_at", 0) // 1000), + "date": self.parse_timestamp(com.get("created_at", 0) // 1000), "nsfw": com.get("is_nsfw"), "role": com.get("role"), "member_count": com.get("member_count"), @@ -528,13 +533,13 @@ class TwitterExtractor(Extractor): "id" : text.parse_int(uid), "name" : core.get("screen_name"), "nick" : core.get("name"), - "location" : user["location"]["location"], - "date" : text.parse_datetime( + "location" : user["location"].get("location"), + "date" : self.parse_datetime( core["created_at"], "%a %b %d %H:%M:%S %z %Y"), "verified" : user["verification"]["verified"], "protected" : user["privacy"]["protected"], "profile_banner" : lget("profile_banner_url", ""), - "profile_image" : user["avatar"]["image_url"].replace( + "profile_image" : user["avatar"].get("image_url", "").replace( "_normal.", "."), "favourites_count": lget("favourites_count"), "followers_count" : lget("followers_count"), @@ -591,9 +596,12 @@ class TwitterExtractor(Extractor): obj = tweet["legacy"] if "legacy" in tweet else tweet cid = obj.get("conversation_id_str") if not cid: - tid = obj["id_str"] - self.log.warning( - "Unable to expand %s (no 'conversation_id')", tid) + if cid is False: + yield tweet + else: + tid = obj["id_str"] + self.log.warning( + "Unable to expand %s (no 'conversation_id')", tid) continue if cid in seen: self.log.debug( @@ -608,6 +616,7 @@ class TwitterExtractor(Extractor): def _make_tweet(self, user, url, id_str): return { "id_str": id_str, + "conversation_id_str": False, "lang": None, "user": user, "source": "><", @@ -658,8 +667,8 @@ class TwitterExtractor(Extractor): class TwitterHomeExtractor(TwitterExtractor): """Extractor for Twitter home timelines""" subcategory = "home" - pattern = (BASE_PATTERN + - r"/(?:home(?:/fo(?:llowing|r[-_ ]?you()))?|i/timeline)/?$") + pattern = (rf"{BASE_PATTERN}/" + rf"(?:home(?:/fo(?:llowing|r[-_ ]?you()))?|i/timeline)/?$") example = "https://x.com/home" def tweets(self): @@ -671,7 +680,7 @@ class TwitterHomeExtractor(TwitterExtractor): class TwitterSearchExtractor(TwitterExtractor): """Extractor for Twitter search results""" subcategory = "search" - pattern = BASE_PATTERN + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)" + pattern = rf"{BASE_PATTERN}/search/?\?(?:[^&#]+&)*q=([^&#]+)" example = "https://x.com/search?q=QUERY" def metadata(self): @@ -702,7 +711,7 @@ class TwitterSearchExtractor(TwitterExtractor): class TwitterHashtagExtractor(TwitterExtractor): """Extractor for Twitter hashtags""" subcategory = "hashtag" - pattern = BASE_PATTERN + r"/hashtag/([^/?#]+)" + pattern = rf"{BASE_PATTERN}/hashtag/([^/?#]+)" example = "https://x.com/hashtag/NAME" def items(self): @@ -713,7 +722,7 @@ class TwitterHashtagExtractor(TwitterExtractor): class TwitterUserExtractor(Dispatch, TwitterExtractor): """Extractor for a Twitter user""" - pattern = (BASE_PATTERN + r"/(?:" + pattern = (rf"{BASE_PATTERN}/(?:" r"([^/?#]+)/?(?:$|\?|#)" r"|i(?:/user/|ntent/user\?user_id=)(\d+))") example = "https://x.com/USER" @@ -890,7 +899,7 @@ class TwitterLikesExtractor(TwitterExtractor): class TwitterBookmarkExtractor(TwitterExtractor): """Extractor for bookmarked tweets""" subcategory = "bookmark" - pattern = BASE_PATTERN + r"/i/bookmarks()" + pattern = rf"{BASE_PATTERN}/i/bookmarks()" example = "https://x.com/i/bookmarks" def tweets(self): @@ -898,7 +907,7 @@ class TwitterBookmarkExtractor(TwitterExtractor): def _transform_tweet(self, tweet): tdata = TwitterExtractor._transform_tweet(self, tweet) - tdata["date_bookmarked"] = text.parse_timestamp( + tdata["date_bookmarked"] = self.parse_timestamp( (int(tweet["sortIndex"] or 0) >> 20) // 1000) return tdata @@ -906,7 +915,7 @@ class TwitterBookmarkExtractor(TwitterExtractor): class TwitterListExtractor(TwitterExtractor): """Extractor for Twitter lists""" subcategory = "list" - pattern = BASE_PATTERN + r"/i/lists/(\d+)/?$" + pattern = rf"{BASE_PATTERN}/i/lists/(\d+)/?$" example = "https://x.com/i/lists/12345" def tweets(self): @@ -916,7 +925,7 @@ class TwitterListExtractor(TwitterExtractor): class TwitterListMembersExtractor(TwitterExtractor): """Extractor for members of a Twitter list""" subcategory = "list-members" - pattern = BASE_PATTERN + r"/i/lists/(\d+)/members" + pattern = rf"{BASE_PATTERN}/i/lists/(\d+)/members" example = "https://x.com/i/lists/12345/members" def items(self): @@ -952,7 +961,7 @@ class TwitterCommunityExtractor(TwitterExtractor): directory_fmt = ("{category}", "Communities", "{community[name]} ({community[id]})") archive_fmt = "C_{community[id]}_{tweet_id}_{num}" - pattern = BASE_PATTERN + r"/i/communities/(\d+)" + pattern = rf"{BASE_PATTERN}/i/communities/(\d+)" example = "https://x.com/i/communities/12345" def tweets(self): @@ -966,7 +975,7 @@ class TwitterCommunitiesExtractor(TwitterExtractor): subcategory = "communities" directory_fmt = TwitterCommunityExtractor.directory_fmt archive_fmt = TwitterCommunityExtractor.archive_fmt - pattern = BASE_PATTERN + r"/([^/?#]+)/communities/?$" + pattern = rf"{BASE_PATTERN}/([^/?#]+)/communities/?$" example = "https://x.com/i/communities" def tweets(self): @@ -978,7 +987,7 @@ class TwitterEventExtractor(TwitterExtractor): subcategory = "event" directory_fmt = ("{category}", "Events", "{event[id]} {event[short_title]}") - pattern = BASE_PATTERN + r"/i/events/(\d+)" + pattern = rf"{BASE_PATTERN}/i/events/(\d+)" example = "https://x.com/i/events/12345" def metadata(self): @@ -991,7 +1000,7 @@ class TwitterEventExtractor(TwitterExtractor): class TwitterTweetExtractor(TwitterExtractor): """Extractor for individual tweets""" subcategory = "tweet" - pattern = (BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)" + pattern = (rf"{BASE_PATTERN}/([^/?#]+|i/web)/status/(\d+)" r"/?(?:$|\?|#|photo/|video/)") example = "https://x.com/USER/status/12345" @@ -1072,7 +1081,7 @@ class TwitterTweetExtractor(TwitterExtractor): class TwitterQuotesExtractor(TwitterExtractor): """Extractor for quotes of a Tweet""" subcategory = "quotes" - pattern = BASE_PATTERN + r"/(?:[^/?#]+|i/web)/status/(\d+)/quotes" + pattern = rf"{BASE_PATTERN}/(?:[^/?#]+|i/web)/status/(\d+)/quotes" example = "https://x.com/USER/status/12345/quotes" def items(self): @@ -1096,7 +1105,7 @@ class TwitterInfoExtractor(TwitterExtractor): else: user = api.user_by_screen_name(screen_name) - return iter(((Message.Directory, self._transform_user(user)),)) + return iter(((Message.Directory, "", self._transform_user(user)),)) class TwitterAvatarExtractor(TwitterExtractor): @@ -1162,7 +1171,7 @@ class TwitterImageExtractor(Extractor): "_fallback": TwitterExtractor._image_fallback(self, base), } - yield Message.Directory, data + yield Message.Directory, "", data yield Message.Url, base + self._size_image, data @@ -1369,7 +1378,7 @@ class TwitterAPI(): endpoint = "/graphql/E8Wq-_jFSaU7hxVcuOPR9g/UserTweets" variables = { "userId": self._user_id_by_screen_name(screen_name), - "count": 100, + "count": self.extractor.config("limit", 50), "includePromotedContent": False, "withQuickPromoteEligibilityTweetFields": False, "withVoice": True, @@ -1384,7 +1393,7 @@ class TwitterAPI(): endpoint = "/graphql/-O3QOHrVn1aOm_cF5wyTCQ/UserTweetsAndReplies" variables = { "userId": self._user_id_by_screen_name(screen_name), - "count": 100, + "count": self.extractor.config("limit", 50), "includePromotedContent": False, "withCommunity": True, "withVoice": True, @@ -1399,7 +1408,7 @@ class TwitterAPI(): endpoint = "/graphql/gmHw9geMTncZ7jeLLUUNOw/UserHighlightsTweets" variables = { "userId": self._user_id_by_screen_name(screen_name), - "count": 100, + "count": self.extractor.config("limit", 50), "includePromotedContent": False, "withVoice": True, } @@ -1413,7 +1422,7 @@ class TwitterAPI(): endpoint = "/graphql/jCRhbOzdgOHp6u9H4g2tEg/UserMedia" variables = { "userId": self._user_id_by_screen_name(screen_name), - "count": 100, + "count": self.extractor.config("limit", 50), "includePromotedContent": False, "withClientEventToken": False, "withBirdwatchNotes": False, @@ -1429,7 +1438,7 @@ class TwitterAPI(): endpoint = "/graphql/TGEKkJG_meudeaFcqaxM-Q/Likes" variables = { "userId": self._user_id_by_screen_name(screen_name), - "count": 100, + "count": self.extractor.config("limit", 50), "includePromotedContent": False, "withClientEventToken": False, "withBirdwatchNotes": False, @@ -1444,32 +1453,45 @@ class TwitterAPI(): def user_bookmarks(self): endpoint = "/graphql/pLtjrO4ubNh996M_Cubwsg/Bookmarks" variables = { - "count": 100, + "count": self.extractor.config("limit", 50), "includePromotedContent": False, } return self._pagination_tweets( endpoint, variables, ("bookmark_timeline_v2", "timeline"), stop_tweets=128) - def search_timeline(self, query, product="Latest"): + def search_timeline(self, query, product=None): + cfg = self.extractor.config + + if product is None: + if product := cfg("search-results"): + product = { + "top" : "Top", + "live" : "Latest", + "user" : "People", + "media": "Media", + "list" : "Lists", + }.get(product.lower(), product).capitalize() + else: + product = "Latest" + endpoint = "/graphql/4fpceYZ6-YQCx_JSl_Cn_A/SearchTimeline" variables = { "rawQuery": query, - "count": self.extractor.config("search-limit", 20), + "count": cfg("search-limit", 20), "querySource": "typed_query", "product": product, "withGrokTranslatedBio": False, } - if self.extractor.config("search-pagination") in ( - "max_id", "maxid", "id"): + if cfg("search-pagination") in ("max_id", "maxid", "id"): update_variables = self._update_variables_search else: update_variables = None - stop_tweets = self.extractor.config("search-stop") + stop_tweets = cfg("search-stop") if stop_tweets is None or stop_tweets == "auto": - stop_tweets = 3 if update_variables is None else 0 + stop_tweets = 3 return self._pagination_tweets( endpoint, variables, @@ -1494,7 +1516,7 @@ class TwitterAPI(): endpoint = "/graphql/Nyt-88UX4-pPCImZNUl9RQ/CommunityTweetsTimeline" variables = { "communityId": community_id, - "count": 100, + "count": self.extractor.config("limit", 50), "displayLocation": "Community", "rankingMode": "Recency", "withCommunity": True, @@ -1508,7 +1530,7 @@ class TwitterAPI(): endpoint = "/graphql/ZniZ7AAK_VVu1xtSx1V-gQ/CommunityMediaTimeline" variables = { "communityId": community_id, - "count": 100, + "count": self.extractor.config("limit", 50), "withCommunity": True, } return self._pagination_tweets( @@ -1520,7 +1542,7 @@ class TwitterAPI(): endpoint = ("/graphql/p048a9n3hTPppQyK7FQTFw" "/CommunitiesMainPageTimeline") variables = { - "count": 100, + "count": self.extractor.config("limit", 50), "withCommunity": True, } return self._pagination_tweets( @@ -1530,7 +1552,7 @@ class TwitterAPI(): def home_timeline(self): endpoint = "/graphql/DXmgQYmIft1oLP6vMkJixw/HomeTimeline" variables = { - "count": 100, + "count": self.extractor.config("limit", 50), "includePromotedContent": False, "latestControlAvailable": True, "withCommunity": True, @@ -1541,7 +1563,7 @@ class TwitterAPI(): def home_latest_timeline(self): endpoint = "/graphql/SFxmNKWfN9ySJcXG_tjX8g/HomeLatestTimeline" variables = { - "count": 100, + "count": self.extractor.config("limit", 50), "includePromotedContent": False, "latestControlAvailable": True, } @@ -1568,7 +1590,7 @@ class TwitterAPI(): endpoint = "/graphql/06JtmwM8k_1cthpFZITVVA/ListLatestTweetsTimeline" variables = { "listId": list_id, - "count": 100, + "count": self.extractor.config("limit", 50), } return self._pagination_tweets( endpoint, variables, ("list", "tweets_timeline", "timeline")) @@ -1654,10 +1676,8 @@ class TwitterAPI(): self.extractor._assign_user(user) return user["rest_id"] except KeyError: - if "unavailable_message" in user: - raise exception.NotFoundError( - f"{user['unavailable_message'].get('text')} " - f"({user.get('reason')})", False) + if user and user.get("__typename") == "UserUnavailable": + raise exception.NotFoundError(user["message"], False) else: raise exception.NotFoundError("user") @@ -1700,7 +1720,7 @@ class TwitterAPI(): self.client_transaction.generate_transaction_id(method, path) def _call(self, endpoint, params, method="GET", auth=True, root=None): - url = (root or self.root) + endpoint + url = (self.root if root is None else root) + endpoint while True: if auth: @@ -1877,8 +1897,17 @@ class TwitterAPI(): features=None, field_toggles=None): extr = self.extractor original_retweets = (extr.retweets == "original") - pinned_tweet = extr.pinned + pinned_tweet = True if extr.pinned else None stop_tweets_max = stop_tweets + api_retries = None + + if isinstance(count := variables.get("count"), list): + count = count.copy() + count.reverse() + self.log.debug("Using 'count: %s'", count[-1]) + variables["count"] = count.pop() + else: + count = False params = {"variables": None} if cursor := extr._init_cursor(): @@ -1892,14 +1921,14 @@ class TwitterAPI(): while True: params["variables"] = self._json_dumps(variables) - data = self._call(endpoint, params)["data"] + data = self._call(endpoint, params) try: if path is None: - instructions = (data["user"]["result"]["timeline"] + instructions = (data["data"]["user"]["result"]["timeline"] ["timeline"]["instructions"]) else: - instructions = data + instructions = data["data"] for key in path: instructions = instructions[key] instructions = instructions["instructions"] @@ -1916,7 +1945,7 @@ class TwitterAPI(): elif instr_type == "TimelineAddToModule": entries = instr["moduleItems"] elif instr_type == "TimelinePinEntry": - if pinned_tweet: + if pinned_tweet is not None: pinned_tweet = instr["entry"] elif instr_type == "TimelineReplaceEntry": entry = instr["entry"] @@ -1930,6 +1959,26 @@ class TwitterAPI(): except LookupError: extr.log.debug(data) + if errors := data.get("errors"): + if api_retries is None: + api_tries = 1 + api_retries = extr.config("retries-api", 9) + if api_retries < 0: + api_retries = float("inf") + + err = [] + srv = False + for e in errors: + err.append(f"- '{e.get('message') or e.get('name')}'") + if e.get("source") == "Server": + srv = True + + self.log.warning("API errors (%s/%s):\n%s", + api_tries, api_retries+1, "\n".join(err)) + if srv and api_tries <= api_retries: + api_tries += 1 + continue + if user := extr._user_obj: user = user["legacy"] if user.get("blocked_by"): @@ -1950,14 +1999,13 @@ class TwitterAPI(): "Unable to retrieve Tweets from this timeline") tweets = [] - tweet = None + tweet = last_tweet = retry = None + api_tries = 1 - if pinned_tweet: - if isinstance(pinned_tweet, dict): - tweets.append(pinned_tweet) - elif instructions[-1]["type"] == "TimelinePinEntry": - tweets.append(instructions[-1]["entry"]) - pinned_tweet = False + if pinned_tweet is not None and isinstance(pinned_tweet, dict): + pinned_tweet["pinned"] = True + tweets.append(pinned_tweet) + pinned_tweet = None for entry in entries: esw = entry["entryId"].startswith @@ -1965,6 +2013,7 @@ class TwitterAPI(): if esw("tweet-"): tweets.append(entry) elif esw(("profile-grid-", + "search-grid-", "communities-grid-")): if "content" in entry: tweets.extend(entry["content"]["items"]) @@ -1988,6 +2037,28 @@ class TwitterAPI(): tweet = True cursor = cursor.get("value") + if pinned_tweet is not None: + if extr._user_obj is None: + pinned = None + elif pinned := extr._user_obj["legacy"].get( + "pinned_tweet_ids_str"): + pinned = f"-tweet-{pinned[0]}" + for idx, entry in enumerate(tweets): + if entry["entryId"].endswith(pinned): + # mark as pinned / set 'pinned = True' + pinned_tweet = ( + (entry.get("content") or entry["item"]) + ["itemContent"]["tweet_results"]["result"]) + if "tweet" in pinned_tweet: + pinned_tweet = pinned_tweet["tweet"] + pinned_tweet["pinned"] = True + # move to front of 'tweets' + del tweets[idx] + tweets.insert(0, entry) + break + del pinned + pinned_tweet = None + for entry in tweets: try: item = ((entry.get("content") or entry["item"]) @@ -2015,6 +2086,16 @@ class TwitterAPI(): (entry.get("entryId") or "").rpartition("-")[2]) continue + if retry is None: + try: + tweet["core"]["user_results"]["result"] + retry = False + except KeyError: + self.log.warning("Received Tweet results without " + "'core' data ... Retrying") + retry = True + break + if "retweeted_status_result" in legacy: try: retweet = legacy["retweeted_status_result"]["result"] @@ -2071,18 +2152,25 @@ class TwitterAPI(): tweet.get("rest_id")) continue - if tweet: + if retry: + continue + elif tweet: stop_tweets = stop_tweets_max last_tweet = tweet - else: - if stop_tweets <= 0: + elif stop_tweets <= 0: + if not count: return extr._update_cursor(None) + self.log.debug("Switching to 'count: %s'", count[-1]) + variables["count"] = count.pop() + continue + else: self.log.debug( "No Tweet results (%s/%s)", stop_tweets_max - stop_tweets + 1, stop_tweets_max) stop_tweets -= 1 if not cursor or cursor == variables.get("cursor"): + self.log.debug("No continuation cursor") return extr._update_cursor(None) if update_variables is None: @@ -2169,7 +2257,7 @@ class TwitterAPI(): else: variables["rawQuery"] = f"{query} {max_id}" - if prefix := self.extractor._cursor_prefix: + if prefix := getattr(self.extractor, "_cursor_prefix", None): self.extractor._cursor_prefix = \ f"{prefix.partition('_')[0]}_{tweet_id}/" variables["cursor"] = None |
