diff options
Diffstat (limited to 'gallery_dl/extractor/twitter.py')
| -rw-r--r-- | gallery_dl/extractor/twitter.py | 27 |
1 files changed, 25 insertions, 2 deletions
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 5e68f13..c47021e 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -13,6 +13,7 @@ from .. import text, util, exception from ..cache import cache import itertools import json +import re BASE_PATTERN = r"(?:https?://)?(?:www\.|mobile\.)?(?:[fv]x)?twitter\.com" @@ -75,6 +76,10 @@ class TwitterExtractor(Extractor): else: seen_tweets = None + if self.twitpic: + self._find_twitpic = re.compile( + r"https?(://twitpic\.com/(?!photos/)\w+)").findall + for tweet in self.tweets(): if "legacy" in tweet: @@ -231,12 +236,24 @@ class TwitterExtractor(Extractor): files.append({"url": url}) def _extract_twitpic(self, tweet, files): - for url in tweet["entities"].get("urls", ()): + urls = {} + + # collect URLs from entities + for url in tweet["entities"].get("urls") or (): url = url["expanded_url"] if "//twitpic.com/" not in url or "/photos/" in url: continue if url.startswith("http:"): url = "https" + url[4:] + urls[url] = None + + # collect URLs from text + for url in self._find_twitpic( + tweet.get("full_text") or tweet.get("text") or ""): + urls["https" + url] = None + + # extract actual URLs + for url in urls: response = self.request(url, fatal=False) if response.status_code >= 400: continue @@ -781,7 +798,13 @@ class TwitterTweetExtractor(TwitterExtractor): ("https://twitter.com/i/web/status/112900228289540096", { "options": (("twitpic", True), ("cards", False)), "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg", - "count": 3, + "count": 2, # 1 duplicate + }), + # TwitPic URL not in 'urls' (#3792) + ("https://twitter.com/shimoigusaP/status/8138669971", { + "options": (("twitpic", True),), + "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.png", + "count": 1, }), # Twitter card (#1005) ("https://twitter.com/billboard/status/1306599586602135555", { |
