summary | refs | log | tree | commit | diff | stats
path: root/gallery_dl/extractor/twitter.py
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/extractor/twitter.py')
-rw-r--r-- gallery_dl/extractor/twitter.py 116
1 file changed, 97 insertions, 19 deletions
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 8105ede..dfafc1f 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -10,7 +10,7 @@
from .common import Extractor, Message
from .. import text, exception
-from ..cache import cache
+from ..cache import cache, memcache
import re
@@ -26,6 +26,7 @@ class TwitterExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
self.user = match.group(1)
+ self.logged_in = False
self.retweets = self.config("retweets", True)
self.content = self.config("content", False)
self.videos = self.config("videos", False)
@@ -53,10 +54,20 @@ class TwitterExtractor(Extractor):
yield Message.Urllist, urls, data
if self.videos and "-videoContainer" in tweet:
+ if self.videos == "ytdl":
+ data["extension"] = None
+ url = "ytdl:{}/{}/status/{}".format(
+ self.root, data["user"], data["tweet_id"])
+ else:
+ url = self._video_from_tweet(data["tweet_id"])
+ ext = text.ext_from_url(url)
+ if ext == "m3u8":
+ url = "ytdl:" + url
+ data["extension"] = "mp4"
+ data["_ytdl_extra"] = {"protocol": "m3u8_native"}
+ else:
+ data["extension"] = ext
data["num"] = 1
- data["extension"] = None
- url = "ytdl:{}/{}/status/{}".format(
- self.root, data["user"], data["tweet_id"])
yield Message.Url, url, data
def metadata(self):
@@ -70,6 +81,7 @@ class TwitterExtractor(Extractor):
username, password = self._get_auth_info()
if username:
self._update_cookies(self._login_impl(username, password))
+ self.logged_in = True
@cache(maxage=360*24*3600, keyarg=1)
def _login_impl(self, username, password):
@@ -115,17 +127,48 @@ class TwitterExtractor(Extractor):
data["content"] = cl if cl and len(cr) < 16 else content
return data
- def _tweets_from_api(self, url):
+ def _video_from_tweet(self, tweet_id):
+ url = "https://api.twitter.com/1.1/videos/tweet/config/{}.json".format(
+ tweet_id)
+ cookies = None
+ headers = {
+ "Origin" : self.root,
+ "Referer" : "{}/i/web/status/{}".format(self.root, tweet_id),
+ "x-csrf-token" : self.session.cookies.get("ct0"),
+ "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekM"
+ "xqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28N"
+ "HfOPqkca3qaAxGfsyKCs0wRbw",
+ }
+
+ if self.logged_in:
+ headers["x-twitter-auth-type"] = "OAuth2Session"
+ else:
+ token = self._guest_token(headers)
+ cookies = {"gt": token}
+ headers["x-guest-token"] = token
+
+ data = self.request(url, cookies=cookies, headers=headers).json()
+ return data["track"]["playbackUrl"]
+
+ @memcache()
+ def _guest_token(self, headers):
+ return self.request(
+ "https://api.twitter.com/1.1/guest/activate.json",
+ method="POST", headers=headers,
+ ).json().get("guest_token")
+
+ def _tweets_from_api(self, url, max_position=None):
params = {
"include_available_features": "1",
"include_entities": "1",
+ "max_position": max_position,
"reset_error_state": "false",
"lang": "en",
}
headers = {
"X-Requested-With": "XMLHttpRequest",
"X-Twitter-Active-User": "yes",
- "Referer": "{}/{}".format(self.root, self.user)
+ "Referer": self.root + "/",
}
while True:
@@ -140,18 +183,23 @@ class TwitterExtractor(Extractor):
if not data["has_more_items"]:
return
- position = text.parse_int(text.extract(
- tweet, 'data-tweet-id="', '"')[0])
- if "max_position" in params and position >= params["max_position"]:
- return
- params["max_position"] = position
+ if "min_position" in data:
+ position = data["min_position"]
+ if position == max_position:
+ return
+ else:
+ position = text.parse_int(text.extract(
+ tweet, 'data-tweet-id="', '"')[0])
+ if max_position and position >= max_position:
+ return
+ params["max_position"] = max_position = position
class TwitterTimelineExtractor(TwitterExtractor):
"""Extractor for all images from a user's timeline"""
subcategory = "timeline"
pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
- r"/([^/?&#]+)/?(?:$|[?#])")
+ r"/(?!search)([^/?&#]+)/?(?:$|[?#])")
test = (
("https://twitter.com/supernaturepics", {
"range": "1-40",
@@ -171,7 +219,7 @@ class TwitterMediaExtractor(TwitterExtractor):
"""Extractor for all images from a user's Media Tweets"""
subcategory = "media"
pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
- r"/([^/?&#]+)/media(?!\w)")
+ r"/(?!search)([^/?&#]+)/media(?!\w)")
test = (
("https://twitter.com/supernaturepics/media", {
"range": "1-40",
@@ -186,6 +234,26 @@ class TwitterMediaExtractor(TwitterExtractor):
return self._tweets_from_api(url)
+class TwitterSearchExtractor(TwitterExtractor):
+ """Extractor for all images from a search timeline"""
+ subcategory = "search"
+ directory_fmt = ("{category}", "Search", "{search}")
+ pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
+ r"/search/?\?(?:[^&#]+&)*q=([^&#]+)")
+ test = ("https://twitter.com/search?q=nature", {
+ "range": "1-40",
+ "count": 40,
+ })
+
+ def metadata(self):
+ return {"search": self.user}
+
+ def tweets(self):
+ url = "{}/i/search/timeline?f=tweets&q={}".format(
+ self.root, self.user)
+ return self._tweets_from_api(url, "-1")
+
+
class TwitterTweetExtractor(TwitterExtractor):
"""Extractor for images from individual tweets"""
subcategory = "tweet"
@@ -205,17 +273,17 @@ class TwitterTweetExtractor(TwitterExtractor):
# video
("https://twitter.com/perrypumas/status/1065692031626829824", {
"options": (("videos", True),),
- "pattern": r"ytdl:https://twitter.com/perrypumas/status/\d+",
+ "pattern": r"ytdl:https://video.twimg.com/ext_tw_video/.*.m3u8",
}),
# content with emoji, newlines, hashtags (#338)
("https://twitter.com/yumi_san0112/status/1151144618936823808", {
"options": (("content", True),),
- "keyword": "b13b6c4cd0b0c15b2ea7685479e7fedde3c47b9e",
+ "keyword": "b133464b73aec33871521ab021a3166204194285",
}),
# Reply to another tweet (#403)
("https://twitter.com/tyson_hesse/status/1103767554424598528", {
- "options": (("videos", True),),
- "pattern": r"ytdl:https://twitter.com/.*/1103767554424598528$",
+ "options": (("videos", "ytdl"),),
+ "pattern": r"ytdl:https://twitter.com/.+/1103767554424598528",
}),
# /i/web/ URL
("https://twitter.com/i/web/status/1155074198240292865", {
@@ -231,9 +299,19 @@ class TwitterTweetExtractor(TwitterExtractor):
return {"user": self.user, "tweet_id": self.tweet_id}
def tweets(self):
- self.session.cookies.clear()
url = "{}/i/web/status/{}".format(self.root, self.tweet_id)
- page = self.request(url).text
+ cookies = {"app_shell_visited": "1"}
+ headers = {
+ "Referer" : url,
+ "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; "
+ "Trident/7.0; rv:11.0) like Gecko",
+ }
+
+ response = self.request(url, cookies=cookies, headers=headers)
+ if response.history and response.url == self.root + "/":
+ raise exception.AuthorizationError()
+ page = response.text
+
end = page.index('class="js-tweet-stats-container')
beg = page.rindex('<div class="tweet ', 0, end)
return (page[beg:end],)