Diffstat (limited to 'gallery_dl')
-rw-r--r--  gallery_dl/exception.py                6
-rw-r--r--  gallery_dl/extractor/blogger.py       16
-rw-r--r--  gallery_dl/extractor/common.py        44
-rw-r--r--  gallery_dl/extractor/gelbooru.py       4
-rw-r--r--  gallery_dl/extractor/hitomi.py        26
-rw-r--r--  gallery_dl/extractor/instagram.py     14
-rw-r--r--  gallery_dl/extractor/kemonoparty.py    4
-rw-r--r--  gallery_dl/extractor/mangadex.py       9
-rw-r--r--  gallery_dl/extractor/newgrounds.py     8
-rw-r--r--  gallery_dl/extractor/philomena.py      2
-rw-r--r--  gallery_dl/extractor/reddit.py        10
-rw-r--r--  gallery_dl/extractor/sexcom.py         3
-rw-r--r--  gallery_dl/extractor/twitter.py      500
-rw-r--r--  gallery_dl/option.py                   5
-rw-r--r--  gallery_dl/version.py                  4
15 files changed, 491 insertions, 164 deletions
diff --git a/gallery_dl/exception.py b/gallery_dl/exception.py
index 0433dc9..5120039 100644
--- a/gallery_dl/exception.py
+++ b/gallery_dl/exception.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2021 Mike Fährmann
+# Copyright 2015-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -33,12 +33,12 @@ class GalleryDLException(Exception):
msgfmt = None
code = 1
- def __init__(self, message=None):
+ def __init__(self, message=None, fmt=True):
if not message:
message = self.default
elif isinstance(message, Exception):
message = "{}: {}".format(message.__class__.__name__, message)
- if self.msgfmt:
+ if self.msgfmt and fmt:
message = self.msgfmt.format(message)
Exception.__init__(self, message)
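
The new `fmt` flag lets callers bypass the subclass's msgfmt template when a
message is already fully formatted. A minimal sketch of the two call styles
(NotFoundError comes from gallery_dl.exception; the message text here is
illustrative):

    from gallery_dl import exception

    # default: the message is wrapped by the subclass's msgfmt template,
    # e.g. NotFoundError("user") -> "Requested user does not exist"
    err = exception.NotFoundError("user")

    # fmt=False: a pre-formatted message passes through unchanged,
    # as _user_id_by_screen_name() in twitter.py does below
    err = exception.NotFoundError("Account suspended (Suspended)", False)
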
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index 9a86cc4..eef87f9 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2020 Mike Fährmann
+# Copyright 2019-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -41,9 +41,11 @@ class BloggerExtractor(Extractor):
blog["date"] = text.parse_datetime(blog["published"])
del blog["selfLink"]
- sub = re.compile(r"/(?:s\d+|w\d+-h\d+)/").sub
+ sub = re.compile(r"(/|=)(?:s\d+|w\d+-h\d+)(?=/|$)").sub
findall_image = re.compile(
- r'src="(https?://\d+\.bp\.blogspot\.com/[^"]+)').findall
+ r'src="(https?://(?:'
+ r'blogger\.googleusercontent\.com/img|'
+ r'\d+\.bp\.blogspot\.com)/[^"]+)').findall
findall_video = re.compile(
r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall
@@ -52,7 +54,7 @@ class BloggerExtractor(Extractor):
files = findall_image(content)
for idx, url in enumerate(files):
- files[idx] = sub("/s0/", url).replace("http:", "https:", 1)
+ files[idx] = sub(r"\1s0", url).replace("http:", "https:", 1)
if self.videos and 'id="BLOG_video-' in content:
page = self.request(post["url"]).text
@@ -137,6 +139,12 @@ class BloggerPostExtractor(BloggerExtractor):
("https://aaaninja.blogspot.com/2020/08/altera-boob-press-2.html", {
"pattern": r"https://1.bp.blogspot.com/.+/s0/altera_.+png",
}),
+ # new image domain (#2204)
+ (("https://randomthingsthroughmyletterbox.blogspot.com/2022/01"
+ "/bitter-flowers-by-gunnar-staalesen-blog.html"), {
+ "pattern": r"https://blogger.googleusercontent.com/img/a/.+=s0$",
+ "count": 8,
+ }),
)
def __init__(self, match):
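
The widened pattern covers both URL styles: on legacy `*.bp.blogspot.com`
URLs the size token is a path segment (`/s1600/`), while on the new
`blogger.googleusercontent.com/img/a/...` URLs it follows a `=` at the end.
A quick check of the rewrite to full-size `s0` (sample URLs are made up):

    import re

    sub = re.compile(r"(/|=)(?:s\d+|w\d+-h\d+)(?=/|$)").sub

    old = "https://1.bp.blogspot.com/abc/s1600/image.png"
    new = "https://blogger.googleusercontent.com/img/a/XYZ=w400-h300"

    print(sub(r"\1s0", old))  # https://1.bp.blogspot.com/abc/s0/image.png
    print(sub(r"\1s0", new))  # .../img/a/XYZ=s0
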
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index afe4a16..52e5199 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -220,6 +220,14 @@ class Extractor():
headers = session.headers
headers.clear()
+ source_address = self.config("source-address")
+ if source_address:
+ if isinstance(source_address, str):
+ source_address = (source_address, 0)
+ else:
+ source_address = (source_address[0], source_address[1])
+ session.mount("http://", SourceAdapter(source_address))
+
browser = self.config("browser") or self.browser
if browser and isinstance(browser, str):
browser, _, platform = browser.lower().partition(":")
@@ -235,10 +243,12 @@ class Extractor():
platform = "Macintosh; Intel Mac OS X 11.5"
if browser == "chrome":
- _emulate_browser_chrome(session, platform)
+ _emulate_browser_chrome(session, platform, source_address)
else:
- _emulate_browser_firefox(session, platform)
+ _emulate_browser_firefox(session, platform, source_address)
else:
+ if source_address:
+ session.mount("https://", SourceAdapter(source_address))
headers["User-Agent"] = self.config("user-agent", (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; "
"rv:91.0) Gecko/20100101 Firefox/91.0"))
@@ -605,26 +615,44 @@ class BaseExtractor(Extractor):
)
+class SourceAdapter(HTTPAdapter):
+
+ def __init__(self, source_address):
+ self.source_address = source_address
+ HTTPAdapter.__init__(self)
+
+ def init_poolmanager(self, *args, **kwargs):
+ kwargs["source_address"] = self.source_address
+ return HTTPAdapter.init_poolmanager(self, *args, **kwargs)
+
+ def proxy_manager_for(self, *args, **kwargs):
+ kwargs["source_address"] = self.source_address
+ return HTTPAdapter.proxy_manager_for(self, *args, **kwargs)
+
+
class HTTPSAdapter(HTTPAdapter):
- def __init__(self, ciphers):
+ def __init__(self, ciphers, source_address=None):
context = self.ssl_context = ssl.create_default_context()
context.options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 |
ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)
context.set_ecdh_curve("prime256v1")
context.set_ciphers(ciphers)
+ self.source_address = source_address
HTTPAdapter.__init__(self)
def init_poolmanager(self, *args, **kwargs):
kwargs["ssl_context"] = self.ssl_context
+ kwargs["source_address"] = self.source_address
return HTTPAdapter.init_poolmanager(self, *args, **kwargs)
def proxy_manager_for(self, *args, **kwargs):
kwargs["ssl_context"] = self.ssl_context
+ kwargs["source_address"] = self.source_address
return HTTPAdapter.proxy_manager_for(self, *args, **kwargs)
-def _emulate_browser_firefox(session, platform):
+def _emulate_browser_firefox(session, platform, source_address):
headers = session.headers
headers["User-Agent"] = ("Mozilla/5.0 (" + platform + "; rv:91.0) "
"Gecko/20100101 Firefox/91.0")
@@ -654,11 +682,12 @@ def _emulate_browser_firefox(session, platform):
"DHE-RSA-AES256-SHA:"
"AES128-SHA:"
"AES256-SHA:"
- "DES-CBC3-SHA"
+ "DES-CBC3-SHA",
+ source_address
))
-def _emulate_browser_chrome(session, platform):
+def _emulate_browser_chrome(session, platform, source_address):
if platform.startswith("Macintosh"):
platform = platform.replace(".", "_") + "_2"
@@ -690,7 +719,8 @@ def _emulate_browser_chrome(session, platform):
"AES256-GCM-SHA384:"
"AES128-SHA:"
"AES256-SHA:"
- "DES-CBC3-SHA"
+ "DES-CBC3-SHA",
+ source_address
))
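
SourceAdapter passes a `(host, port)` tuple down to urllib3's pool manager so
that every outgoing connection binds to a specific local address; it is
enabled through the new `source-address` config option (and the matching
`--source-address` command-line flag added in option.py below). A standalone
sketch of the same mechanism with a plain requests.Session (the IP address is
a placeholder):

    import requests
    from requests.adapters import HTTPAdapter

    class SourceAdapter(HTTPAdapter):
        def __init__(self, source_address):
            self.source_address = source_address
            HTTPAdapter.__init__(self)

        def init_poolmanager(self, *args, **kwargs):
            kwargs["source_address"] = self.source_address
            return HTTPAdapter.init_poolmanager(self, *args, **kwargs)

    session = requests.Session()
    adapter = SourceAdapter(("192.168.1.5", 0))  # port 0: any free port
    session.mount("http://", adapter)
    session.mount("https://", adapter)
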
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index a6bda52..fd26192 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2021 Mike Fährmann
+# Copyright 2014-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -59,7 +59,7 @@ class GelbooruBase():
@staticmethod
def _file_url(post):
url = post["file_url"]
- if url.startswith(("https://mp4.gelbooru.com/", "https://video-cdn")):
+ if url.endswith((".webm", ".mp4")):
md5 = post["md5"]
path = "/images/{}/{}/{}.webm".format(md5[0:2], md5[2:4], md5)
post["_fallback"] = GelbooruBase._video_fallback(path)
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index ce6c7ce..e132bf9 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -159,6 +159,7 @@ class HitomiTagExtractor(Extractor):
"""Extractor for galleries from tag searches on hitomi.la"""
category = "hitomi"
subcategory = "tag"
+ root = "https://hitomi.la"
pattern = (r"(?:https?://)?hitomi\.la/"
r"(tag|artist|group|series|type|character)/"
r"([^/?#]+)\.html")
@@ -183,12 +184,29 @@ class HitomiTagExtractor(Extractor):
self.tag = tag
def items(self):
- url = "https://ltn.hitomi.la/{}/{}.nozomi".format(self.type, self.tag)
data = {"_extractor": HitomiGalleryExtractor}
+ nozomi_url = "https://ltn.hitomi.la/{}/{}.nozomi".format(
+ self.type, self.tag)
+ headers = {
+ "Origin": self.root,
+ "Cache-Control": "max-age=0",
+ }
- for gallery_id in decode_nozomi(self.request(url).content):
- url = "https://hitomi.la/galleries/{}.html".format(gallery_id)
- yield Message.Queue, url, data
+ offset = 0
+ while True:
+ headers["Referer"] = "{}/{}/{}.html?page={}".format(
+ self.root, self.type, self.tag, offset // 100 + 1)
+ headers["Range"] = "bytes={}-{}".format(offset, offset+99)
+ nozomi = self.request(nozomi_url, headers=headers).content
+
+ for gallery_id in decode_nozomi(nozomi):
+ gallery_url = "{}/galleries/{}.html".format(
+ self.root, gallery_id)
+ yield Message.Queue, gallery_url, data
+
+ if len(nozomi) < 100:
+ return
+ offset += 100
@memcache()
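
Rather than fetching a whole `.nozomi` index up front, the tag extractor now
pages through it with HTTP Range requests, 100 bytes at a time; a response
shorter than 100 bytes marks the end of the index. A `.nozomi` file is a
plain sequence of big-endian 32-bit gallery IDs (so each 100-byte chunk holds
25 of them), and a decoder along the lines of decode_nozomi could look like
this (a sketch; the real helper lives elsewhere in hitomi.py):

    import struct

    def decode_nozomi(data):
        # unpack consecutive big-endian uint32 gallery IDs
        for (gallery_id,) in struct.iter_unpack(">I", data):
            yield gallery_id
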
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 781bf01..20a4c1a 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -748,13 +748,19 @@ class InstagramHighlightsExtractor(InstagramExtractor):
endpoint = "/v1/highlights/{}/highlights_tray/".format(user["id"])
tray = self._request_api(endpoint)["tray"]
-
reel_ids = [highlight["id"] for highlight in tray]
+
+ # Requesting more than 30 IDs at once responds with status code 400.
+ # Even 30 can work, but the API then sometimes responds with 560 or 500.
+ chunk_size = 5
endpoint = "/v1/feed/reels_media/"
- params = {"reel_ids": reel_ids}
- reels = self._request_api(endpoint, params=params)["reels"]
- return [reels[rid] for rid in reel_ids]
+ for offset in range(0, len(reel_ids), chunk_size):
+ chunk_ids = reel_ids[offset : offset+chunk_size]
+ params = {"reel_ids": chunk_ids}
+ reels = self._request_api(endpoint, params=params)["reels"]
+ for reel_id in chunk_ids:
+ yield reels[reel_id]
class InstagramReelsExtractor(InstagramExtractor):
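
The same chunking pattern applies to any endpoint that caps the number of IDs
per request; a generic sketch (names are illustrative):

    def chunked(ids, chunk_size=5):
        """Split a list of IDs into request-sized batches"""
        for offset in range(0, len(ids), chunk_size):
            yield ids[offset:offset + chunk_size]

    # e.g. 12 reel IDs -> batches of 5, 5, and 2
    for batch in chunked(list(range(12))):
        print(batch)
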
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index f1d7bcf..beb992c 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021 Mike Fährmann
+# Copyright 2021-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -180,7 +180,7 @@ class KemonopartyExtractor(Extractor):
for dm in text.extract_iter(page, "<article", "</article>"):
dms.append({
"body": text.unescape(text.extract(
- dm, '<div class="dm-card__content">', '</div>',
+ dm, '<pre>', '</pre></section>',
)[0].strip()),
"date": text.extract(dm, 'datetime="', '"')[0],
})
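
The DM markup on kemono.party changed from a `<div class="dm-card__content">`
wrapper to `<pre>` blocks, so the extraction markers move with it.
text.extract returns the text between two markers together with the end
position; a quick illustration (the sample markup is made up):

    from gallery_dl import text

    dm = "<section><pre>  hello there  </pre></section>"
    body, pos = text.extract(dm, "<pre>", "</pre></section>")
    print(body.strip())  # hello there
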
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index ea5d4a8..152da4f 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2021 Mike Fährmann
+# Copyright 2018-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -73,6 +73,7 @@ class MangadexExtractor(Extractor):
"lang" : lang,
"language": util.code_to_language(lang),
"count" : cattributes["pages"],
+ "_external_url": cattributes.get("externalUrl"),
}
data["artist"] = [artist["attributes"]["name"]
@@ -112,6 +113,12 @@ class MangadexChapterExtractor(MangadexExtractor):
chapter = self.api.chapter(self.uuid)
data = self._transform(chapter)
+ if data.get("_external_url"):
+ raise exception.StopExtraction(
+ "Chapter %s%s is not available on MangaDex and can instead be "
+ "read on the official publisher's website at %s.",
+ data["chapter"], data["chapter_minor"], data["_external_url"])
+
yield Message.Directory, data
data["_http_headers"] = self._headers
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 8bcbc20..54e2040 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2021 Mike Fährmann
+# Copyright 2018-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -529,6 +529,12 @@ class NewgroundsSearchExtractor(NewgroundsExtractor):
self.query = text.parse_query(query)
def posts(self):
+ suitabilities = self.query.get("suitabilities")
+ if suitabilities:
+ data = {"view_suitability_" + s: "on"
+ for s in suitabilities.split(",")}
+ self.request(self.root + "/suitabilities",
+ method="POST", data=data)
return self._pagination("/search/conduct/" + self._path, self.query)
def metadata(self):
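
Newgrounds keeps the allowed content ratings ("suitabilities": e, t, m, a) in
the session rather than in the search URL, so the extractor POSTs them once
before paginating. A condensed sketch of that opt-in request (field names
mirror the code above; e=Everyone, t=Teen, m=Mature, a=Adult):

    import requests

    session = requests.Session()
    data = {"view_suitability_" + s: "on" for s in "etma"}
    session.post("https://www.newgrounds.com/suitabilities", data=data)
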
diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
index 6377fb0..92b8113 100644
--- a/gallery_dl/extractor/philomena.py
+++ b/gallery_dl/extractor/philomena.py
@@ -172,7 +172,7 @@ class PhilomenaSearchExtractor(PhilomenaExtractor):
PhilomenaExtractor.__init__(self, match)
groups = match.groups()
if groups[-1]:
- q = groups[-1]
+ q = groups[-1].replace("+", " ")
for old, new in (
("-colon-" , ":"),
("-dash-" , "-"),
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 55c963d..f7809de 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2021 Mike Fährmann
+# Copyright 2017-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -19,7 +19,7 @@ class RedditExtractor(Extractor):
directory_fmt = ("{category}", "{subreddit}")
filename_fmt = "{id}{num:? //>02} {title[:220]}.{extension}"
archive_fmt = "{filename}"
- cookiedomain = None
+ cookiedomain = ".reddit.com"
def items(self):
self.api = RedditAPI(self)
@@ -301,6 +301,12 @@ class RedditAPI():
else:
self.refresh_token = token
+ if not self.refresh_token:
+ # allow downloading from quarantined subreddits (#2180)
+ extractor._cookiejar.set(
+ "_options", '%7B%22pref_quarantine_optin%22%3A%20true%7D',
+ domain=extractor.cookiedomain)
+
def submission(self, submission_id):
"""Fetch the (submission, comments)=-tuple for a submission id"""
endpoint = "/comments/" + submission_id + "/.json"
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
index 199b1ba..9f4bfc3 100644
--- a/gallery_dl/extractor/sexcom.py
+++ b/gallery_dl/extractor/sexcom.py
@@ -197,7 +197,7 @@ class SexcomSearchExtractor(SexcomExtractor):
subcategory = "search"
directory_fmt = ("{category}", "search", "{search[query]}")
pattern = (r"(?:https?://)?(?:www\.)?sex\.com/((?:"
- r"(pic|gif|video)s/([^/?#]+)|search/(pic|gif|video)s"
+ r"(pic|gif|video)s/([^/?#]*)|search/(pic|gif|video)s"
r")/?(?:\?([^#]+))?)")
test = (
("https://www.sex.com/search/pics?query=ecchi", {
@@ -208,6 +208,7 @@ class SexcomSearchExtractor(SexcomExtractor):
"range": "1-10",
"count": 10,
}),
+ ("https://www.sex.com/pics/?sort=popular&sub=all&page=1"),
)
def __init__(self, match):
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index a49f1f2..f924292 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2021 Mike Fährmann
+# Copyright 2016-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -39,7 +39,7 @@ class TwitterExtractor(Extractor):
self.pinned = self.config("pinned", False)
self.quoted = self.config("quoted", False)
self.videos = self.config("videos", True)
- self.cards = self.config("cards", False)
+ self.cards = self.config("cards", True)
self._user_cache = {}
self._init_sizes()
@@ -56,32 +56,39 @@ class TwitterExtractor(Extractor):
def items(self):
self.login()
+ self.api = TwitterAPI(self)
metadata = self.metadata()
for tweet in self.tweets():
- if not self.retweets and "retweeted_status_id_str" in tweet:
- self.log.debug("Skipping %s (retweet)", tweet["id_str"])
+ if "legacy" in tweet:
+ data = tweet["legacy"]
+ else:
+ data = tweet
+
+ if not self.retweets and "retweeted_status_id_str" in data:
+ self.log.debug("Skipping %s (retweet)", data["id_str"])
continue
- if not self.quoted and "quoted_by_id_str" in tweet:
- self.log.debug("Skipping %s (quoted tweet)", tweet["id_str"])
+ if not self.quoted and "quoted_by_id_str" in data:
+ self.log.debug("Skipping %s (quoted tweet)", data["id_str"])
continue
- if "in_reply_to_user_id_str" in tweet and (
+ if "in_reply_to_user_id_str" in data and (
not self.replies or (
self.replies == "self" and
- tweet["in_reply_to_user_id_str"] != tweet["user_id_str"]
+ data["in_reply_to_user_id_str"] != data["user_id_str"]
)
):
- self.log.debug("Skipping %s (reply)", tweet["id_str"])
+ self.log.debug("Skipping %s (reply)", data["id_str"])
continue
files = []
- if "extended_entities" in tweet:
- self._extract_media(tweet, files)
+ if "extended_entities" in data:
+ self._extract_media(
+ data, data["extended_entities"]["media"], files)
if "card" in tweet and self.cards:
self._extract_card(tweet, files)
if self.twitpic:
- self._extract_twitpic(tweet, files)
+ self._extract_twitpic(data, files)
if not files and not self.textonly:
continue
@@ -95,8 +102,8 @@ class TwitterExtractor(Extractor):
text.nameext_from_url(url, file)
yield Message.Url, url, file
- def _extract_media(self, tweet, files):
- for media in tweet["extended_entities"]["media"]:
+ def _extract_media(self, tweet, entities, files):
+ for media in entities:
width = media["original_info"].get("width", 0)
height = media["original_info"].get("height", 0)
@@ -142,8 +149,17 @@ class TwitterExtractor(Extractor):
def _extract_card(self, tweet, files):
card = tweet["card"]
- if card["name"] in ("summary", "summary_large_image"):
+ if "legacy" in card:
+ card = card["legacy"]
+ name = card["name"]
+
+ if name in ("summary", "summary_large_image"):
bvals = card["binding_values"]
+ if isinstance(bvals, list):
+ bvals = {
+ bval["key"]: bval["value"]
+ for bval in card["binding_values"]
+ }
for prefix in ("photo_image_full_size_",
"summary_photo_image_",
"thumbnail_image_"):
@@ -154,8 +170,24 @@ class TwitterExtractor(Extractor):
if value and "url" in value:
files.append(value)
return
- elif self.videos:
- url = "ytdl:{}/i/web/status/{}".format(self.root, tweet["id_str"])
+ elif name == "unified_card":
+ bvals = card["binding_values"]
+ if isinstance(bvals, list):
+ for bval in card["binding_values"]:
+ if bval["key"] == "unified_card":
+ bval = bval["value"]["string_value"]
+ break
+ else:
+ bval = bvals["unified_card"]["string_value"]
+ data = json.loads(bval)
+ if data.get("type") == "image_carousel_website":
+ self._extract_media(
+ tweet, data["media_entities"].values(), files)
+ return
+
+ if self.cards == "ytdl":
+ tweet_id = tweet.get("rest_id") or tweet["id_str"]
+ url = "ytdl:{}/i/web/status/{}".format(self.root, tweet_id)
files.append({"url": url})
def _extract_twitpic(self, tweet, files):
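
GraphQL responses deliver `binding_values` as a list of {key, value} objects,
while the older REST endpoints return a dict keyed by name; the card code now
normalizes the former into the latter. A sketch with made-up data:

    bvals = [
        {"key": "unified_card", "value": {"string_value": "{...}"}},
    ]
    if isinstance(bvals, list):
        bvals = {bval["key"]: bval["value"] for bval in bvals}
    print(bvals["unified_card"]["string_value"])  # {...}
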
@@ -171,6 +203,15 @@ class TwitterExtractor(Extractor):
files.append({"url": url})
def _transform_tweet(self, tweet):
+ if "core" in tweet:
+ user = self._transform_user(
+ tweet["core"]["user_results"]["result"])
+ else:
+ user = self._transform_user(tweet["user"])
+
+ if "legacy" in tweet:
+ tweet = tweet["legacy"]
+
entities = tweet["entities"]
tdata = {
"tweet_id" : text.parse_int(tweet["id_str"]),
@@ -182,7 +223,7 @@ class TwitterExtractor(Extractor):
tweet.get("in_reply_to_status_id_str")),
"date" : text.parse_datetime(
tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
- "user" : self._transform_user(tweet["user"]),
+ "user" : user,
"lang" : tweet["lang"],
"favorite_count": tweet["favorite_count"],
"quote_count" : tweet["quote_count"],
@@ -224,11 +265,13 @@ class TwitterExtractor(Extractor):
def _transform_user(self, user):
try:
- return self._user_cache[user["id_str"]]
+ return self._user_cache[user.get("rest_id") or user["id_str"]]
except KeyError:
pass
- uid = user["id_str"]
+ uid = user.get("rest_id") or user["id_str"]
+ if "legacy" in user:
+ user = user["legacy"]
entities = user["entities"]
self._user_cache[uid] = udata = {
@@ -340,6 +383,10 @@ class TwitterTimelineExtractor(TwitterExtractor):
"range": "1-40",
"url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
}),
+ # suspended account (#2216)
+ ("https://twitter.com/realDonaldTrump", {
+ "exception": exception.NotFoundError,
+ }),
("https://mobile.twitter.com/supernaturepics?p=i"),
("https://www.twitter.com/id:2976459548"),
("https://twitter.com/i/user/2976459548"),
@@ -353,7 +400,7 @@ class TwitterTimelineExtractor(TwitterExtractor):
self.user = "id:" + user_id
def tweets(self):
- return TwitterAPI(self).timeline_profile(self.user)
+ return self.api.user_tweets(self.user)
class TwitterRepliesExtractor(TwitterExtractor):
@@ -370,7 +417,7 @@ class TwitterRepliesExtractor(TwitterExtractor):
)
def tweets(self):
- return TwitterAPI(self).timeline_profile(self.user, replies=True)
+ return self.api.user_tweets_and_replies(self.user)
class TwitterMediaExtractor(TwitterExtractor):
@@ -387,7 +434,7 @@ class TwitterMediaExtractor(TwitterExtractor):
)
def tweets(self):
- return TwitterAPI(self).timeline_media(self.user)
+ return self.api.user_media(self.user)
class TwitterLikesExtractor(TwitterExtractor):
@@ -400,7 +447,7 @@ class TwitterLikesExtractor(TwitterExtractor):
return {"user_likes": self.user}
def tweets(self):
- return TwitterAPI(self).timeline_favorites(self.user)
+ return self.api.user_likes(self.user)
class TwitterBookmarkExtractor(TwitterExtractor):
@@ -410,7 +457,7 @@ class TwitterBookmarkExtractor(TwitterExtractor):
test = ("https://twitter.com/i/bookmarks",)
def tweets(self):
- return TwitterAPI(self).timeline_bookmark()
+ return self.api.user_bookmarks()
class TwitterListExtractor(TwitterExtractor):
@@ -424,7 +471,7 @@ class TwitterListExtractor(TwitterExtractor):
})
def tweets(self):
- return TwitterAPI(self).timeline_list(self.user)
+ return self.api.list_latest_tweets_timeline(self.user)
class TwitterListMembersExtractor(TwitterExtractor):
@@ -453,7 +500,7 @@ class TwitterFollowingExtractor(TwitterExtractor):
class TwitterSearchExtractor(TwitterExtractor):
- """Extractor for all images from a search timeline"""
+ """Extractor for Twitter search results"""
subcategory = "search"
pattern = BASE_PATTERN + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)"
test = ("https://twitter.com/search?q=nature", {
@@ -466,7 +513,25 @@ class TwitterSearchExtractor(TwitterExtractor):
return {"search": text.unquote(self.user)}
def tweets(self):
- return TwitterAPI(self).search(text.unquote(self.user))
+ return self.api.search_adaptive(text.unquote(self.user))
+
+
+class TwitterEventExtractor(TwitterExtractor):
+ """Extractor for Tweets from a Twitter Event"""
+ subcategory = "event"
+ directory_fmt = ("{category}", "Events",
+ "{event[id]} {event[short_title]}")
+ pattern = BASE_PATTERN + r"/i/events/(\d+)"
+ test = ("https://twitter.com/i/events/1484669206993903616", {
+ "range": "1-20",
+ "count": ">5",
+ })
+
+ def metadata(self):
+ return {"event": self.api.live_event(self.user)}
+
+ def tweets(self):
+ return self.api.live_event_timeline(self.user)
class TwitterTweetExtractor(TwitterExtractor):
@@ -531,7 +596,7 @@ class TwitterTweetExtractor(TwitterExtractor):
}),
# TwitPic embeds (#579)
("https://twitter.com/i/web/status/112900228289540096", {
- "options": (("twitpic", True),),
+ "options": (("twitpic", True), ("cards", False)),
"pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg",
"count": 3,
}),
@@ -545,6 +610,16 @@ class TwitterTweetExtractor(TwitterExtractor):
"options": (("cards", True),),
"pattern": r"https://pbs.twimg.com/card_img/\d+/",
}),
+ # unified_card with image_carousel_website
+ ("https://twitter.com/doax_vv_staff/status/1479438945662685184", {
+ "options": (("cards", True),),
+ "pattern": r"https://pbs\.twimg\.com/media/F.+=png",
+ "count": 6,
+ }),
+ # unified_card without type
+ ("https://twitter.com/i/web/status/1466183847628865544", {
+ "count": 0,
+ }),
# original retweets (#1026)
("https://twitter.com/jessica_3978/status/1296304589591810048", {
"options": (("retweets", "original"),),
@@ -565,6 +640,10 @@ class TwitterTweetExtractor(TwitterExtractor):
"options": (("retweets", True),),
"count": 4,
}),
+ # deleted quote tweet (#2225)
+ ("https://twitter.com/i/web/status/1460044411165888515", {
+ "count": 0,
+ }),
)
def __init__(self, match):
@@ -573,8 +652,19 @@ class TwitterTweetExtractor(TwitterExtractor):
def tweets(self):
if self.config("conversations", False):
- return TwitterAPI(self).conversation(self.tweet_id)
- return TwitterAPI(self).tweet(self.tweet_id)
+ return self.api.tweet_detail(self.tweet_id)
+
+ tweets = []
+ tweet_id = self.tweet_id
+ for tweet in self.api.tweet_detail(tweet_id):
+ if tweet["rest_id"] == tweet_id or \
+ tweet.get("_retweet_id_str") == tweet_id:
+ tweets.append(tweet)
+
+ tweet_id = tweet["legacy"].get("quoted_status_id_str")
+ if not tweet_id:
+ break
+ return tweets
class TwitterImageExtractor(Extractor):
@@ -634,6 +724,7 @@ class TwitterAPI():
"include_mute_edge": "1",
"include_can_dm": "1",
"include_can_media_tag": "1",
+ "include_ext_has_nft_avatar": "1",
"skip_status": "1",
"cards_platform": "Web-12",
"include_cards": "1",
@@ -645,12 +736,30 @@ class TwitterAPI():
"include_user_entities": "true",
"include_ext_media_color": "true",
"include_ext_media_availability": "true",
+ "include_ext_sensitive_media_warning": "true",
"send_error_codes": "true",
"simple_quoted_tweet": "true",
"count": "100",
"cursor": None,
- "ext": "mediaStats,highlightedLabel",
+ "ext": "mediaStats,highlightedLabel,hasNftAvatar,"
+ "voiceInfo,superFollowMetadata",
+ }
+ self.variables = {
+ "includePromotedContent": False,
+ "withSuperFollowsUserFields": True,
+ "withBirdwatchPivots": False,
+ "withDownvotePerspective": False,
+ "withReactionsMetadata": False,
+ "withReactionsPerspective": False,
+ "withSuperFollowsTweetFields": True,
+ "withClientEventToken": False,
+ "withBirdwatchNotes": False,
+ "withVoice": True,
+ "withV2Timeline": False,
+ "__fs_interactive_text": False,
+ "__fs_dont_mention_me_view_api_enabled": False,
}
+ self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
cookies = extractor.session.cookies
cookiedomain = extractor.cookiedomain
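
GraphQL endpoints receive their arguments as a JSON-encoded `variables` query
parameter, so a JSONEncoder with compact separators is bound once and reused;
it drops the spaces json.dumps inserts by default and keeps request URLs
shorter:

    import json

    _json_dumps = json.JSONEncoder(separators=(",", ":")).encode

    print(json.dumps({"listId": "123", "count": 100}))
    # {"listId": "123", "count": 100}
    print(_json_dumps({"listId": "123", "count": 100}))
    # {"listId":"123","count":100}
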
@@ -671,54 +780,70 @@ class TwitterAPI():
cookies.set("gt", guest_token, domain=cookiedomain)
self.headers["x-guest-token"] = guest_token
- def tweet(self, tweet_id):
- endpoint = "/2/timeline/conversation/{}.json".format(tweet_id)
- tweets = []
- for tweet in self._pagination(endpoint):
- if tweet["id_str"] == tweet_id or \
- tweet.get("_retweet_id_str") == tweet_id:
- tweets.append(tweet)
- if "quoted_status_id_str" in tweet:
- tweet_id = tweet["quoted_status_id_str"]
- else:
- break
- return tweets
+ def tweet_detail(self, tweet_id):
+ endpoint = "/graphql/aD0-HB47XIOxiBl5kTkX5Q/TweetDetail"
+ variables = {
+ "focalTweetId": tweet_id,
+ "with_rux_injections": False,
+ "withCommunity": True,
+ "withQuickPromoteEligibilityTweetFields": True,
+ "withBirdwatchNotes": False,
+ }
+ return self._pagination_tweets(
+ endpoint, variables, ("threaded_conversation_with_injections",))
- def conversation(self, conversation_id):
- endpoint = "/2/timeline/conversation/{}.json".format(conversation_id)
- return self._pagination(endpoint)
+ def user_tweets(self, screen_name):
+ endpoint = "/graphql/LNhjy8t3XpIrBYM-ms7sPQ/UserTweets"
+ variables = {
+ "userId": self._user_id_by_screen_name(screen_name),
+ "count": 100,
+ "withQuickPromoteEligibilityTweetFields": True,
+ }
+ return self._pagination_tweets(endpoint, variables)
- def timeline_profile(self, screen_name, replies=False):
- user_id = self._user_id_by_screen_name(screen_name)
- endpoint = "/2/timeline/profile/{}.json".format(user_id)
- params = self.params.copy()
- params["include_tweet_replies"] = "true" if replies else "false"
- return self._pagination(endpoint, params)
+ def user_tweets_and_replies(self, screen_name):
+ endpoint = "/graphql/Vg5aF036K40ST3FWvnvRGA/UserTweetsAndReplies"
+ variables = {
+ "userId": self._user_id_by_screen_name(screen_name),
+ "count": 100,
+ "withCommunity": True,
+ }
+ return self._pagination_tweets(endpoint, variables)
- def timeline_media(self, screen_name):
- user_id = self._user_id_by_screen_name(screen_name)
- endpoint = "/2/timeline/media/{}.json".format(user_id)
- return self._pagination(endpoint)
+ def user_media(self, screen_name):
+ endpoint = "/graphql/Hl6C7ac051l_QBe3HjGz_A/UserMedia"
+ variables = {
+ "userId": self._user_id_by_screen_name(screen_name),
+ "count": 100,
+ }
+ return self._pagination_tweets(endpoint, variables)
- def timeline_favorites(self, screen_name):
- user_id = self._user_id_by_screen_name(screen_name)
- endpoint = "/2/timeline/favorites/{}.json".format(user_id)
- params = self.params.copy()
- params["sorted_by_time"] = "true"
- return self._pagination(endpoint)
+ def user_likes(self, screen_name):
+ endpoint = "/graphql/smISlRVSnz-GaU_XpU_akw/Likes"
+ variables = {
+ "userId": self._user_id_by_screen_name(screen_name),
+ "count": 100,
+ }
+ return self._pagination_tweets(endpoint, variables)
- def timeline_bookmark(self):
- endpoint = "/2/timeline/bookmark.json"
- return self._pagination(endpoint)
+ def user_bookmarks(self):
+ endpoint = "/graphql/yKNebSjZKbo2tOd-Qdc7Xg/Bookmarks"
+ variables = {
+ "count": 100,
+ }
+ return self._pagination_tweets(
+ endpoint, variables, ("bookmark_timeline", "timeline"))
- def timeline_list(self, list_id):
- endpoint = "/2/timeline/list.json"
- params = self.params.copy()
- params["list_id"] = list_id
- params["ranking_mode"] = "reverse_chronological"
- return self._pagination(endpoint, params)
+ def list_latest_tweets_timeline(self, list_id):
+ endpoint = "/graphql/RxUL5UHi4Msxt_P9O1729w/ListLatestTweetsTimeline"
+ variables = {
+ "listId": list_id,
+ "count": 100,
+ }
+ return self._pagination_tweets(
+ endpoint, variables, ("list", "tweets_timeline", "timeline"))
- def search(self, query):
+ def search_adaptive(self, query):
endpoint = "/2/search/adaptive.json"
params = self.params.copy()
params["q"] = query
@@ -726,55 +851,77 @@ class TwitterAPI():
params["query_source"] = "typed_query"
params["pc"] = "1"
params["spelling_corrections"] = "1"
- return self._pagination(endpoint, params)
+ return self._pagination_legacy(endpoint, params)
+
+ def live_event_timeline(self, event_id):
+ endpoint = "/2/live_event/timeline/{}.json".format(event_id)
+ params = self.params.copy()
+ params["timeline_id"] = "recap"
+ params["urt"] = "true"
+ params["get_annotations"] = "true"
+ return self._pagination_legacy(endpoint, params)
+
+ def live_event(self, event_id):
+ endpoint = "/1.1/live_event/1/{}/timeline.json".format(event_id)
+ params = self.params.copy()
+ params["count"] = "0"
+ params["urt"] = "true"
+ return (self._call(endpoint, params)
+ ["twitter_objects"]["live_events"][event_id])
def list_by_rest_id(self, list_id):
- endpoint = "/graphql/18MAHTcDU-TdJSjWWmoH7w/ListByRestId"
- params = {"variables": '{"listId":"' + list_id + '"'
- ',"withUserResult":false}'}
+ endpoint = "/graphql/BWEhzAk7k8TwbU4lKH2dpw/ListByRestId"
+ params = {"variables": self._json_dumps({
+ "listId": list_id,
+ "withSuperFollowsUserFields": True,
+ })}
try:
return self._call(endpoint, params)["data"]["list"]
except KeyError:
raise exception.NotFoundError("list")
def list_members(self, list_id):
- endpoint = "/graphql/tA7h9hy4U0Yc9COfIOh3qQ/ListMembers"
+ endpoint = "/graphql/kk9RQtSa2sc-4_9figZVBw/ListMembers"
variables = {
"listId": list_id,
- "count" : 100,
- "withTweetResult": False,
- "withUserResult" : False,
+ "count": 100,
+ "withSafetyModeUserFields": True,
}
- return self._pagination_graphql(
- endpoint, variables, "list", "members_timeline")
+ return self._pagination_users(
+ endpoint, variables, ("list", "members_timeline", "timeline"))
def user_following(self, screen_name):
- endpoint = "/graphql/Q_QTiPvoXwsA13eoA7okIQ/Following"
+ endpoint = "/graphql/kz464_e4MAOXc3bGOA9kow/Following"
variables = {
"userId": self._user_id_by_screen_name(screen_name),
- "count" : 100,
- "withTweetResult": False,
- "withUserResult" : False,
- "withTweetQuoteCount" : False,
- "withHighlightedLabel" : False,
- "includePromotedContent": False,
+ "count": 100,
}
- return self._pagination_graphql(
- endpoint, variables, "user", "following_timeline")
+ return self._pagination_users(endpoint, variables)
def user_by_screen_name(self, screen_name):
- endpoint = "/graphql/hc-pka9A7gyS3xODIafnrQ/UserByScreenName"
- params = {"variables": '{"screen_name":"' + screen_name + '"'
- ',"withHighlightedLabel":true}'}
- try:
- return self._call(endpoint, params)["data"]["user"]
- except KeyError:
- raise exception.NotFoundError("user")
+ endpoint = "/graphql/7mjxD3-C6BxitPMVQ6w0-Q/UserByScreenName"
+ params = {"variables": self._json_dumps({
+ "screen_name": screen_name,
+ "withSafetyModeUserFields": True,
+ "withSuperFollowsUserFields": True,
+ })}
+ return self._call(endpoint, params)["data"]["user"]["result"]
def _user_id_by_screen_name(self, screen_name):
if screen_name.startswith("id:"):
return screen_name[3:]
- return self.user_by_screen_name(screen_name)["rest_id"]
+
+ user = ()
+ try:
+ user = self.user_by_screen_name(screen_name)
+ return user["rest_id"]
+ except KeyError:
+ if "unavailable_message" in user:
+ raise exception.NotFoundError("{} ({})".format(
+ user["unavailable_message"].get("text"),
+ user.get("reason")), False)
+ else:
+ raise exception.NotFoundError("user")
@cache(maxage=3600)
def _guest_token(self):
@@ -782,7 +929,7 @@ class TwitterAPI():
endpoint = "/1.1/guest/activate.json"
return str(self._call(endpoint, None, root, "POST")["guest_token"])
- def _call(self, endpoint, params, root=None, method="GET"):
+ def _call(self, endpoint, params, root=None, method="GET", warning=True):
if root is None:
root = self.root
@@ -799,24 +946,16 @@ class TwitterAPI():
data = response.json()
if "errors" in data:
try:
- errors, warnings = [], []
- for error in data["errors"]:
- if error.get("kind") == "NonFatal":
- warnings.append(error["message"])
- else:
- errors.append(error["message"])
- errors = ", ".join(errors)
+ errors = ", ".join(e["message"] for e in data["errors"])
except Exception:
errors = data["errors"]
- if warnings:
- self.extractor.log.warning(", ".join(warnings))
- if errors and response.status_code < 400:
- raise exception.StopExtraction(errors)
else:
errors = ""
if response.status_code < 400:
# success
+ if errors and warning:
+ self.extractor.log.warning(errors)
return data
if response.status_code == 429:
@@ -846,11 +985,8 @@ class TwitterAPI():
raise exception.StopExtraction(
"%s %s (%s)", response.status_code, response.reason, errors)
- def _pagination(self, endpoint, params=None):
- if params is None:
- params = self.params.copy()
+ def _pagination_legacy(self, endpoint, params):
original_retweets = (self.extractor.retweets == "original")
- pinned_tweet = self.extractor.pinned
while True:
cursor = tweet = None
@@ -863,12 +999,6 @@ class TwitterAPI():
tweets = data["globalObjects"]["tweets"]
users = data["globalObjects"]["users"]
- if pinned_tweet:
- if "pinEntry" in instr[-1]:
- tweet_ids.append(instr[-1]["pinEntry"]["entry"]["content"]
- ["item"]["content"]["tweet"]["id"])
- pinned_tweet = False
-
# collect tweet IDs and cursor value
for entry in instr[0]["addEntries"]["entries"]:
entry_startswith = entry["entryId"].startswith
@@ -884,7 +1014,7 @@ class TwitterAPI():
elif entry_startswith(("cursor-bottom-", "sq-cursor-bottom")):
cursor = entry["content"]["operation"]["cursor"]
- if not cursor.get("stopOnEmptyResponse"):
+ if not cursor.get("stopOnEmptyResponse", True):
# keep going even if there are no tweets
tweet = True
cursor = cursor["value"]
@@ -939,23 +1069,133 @@ class TwitterAPI():
return
params["cursor"] = cursor
- def _pagination_graphql(self, endpoint, variables, key, timeline):
+ def _pagination_tweets(self, endpoint, variables, path=None):
+ variables.update(self.variables)
+ original_retweets = (self.extractor.retweets == "original")
+ pinned_tweet = self.extractor.pinned
+
+ while True:
+ params = {"variables": self._json_dumps(variables)}
+ data = self._call(endpoint, params)["data"]
+
+ try:
+ if path is None:
+ instructions = (data["user"]["result"]["timeline"]
+ ["timeline"]["instructions"])
+ else:
+ for key in path:
+ data = data[key]
+ instructions = data["instructions"]
+
+ entries = instructions[0]["entries"]
+ except (KeyError, IndexError):
+ return
+
+ tweets = []
+ tweet = cursor = None
+
+ if pinned_tweet:
+ pinned_tweet = False
+ if instructions[-1]["type"] == "TimelinePinEntry":
+ tweets.append(instructions[-1]["entry"])
+
+ for entry in entries:
+ esw = entry["entryId"].startswith
+
+ if esw("tweet-"):
+ tweets.append(entry)
+ elif esw("homeConversation-"):
+ tweets.extend(entry["content"]["items"])
+ elif esw("conversationthread-"):
+ tweets.extend(entry["content"]["items"])
+ elif esw("cursor-bottom-"):
+ cursor = entry["content"]
+ if not cursor.get("stopOnEmptyResponse", True):
+ # keep going even if there are no tweets
+ tweet = True
+ cursor = cursor.get("value")
+
+ for entry in tweets:
+ try:
+ tweet = ((entry.get("content") or entry["item"])
+ ["itemContent"]["tweet_results"]["result"])
+ legacy = tweet["legacy"]
+ except KeyError:
+ self.extractor.log.debug(
+ "Skipping %s (deleted)",
+ (entry.get("entryId") or "").rpartition("-")[2])
+ continue
+
+ if "retweeted_status_result" in legacy:
+ retweet = legacy["retweeted_status_result"]["result"]
+ if original_retweets:
+ try:
+ retweet["legacy"]["retweeted_status_id_str"] = \
+ retweet["rest_id"]
+ retweet["_retweet_id_str"] = tweet["rest_id"]
+ tweet = retweet
+ except KeyError:
+ continue
+ else:
+ try:
+ legacy["retweeted_status_id_str"] = \
+ retweet["rest_id"]
+ legacy["author"] = \
+ retweet["core"]["user_results"]["result"]
+ if "extended_entities" in retweet["legacy"] and \
+ "extended_entities" not in legacy:
+ legacy["extended_entities"] = \
+ retweet["legacy"]["extended_entities"]
+ except KeyError:
+ pass
+
+ yield tweet
+
+ if "quoted_status_result" in tweet:
+ try:
+ quoted = tweet["quoted_status_result"]["result"]
+ quoted["legacy"]["author"] = \
+ quoted["core"]["user_results"]["result"]
+ quoted["core"] = tweet["core"]
+ quoted["legacy"]["quoted_by_id_str"] = tweet["rest_id"]
+ yield quoted
+ except KeyError:
+ self.extractor.log.debug(
+ "Skipping quote of %s (deleted)",
+ tweet.get("rest_id"))
+ continue
+
+ if not tweet or not cursor:
+ return
+ variables["cursor"] = cursor
+
+ def _pagination_users(self, endpoint, variables, path=None):
+ variables.update(self.variables)
+
while True:
cursor = entry = stop = None
- params = {"variables": json.dumps(variables)}
- data = self._call(endpoint, params)
+ params = {"variables": self._json_dumps(variables)}
+ data = self._call(endpoint, params)["data"]
try:
- instructions = \
- data["data"][key][timeline]["timeline"]["instructions"]
+ if path is None:
+ instructions = (data["user"]["result"]["timeline"]
+ ["timeline"]["instructions"])
+ else:
+ for key in path:
+ data = data[key]
+ instructions = data["instructions"]
except KeyError:
- raise exception.AuthorizationError()
+ return
for instr in instructions:
if instr["type"] == "TimelineAddEntries":
for entry in instr["entries"]:
if entry["entryId"].startswith("user-"):
- yield entry["content"]["itemContent"]["user"]
+ user = (entry["content"]["itemContent"]
+ ["user_results"]["result"])
+ if "rest_id" in user:
+ yield user
elif entry["entryId"].startswith("cursor-bottom-"):
cursor = entry["content"]["value"]
elif instr["type"] == "TimelineTerminateTimeline":
diff --git a/gallery_dl/option.py b/gallery_dl/option.py
index cdfe9a1..e1ada09 100644
--- a/gallery_dl/option.py
+++ b/gallery_dl/option.py
@@ -124,6 +124,11 @@ def build_parser():
help="Use the specified proxy",
)
general.add_argument(
+ "--source-address",
+ dest="source-address", metavar="IP", action=ConfigAction,
+ help="Client-side IP address to bind to",
+ )
+ general.add_argument(
"--clear-cache",
dest="clear_cache", metavar="MODULE",
help="Delete cached login sessions, cookies, etc. for MODULE "
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 8fa7c22..1a399fa 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -1,9 +1,9 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2021 Mike Fährmann
+# Copyright 2016-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.20.1"
+__version__ = "1.20.3"