Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/4archive.py     | 111
-rw-r--r--  gallery_dl/extractor/__init__.py     |   1
-rw-r--r--  gallery_dl/extractor/behance.py      |   3
-rw-r--r--  gallery_dl/extractor/common.py       |   2
-rw-r--r--  gallery_dl/extractor/exhentai.py     |  83
-rw-r--r--  gallery_dl/extractor/fantia.py       |   2
-rw-r--r--  gallery_dl/extractor/hitomi.py       |   2
-rw-r--r--  gallery_dl/extractor/idolcomplex.py  |   1
-rw-r--r--  gallery_dl/extractor/instagram.py    |   8
-rw-r--r--  gallery_dl/extractor/kemonoparty.py  |  33
-rw-r--r--  gallery_dl/extractor/misskey.py      |   4
-rw-r--r--  gallery_dl/extractor/nijie.py        |   1
-rw-r--r--  gallery_dl/extractor/patreon.py      |  69
-rw-r--r--  gallery_dl/extractor/pixiv.py        |   4
-rw-r--r--  gallery_dl/extractor/reddit.py       |  23
-rw-r--r--  gallery_dl/extractor/sankaku.py      |   2
-rw-r--r--  gallery_dl/extractor/twitter.py      |  62
-rw-r--r--  gallery_dl/extractor/weibo.py        |   6
18 files changed, 329 insertions, 88 deletions
diff --git a/gallery_dl/extractor/4archive.py b/gallery_dl/extractor/4archive.py
new file mode 100644
index 0000000..d198369
--- /dev/null
+++ b/gallery_dl/extractor/4archive.py
@@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://4archive.org/"""
+
+from .common import Extractor, Message
+from .. import text, util
+
+
+class _4archiveThreadExtractor(Extractor):
+ """Extractor for 4archive threads"""
+ category = "4archive"
+ subcategory = "thread"
+ directory_fmt = ("{category}", "{board}", "{thread} {title}")
+ filename_fmt = "{no} {filename}.{extension}"
+ archive_fmt = "{board}_{thread}_{no}"
+ root = "https://4archive.org"
+ referer = False
+ pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)/thread/(\d+)"
+ example = "https://4archive.org/board/a/thread/12345/"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.board, self.thread = match.groups()
+
+ def items(self):
+ url = "{}/board/{}/thread/{}".format(
+ self.root, self.board, self.thread)
+ page = self.request(url).text
+ data = self.metadata(page)
+ posts = self.posts(page)
+
+ if not data["title"]:
+ data["title"] = posts[0]["com"][:50]
+
+ for post in posts:
+ post.update(data)
+ post["time"] = int(util.datetime_to_timestamp(post["date"]))
+ yield Message.Directory, post
+ if "url" in post:
+ yield Message.Url, post["url"], text.nameext_from_url(
+ post["filename"], post)
+
+ def metadata(self, page):
+ return {
+ "board" : self.board,
+ "thread": text.parse_int(self.thread),
+ "title" : text.unescape(text.extr(
+ page, 'class="subject">', "</span>"))
+ }
+
+ def posts(self, page):
+ return [
+ self.parse(post)
+ for post in page.split('class="postContainer')[1:]
+ ]
+
+ @staticmethod
+ def parse(post):
+ extr = text.extract_from(post)
+ data = {
+ "name": extr('class="name">', "</span>"),
+ "date": text.parse_datetime(
+ extr('class="dateTime postNum" >', "<").strip(),
+ "%Y-%m-%d %H:%M:%S"),
+ "no" : text.parse_int(extr('href="#p', '"')),
+ }
+ if 'class="file"' in post:
+ extr('class="fileText"', ">File: <a")
+ data.update({
+ "url" : extr('href="', '"'),
+ "filename": extr(
+ 'rel="noreferrer noopener"', "</a>").strip()[1:],
+ "size" : text.parse_bytes(extr(" (", ", ")[:-1]),
+ "width" : text.parse_int(extr("", "x")),
+ "height" : text.parse_int(extr("", "px")),
+ })
+ extr("<blockquote ", "")
+ data["com"] = text.unescape(text.remove_html(
+ extr(">", "</blockquote>")))
+ return data
+
+
+class _4archiveBoardExtractor(Extractor):
+ """Extractor for 4archive boards"""
+ category = "4archive"
+ subcategory = "board"
+ root = "https://4archive.org"
+ pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)(?:/(\d+))?/?$"
+ example = "https://4archive.org/board/a/"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.board = match.group(1)
+ self.num = text.parse_int(match.group(2), 1)
+
+ def items(self):
+ data = {"_extractor": _4archiveThreadExtractor}
+ while True:
+ url = "{}/board/{}/{}".format(self.root, self.board, self.num)
+ page = self.request(url).text
+ if 'class="thread"' not in page:
+ return
+ for thread in text.extract_iter(page, 'class="thread" id="t', '"'):
+ url = "{}/board/{}/thread/{}".format(
+ self.root, self.board, thread)
+ yield Message.Queue, url, data
+ self.num += 1
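A note on the parsing approach in parse() above: text.extract_from() wraps a string in a closure that advances an internal offset on every call, so the markers must be queried in document order. A minimal sketch against a made-up post snippet (the HTML here is hypothetical, not 4archive's real markup):

from gallery_dl import text

post = ('<span class="name">Anonymous</span>'
        '<span class="dateTime postNum" >2023-01-01 12:00:00 </span>'
        '<a href="#p123">No.</a>')
extr = text.extract_from(post)
print(extr('class="name">', "</span>"))         # -> 'Anonymous'
print(extr('class="dateTime postNum" >', "<"))  # -> '2023-01-01 12:00:00 '
print(extr('href="#p', '"'))                    # -> '123'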
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 1c1473a..22e4fe3 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -15,6 +15,7 @@ modules = [
"35photo",
"3dbooru",
"4chan",
+ "4archive",
"4chanarchives",
"500px",
"8chan",
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
index fc5f9ef..a92918e 100644
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@@ -170,7 +170,8 @@ class BehanceGalleryExtractor(BehanceExtractor):
elif mtype == "EmbedModule":
embed = module.get("originalEmbed") or module.get("fluidEmbed")
if embed:
- append(("ytdl:" + text.extr(embed, 'src="', '"'), module))
+ embed = text.unescape(text.extr(embed, 'src="', '"'))
+ append(("ytdl:" + embed, module))
return result
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 0d67df7..3bec424 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -35,6 +35,7 @@ class Extractor():
root = ""
cookies_domain = ""
referer = True
+ ciphers = None
tls12 = True
browser = None
request_interval = 0.0
@@ -305,6 +306,7 @@ class Extractor():
headers["User-Agent"] = useragent
headers["Accept"] = "*/*"
headers["Accept-Language"] = "en-US,en;q=0.5"
+ ssl_ciphers = self.ciphers
if BROTLI:
headers["Accept-Encoding"] = "gzip, deflate, br"
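The new `ciphers` class attribute lets a subclass hand a custom OpenSSL cipher string to the session's SSL context; exhentai below sets `ciphers = "DEFAULT:!DH"`, presumably to sidestep handshake failures such as OpenSSL's DH_KEY_TOO_SMALL rejection. A rough sketch of what such a string does at the ssl level, independent of gallery-dl's plumbing:

import ssl

ctx = ssl.create_default_context()
ctx.set_ciphers("DEFAULT:!DH")  # OpenSSL's default list minus Diffie-Hellman suites
print([c["name"] for c in ctx.get_ciphers()][:5])  # remaining suites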
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 44bfe7d..182910c 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -27,6 +27,7 @@ class ExhentaiExtractor(Extractor):
cookies_names = ("ipb_member_id", "ipb_pass_hash")
root = "https://exhentai.org"
request_interval = 5.0
+ ciphers = "DEFAULT:!DH"
LIMIT = False
@@ -112,12 +113,15 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
def __init__(self, match):
ExhentaiExtractor.__init__(self, match)
- self.key = {}
- self.count = 0
self.gallery_id = text.parse_int(match.group(2) or match.group(5))
self.gallery_token = match.group(3)
self.image_token = match.group(4)
self.image_num = text.parse_int(match.group(6), 1)
+ self.key_start = None
+ self.key_show = None
+ self.key_next = None
+ self.api_url = ""
+ self.count = 0
def _init(self):
source = self.config("source")
@@ -145,17 +149,17 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
gpage = self._gallery_page()
self.image_token = text.extr(gpage, 'hentai.org/s/', '"')
if not self.image_token:
- self.log.error("Failed to extract initial image token")
self.log.debug("Page content:\n%s", gpage)
- return
+ raise exception.StopExtraction(
+ "Failed to extract initial image token")
ipage = self._image_page()
else:
ipage = self._image_page()
part = text.extr(ipage, 'hentai.org/g/', '"')
if not part:
- self.log.error("Failed to extract gallery token")
self.log.debug("Page content:\n%s", ipage)
- return
+ raise exception.StopExtraction(
+ "Failed to extract gallery token")
self.gallery_token = part.split("/")[1]
gpage = self._gallery_page()
@@ -176,7 +180,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
data.update(image)
if self.limits:
self._check_limits(data)
- if "/fullimg.php" in url:
+ if "/fullimg" in url:
data["_http_validate"] = _validate_response
else:
data["_http_validate"] = None
@@ -208,6 +212,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
def metadata_from_page(self, page):
extr = text.extract_from(page)
+ self.api_url = extr('var api_url = "', '"') or (self.root + "/api.php")
+
data = {
"gid" : self.gallery_id,
"token" : self.gallery_token,
@@ -225,7 +231,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
'>Visible:</td><td class="gdt2">', '<'),
"language" : extr('>Language:</td><td class="gdt2">', ' '),
"filesize" : text.parse_bytes(extr(
- '>File Size:</td><td class="gdt2">', '<').rstrip("Bb")),
+ '>File Size:</td><td class="gdt2">', '<').rstrip("Bbi")),
"filecount" : extr('>Length:</td><td class="gdt2">', ' '),
"favorites" : extr('id="favcount">', ' '),
"rating" : extr(">Average: ", "<"),
@@ -251,14 +257,13 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
return data
def metadata_from_api(self):
- url = self.root + "/api.php"
data = {
- "method": "gdata",
- "gidlist": ((self.gallery_id, self.gallery_token),),
+ "method" : "gdata",
+ "gidlist" : ((self.gallery_id, self.gallery_token),),
"namespace": 1,
}
- data = self.request(url, method="POST", json=data).json()
+ data = self.request(self.api_url, method="POST", json=data).json()
if "error" in data:
raise exception.StopExtraction(data["error"])
@@ -269,54 +274,70 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
pos = page.index('<div id="i3"><a onclick="return load_image(') + 26
extr = text.extract_from(page, pos)
- self.key["next"] = extr("'", "'")
+ self.key_next = extr("'", "'")
iurl = extr('<img id="img" src="', '"')
- orig = extr('hentai.org/fullimg.php', '"')
+ nl = extr(" nl(", ")").strip("\"'")
+ orig = extr('hentai.org/fullimg', '"')
try:
if self.original and orig:
- url = self.root + "/fullimg.php" + text.unescape(orig)
+ url = self.root + "/fullimg" + text.unescape(orig)
data = self._parse_original_info(extr('ownload original', '<'))
+ data["_fallback"] = ("{}?nl={}".format(url, nl),)
else:
url = iurl
data = self._parse_image_info(url)
+ data["_fallback"] = self._fallback(
+ None, self.image_num, nl)
except IndexError:
self.log.debug("Page content:\n%s", page)
raise exception.StopExtraction(
"Unable to parse image info for '%s'", url)
data["num"] = self.image_num
- data["image_token"] = self.key["start"] = extr('var startkey="', '";')
- self.key["show"] = extr('var showkey="', '";')
+ data["image_token"] = self.key_start = extr('var startkey="', '";')
+ self.key_show = extr('var showkey="', '";')
self._check_509(iurl, data)
- return url, text.nameext_from_url(iurl, data)
+ return url, text.nameext_from_url(url, data)
def images_from_api(self):
"""Get image url and data from api calls"""
- api_url = self.root + "/api.php"
- nextkey = self.key["next"]
+ api_url = self.api_url
+ nextkey = self.key_next
request = {
"method" : "showpage",
"gid" : self.gallery_id,
+ "page" : 0,
"imgkey" : nextkey,
- "showkey": self.key["show"],
+ "showkey": self.key_show,
}
+
for request["page"] in range(self.image_num + 1, self.count + 1):
page = self.request(api_url, method="POST", json=request).json()
+
+ i3 = page["i3"]
+ i6 = page["i6"]
+
imgkey = nextkey
- nextkey, pos = text.extract(page["i3"], "'", "'")
- imgurl , pos = text.extract(page["i3"], 'id="img" src="', '"', pos)
- origurl, pos = text.extract(page["i7"], '<a href="', '"')
+ nextkey, pos = text.extract(i3, "'", "'")
+ imgurl , pos = text.extract(i3, 'id="img" src="', '"', pos)
+ nl , pos = text.extract(i3, " nl(", ")", pos)
+ nl = (nl or "").strip("\"'")
try:
- if self.original and origurl:
+ pos = i6.find("hentai.org/fullimg")
+ if self.original and pos >= 0:
+ origurl, pos = text.rextract(i6, '"', '"', pos)
url = text.unescape(origurl)
data = self._parse_original_info(text.extract(
- page["i7"], "ownload original", "<", pos)[0])
+ i6, "ownload original", "<", pos)[0])
+ data["_fallback"] = ("{}?nl={}".format(url, nl),)
else:
url = imgurl
data = self._parse_image_info(url)
+ data["_fallback"] = self._fallback(
+ imgkey, request["page"], nl)
except IndexError:
self.log.debug("Page content:\n%s", page)
raise exception.StopExtraction(
@@ -326,7 +347,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
data["image_token"] = imgkey
self._check_509(imgurl, data)
- yield url, text.nameext_from_url(imgurl, data)
+ yield url, text.nameext_from_url(url, data)
request["imgkey"] = nextkey
@@ -390,6 +411,14 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
raise exception.NotFoundError("image page")
return page
+ def _fallback(self, imgkey, num, nl):
+ url = "{}/s/{}/{}-{}?nl={}".format(
+ self.root, imgkey or self.key_start, self.gallery_id, num, nl)
+ page = self.request(url, fatal=False).text
+ if page.startswith(("Invalid page", "Keep trying")):
+ return
+ yield self.image_from_page(page)[0]
+
@staticmethod
def _parse_image_info(url):
for part in url.split("/")[4:]:
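The "_fallback" entries added above follow gallery-dl's convention of storing alternate URLs that are tried only after the primary download fails. Returning a generator, as _fallback() does, keeps the extra "?nl=..." page request from happening unless it is actually needed. A toy sketch of that laziness (helper and URLs are hypothetical):

def _fallback_urls(base, nl):
    # stand-in mirroring the shape of _fallback() above
    print("building retry URL")          # runs only on first next()
    yield "{}?nl={}".format(base, nl)

urls = _fallback_urls("https://example.org/fullimg/1/2", "abc-123")
print("nothing requested yet")
print(next(urls))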
diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py
index 4a67695..6218f19 100644
--- a/gallery_dl/extractor/fantia.py
+++ b/gallery_dl/extractor/fantia.py
@@ -108,7 +108,7 @@ class FantiaExtractor(Extractor):
"fanclub_user_name": resp["fanclub"]["user"]["name"],
"fanclub_name": resp["fanclub"]["name"],
"fanclub_url": self.root+"/fanclubs/"+str(resp["fanclub"]["id"]),
- "tags": resp["tags"],
+ "tags": [t["name"] for t in resp["tags"]],
"_data": resp,
}
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index bc49ca3..88f5708 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -21,7 +21,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
category = "hitomi"
root = "https://hitomi.la"
pattern = (r"(?:https?://)?hitomi\.la"
- r"/(?:manga|doujinshi|cg|gamecg|galleries|reader)"
+ r"/(?:manga|doujinshi|cg|gamecg|imageset|galleries|reader)"
r"/(?:[^/?#]+-)?(\d+)")
example = "https://hitomi.la/manga/TITLE-867789.html"
diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py
index 16e4097..b7b6ef1 100644
--- a/gallery_dl/extractor/idolcomplex.py
+++ b/gallery_dl/extractor/idolcomplex.py
@@ -22,6 +22,7 @@ class IdolcomplexExtractor(SankakuExtractor):
cookies_domain = "idol.sankakucomplex.com"
cookies_names = ("login", "pass_hash")
root = "https://" + cookies_domain
+ referer = False
request_interval = 5.0
def __init__(self, match):
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index c704183..b0789be 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -778,13 +778,15 @@ class InstagramRestAPI():
kwargs["headers"] = {
"Accept" : "*/*",
"X-CSRFToken" : extr.csrf_token,
- "X-Instagram-AJAX": "1006242110",
"X-IG-App-ID" : "936619743392459",
- "X-ASBD-ID" : "198387",
+ "X-ASBD-ID" : "129477",
"X-IG-WWW-Claim" : extr.www_claim,
"X-Requested-With": "XMLHttpRequest",
- "Alt-Used" : "www.instagram.com",
+ "Connection" : "keep-alive",
"Referer" : extr.root + "/",
+ "Sec-Fetch-Dest" : "empty",
+ "Sec-Fetch-Mode" : "cors",
+ "Sec-Fetch-Site" : "same-origin",
}
return extr.request(url, **kwargs).json()
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 1596cfb..cba6211 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -24,7 +24,7 @@ class KemonopartyExtractor(Extractor):
category = "kemonoparty"
root = "https://kemono.party"
directory_fmt = ("{category}", "{service}", "{user}")
- filename_fmt = "{id}_{title}_{num:>02}_{filename[:180]}.{extension}"
+ filename_fmt = "{id}_{title[:180]}_{num:>02}_{filename[:180]}.{extension}"
archive_fmt = "{service}_{user}_{id}_{num}"
cookies_domain = ".kemono.party"
@@ -69,8 +69,9 @@ class KemonopartyExtractor(Extractor):
headers["Referer"] = "{}/{}/user/{}/post/{}".format(
self.root, post["service"], post["user"], post["id"])
post["_http_headers"] = headers
- post["date"] = text.parse_datetime(
- post["published"] or post["added"], "%Y-%m-%dT%H:%M:%S")
+ post["date"] = self._parse_datetime(
+ post["published"] or post["added"])
+
if username:
post["username"] = username
if comments:
@@ -205,6 +206,11 @@ class KemonopartyExtractor(Extractor):
})
return dms
+ def _parse_datetime(self, date_string):
+ if len(date_string) > 19:
+ date_string = date_string[:19]
+ return text.parse_datetime(date_string, "%Y-%m-%dT%H:%M:%S")
+
@memcache(keyarg=1)
def _discord_channels(self, server):
url = "{}/api/v1/discord/channel/lookup/{}".format(
@@ -213,7 +219,14 @@ class KemonopartyExtractor(Extractor):
@memcache(keyarg=1)
def _post_revisions(self, url):
- return self.request(url + "/revisions").json()
+ revs = self.request(url + "/revisions").json()
+
+ idx = len(revs)
+ for rev in revs:
+ rev["revision_index"] = idx
+ idx -= 1
+
+ return revs
def _validate(response):
@@ -247,13 +260,15 @@ class KemonopartyUserExtractor(KemonopartyExtractor):
if revisions:
for post in posts:
post["revision_id"] = 0
- yield post
post_url = "{}/post/{}".format(self.api_url, post["id"])
try:
revs = self._post_revisions(post_url)
except exception.HttpError:
- pass
+ post["revision_index"] = 1
+ yield post
else:
+ post["revision_index"] = len(revs) + 1
+ yield post
yield from revs
else:
yield from posts
@@ -286,8 +301,9 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
try:
revs = self._post_revisions(self.api_url)
except exception.HttpError:
- pass
+ post["revision_index"] = 1
else:
+ post["revision_index"] = len(revs) + 1
return itertools.chain((post,), revs)
return (post,)
@@ -360,8 +376,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
"name": path, "type": "inline", "hash": ""})
post["channel_name"] = self.channel_name
- post["date"] = text.parse_datetime(
- post["published"], "%Y-%m-%dT%H:%M:%S.%f")
+ post["date"] = self._parse_datetime(post["published"])
post["count"] = len(files)
yield Message.Directory, post
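The _parse_datetime() helper introduced above exists because the API returns timestamps both with and without fractional seconds; truncating to 19 characters ("YYYY-MM-DDTHH:MM:SS") lets one format string handle both. Likewise, revision_index counts down from len(revs), so the newest revision gets the highest number and the current post itself sits at len(revs) + 1. A standalone sketch of the truncation:

from datetime import datetime

def parse_datetime(date_string):
    # cut off ".%f" and anything after it before parsing
    return datetime.strptime(date_string[:19], "%Y-%m-%dT%H:%M:%S")

print(parse_datetime("2023-05-01T10:20:30.123000"))  # 2023-05-01 10:20:30
print(parse_datetime("2023-05-01T10:20:30"))         # 2023-05-01 10:20:30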
diff --git a/gallery_dl/extractor/misskey.py b/gallery_dl/extractor/misskey.py
index 95b83b6..5385f8a 100644
--- a/gallery_dl/extractor/misskey.py
+++ b/gallery_dl/extractor/misskey.py
@@ -70,6 +70,10 @@ BASE_PATTERN = MisskeyExtractor.update({
"root": "https://misskey.io",
"pattern": r"misskey\.io",
},
+ "misskey.design": {
+ "root": "https://misskey.design",
+ "pattern": r"misskey\.design",
+ },
"lesbian.energy": {
"root": "https://lesbian.energy",
"pattern": r"lesbian\.energy",
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index b902404..76c5404 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -19,6 +19,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
directory_fmt = ("{category}", "{user_id}")
filename_fmt = "{image_id}_p{num}.{extension}"
archive_fmt = "{image_id}_{num}"
+ request_interval = (1.0, 2.0)
def __init__(self, match):
BaseExtractor.__init__(self, match)
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 6ac9a83..6aef9cb 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -249,8 +249,23 @@ class PatreonExtractor(Extractor):
return [genmap[ft] for ft in filetypes]
def _extract_bootstrap(self, page):
- return util.json_loads(text.extr(
- page, "window.patreon.bootstrap,", "});") + "}")
+ bootstrap = text.extr(
+ page, 'window.patreon = {"bootstrap":', '},"apiServer"')
+ if bootstrap:
+ return util.json_loads(bootstrap + "}")
+
+ bootstrap = text.extr(page, "window.patreon.bootstrap,", "});")
+ if bootstrap:
+ return util.json_loads(bootstrap + "}")
+
+ data = text.extr(page, "window.patreon = {", "};\n")
+ if data:
+ try:
+ return util.json_loads("{" + data + "}")["bootstrap"]
+ except Exception:
+ pass
+
+ raise exception.StopExtraction("Unable to extract bootstrap data")
class PatreonCreatorExtractor(PatreonExtractor):
@@ -267,34 +282,52 @@ class PatreonCreatorExtractor(PatreonExtractor):
def posts(self):
query = text.parse_query(self.query)
+ campaign_id = self._get_campaign_id(query)
+ filters = self._get_filters(query)
+
+ self.log.debug("campaign_id: %s", campaign_id)
+
+ url = self._build_url("posts", (
+ "&filter[campaign_id]=" + campaign_id +
+ "&filter[contains_exclusive_posts]=true"
+ "&filter[is_draft]=false" + filters +
+ "&sort=" + query.get("sort", "-published_at")
+ ))
+ return self._pagination(url)
- creator_id = query.get("u")
- if creator_id:
- url = "{}/user/posts?u={}".format(self.root, creator_id)
+ def _get_campaign_id(self, query):
+ if self.creator.startswith("id:"):
+ return self.creator[3:]
+
+ campaign_id = query.get("c") or query.get("campaign_id")
+ if campaign_id:
+ return campaign_id
+
+ user_id = query.get("u")
+ if user_id:
+ url = "{}/user/posts?u={}".format(self.root, user_id)
else:
url = "{}/{}/posts".format(self.root, self.creator)
page = self.request(url, notfound="creator").text
try:
+ data = None
data = self._extract_bootstrap(page)
- campaign_id = data["campaign"]["data"]["id"]
- except (KeyError, ValueError):
- raise exception.NotFoundError("creator")
-
- filters = "".join(
+ return data["campaign"]["data"]["id"]
+ except (KeyError, ValueError) as exc:
+ if data:
+ self.log.debug(data)
+ raise exception.StopExtraction(
+ "Unable to extract campaign ID (%s: %s)",
+ exc.__class__.__name__, exc)
+
+ def _get_filters(self, query):
+ return "".join(
"&filter[{}={}".format(key[8:], text.escape(value))
for key, value in query.items()
if key.startswith("filters[")
)
- url = self._build_url("posts", (
- "&filter[campaign_id]=" + campaign_id +
- "&filter[contains_exclusive_posts]=true"
- "&filter[is_draft]=false" + filters +
- "&sort=" + query.get("sort", "-published_at")
- ))
- return self._pagination(url)
-
class PatreonUserExtractor(PatreonExtractor):
"""Extractor for media from creators supported by you"""
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index 18a3ceb..411d191 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -517,6 +517,7 @@ class PixivPixivisionExtractor(PixivExtractor):
directory_fmt = ("{category}", "pixivision",
"{pixivision_id} {pixivision_title}")
archive_fmt = "V{pixivision_id}_{id}{suffix}.{extension}"
+ cookies_domain = ".pixiv.net"
pattern = r"(?:https?://)?(?:www\.)?pixivision\.net/(?:en/)?a/(\d+)"
example = "https://www.pixivision.net/en/a/12345"
@@ -549,6 +550,9 @@ class PixivSeriesExtractor(PixivExtractor):
directory_fmt = ("{category}", "{user[id]} {user[account]}",
"{series[id]} {series[title]}")
filename_fmt = "{num_series:>03}_{id}_p{num}.{extension}"
+ cookies_domain = ".pixiv.net"
+ browser = "firefox"
+ tls12 = False
pattern = BASE_PATTERN + r"/user/(\d+)/series/(\d+)"
example = "https://www.pixiv.net/user/12345/series/12345"
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index cd2ba3d..c0bf5b3 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -292,6 +292,29 @@ class RedditImageExtractor(Extractor):
yield Message.Url, url, data
+class RedditRedirectExtractor(Extractor):
+ """Extractor for personalized share URLs produced by the mobile app"""
+ category = "reddit"
+ subcategory = "redirect"
+ pattern = (r"(?:https?://)?(?:"
+ r"(?:\w+\.)?reddit\.com/(?:(?:r)/([^/?#]+)))"
+ r"/s/([a-zA-Z0-9]{10})")
+ example = "https://www.reddit.com/r/SUBREDDIT/s/abc456GHIJ"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.subreddit = match.group(1)
+ self.share_url = match.group(2)
+
+ def items(self):
+ url = "https://www.reddit.com/r/" + self.subreddit + "/s/" + \
+ self.share_url
+ data = {"_extractor": RedditSubmissionExtractor}
+ response = self.request(url, method="HEAD", allow_redirects=False,
+ notfound="submission")
+ yield Message.Queue, response.headers["Location"], data
+
+
class RedditAPI():
"""Interface for the Reddit API
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index dc35511..bebea2a 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -87,7 +87,7 @@ class SankakuTagExtractor(SankakuExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
- pattern = BASE_PATTERN + r"/?\?([^#]*)"
+ pattern = BASE_PATTERN + r"(?:/posts)?/?\?([^#]*)"
example = "https://sankaku.app/?tags=TAG"
def __init__(self, match):
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 61e871e..4766ae5 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -10,12 +10,13 @@
from .common import Extractor, Message
from .. import text, util, exception
-from ..cache import cache
+from ..cache import cache, memcache
import itertools
import json
import re
-BASE_PATTERN = r"(?:https?://)?(?:www\.|mobile\.)?(?:(?:[fv]x)?twitter|x)\.com"
+BASE_PATTERN = (r"(?:https?://)?(?:www\.|mobile\.)?"
+ r"(?:(?:[fv]x)?twitter|(?:fixup)?x)\.com")
class TwitterExtractor(Extractor):
@@ -272,25 +273,23 @@ class TwitterExtractor(Extractor):
author = tweet["user"]
author = self._transform_user(author)
- if "note_tweet" in tweet:
- note = tweet["note_tweet"]["note_tweet_results"]["result"]
- else:
- note = None
-
- source = tweet["source"]
-
if "legacy" in tweet:
- tweet = tweet["legacy"]
+ legacy = tweet["legacy"]
+ else:
+ legacy = tweet
+ tget = legacy.get
- tweet_id = int(tweet["id_str"])
+ tweet_id = int(legacy["id_str"])
if tweet_id >= 300000000000000:
date = text.parse_timestamp(
((tweet_id >> 22) + 1288834974657) // 1000)
else:
- date = text.parse_datetime(
- tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
+ try:
+ date = text.parse_datetime(
+ legacy["created_at"], "%a %b %d %H:%M:%S %z %Y")
+ except Exception:
+ date = util.NONE
- tget = tweet.get
tdata = {
"tweet_id" : tweet_id,
"retweet_id" : text.parse_int(
@@ -304,8 +303,8 @@ class TwitterExtractor(Extractor):
"date" : date,
"author" : author,
"user" : self._user or author,
- "lang" : tweet["lang"],
- "source" : text.extr(source, ">", "<"),
+ "lang" : legacy["lang"],
+ "source" : text.extr(tweet["source"], ">", "<"),
"sensitive" : tget("possibly_sensitive"),
"favorite_count": tget("favorite_count"),
"quote_count" : tget("quote_count"),
@@ -313,7 +312,13 @@ class TwitterExtractor(Extractor):
"retweet_count" : tget("retweet_count"),
}
- entities = note["entity_set"] if note else tweet["entities"]
+ if "note_tweet" in tweet:
+ note = tweet["note_tweet"]["note_tweet_results"]["result"]
+ content = note["text"]
+ entities = note["entity_set"]
+ else:
+ content = tget("full_text") or tget("text") or ""
+ entities = legacy["entities"]
hashtags = entities.get("hashtags")
if hashtags:
@@ -327,8 +332,7 @@ class TwitterExtractor(Extractor):
"nick": u["name"],
} for u in mentions]
- content = text.unescape(
- note["text"] if note else tget("full_text") or tget("text") or "")
+ content = text.unescape(content)
urls = entities.get("urls")
if urls:
for url in urls:
@@ -336,11 +340,13 @@ class TwitterExtractor(Extractor):
txt, _, tco = content.rpartition(" ")
tdata["content"] = txt if tco.startswith("https://t.co/") else content
- if "in_reply_to_screen_name" in tweet:
- tdata["reply_to"] = tweet["in_reply_to_screen_name"]
- if "quoted_by" in tweet:
- tdata["quote_by"] = tweet["quoted_by"]
+ if "in_reply_to_screen_name" in legacy:
+ tdata["reply_to"] = legacy["in_reply_to_screen_name"]
+ if "quoted_by" in legacy:
+ tdata["quote_by"] = legacy["quoted_by"]
if tdata["retweet_id"]:
+ tdata["content"] = "RT @{}: {}".format(
+ author["name"], tdata["content"])
tdata["date_original"] = text.parse_timestamp(
((tdata["retweet_id"] >> 22) + 1288834974657) // 1000)
@@ -1194,6 +1200,7 @@ class TwitterAPI():
}
return self._pagination_users(endpoint, variables)
+ @memcache(keyarg=1)
def user_by_rest_id(self, rest_id):
endpoint = "/graphql/1YAM811Q8Ry4XyPpJclURQ/UserByRestId"
features = self.features.copy()
@@ -1207,6 +1214,7 @@ class TwitterAPI():
}
return self._call(endpoint, params)["data"]["user"]["result"]
+ @memcache(keyarg=1)
def user_by_screen_name(self, screen_name):
endpoint = "/graphql/XA6F1nJELYg65hxOC2Ekmg/UserByScreenName"
params = {
@@ -1527,15 +1535,21 @@ class TwitterAPI():
retweet["core"]["user_results"]["result"]
rtlegacy = retweet["legacy"]
+
+ if "note_tweet" in retweet:
+ tweet["note_tweet"] = retweet["note_tweet"]
+
if "extended_entities" in rtlegacy and \
"extended_entities" not in legacy:
legacy["extended_entities"] = \
rtlegacy["extended_entities"]
+
if "withheld_scope" in rtlegacy and \
"withheld_scope" not in legacy:
legacy["withheld_scope"] = \
rtlegacy["withheld_scope"]
- legacy["full_text"] = rtlegacy["full_text"]
+
+ legacy["full_text"] = rtlegacy["full_text"]
except KeyError:
pass
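The tweet_id >= 300000000000000 branch above relies on snowflake IDs: for any post-2010 tweet, the upper bits encode a millisecond offset from Twitter's epoch, 1288834974657 (2010-11-04 01:42:54.657 UTC), so the date can be recovered even when "created_at" is missing or malformed. A worked example:

from datetime import datetime, timezone

tweet_id = 1234567890123456789
millis = (tweet_id >> 22) + 1288834974657        # add snowflake epoch offset
print(datetime.fromtimestamp(millis // 1000, timezone.utc))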
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index 168d5a0..ed05e1f 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -191,7 +191,7 @@ class WeiboExtractor(Extractor):
headers = {"Referer": response.url}
data = {
"cb": "gen_callback",
- "fp": '{"os":"1","browser":"Gecko91,0,0,0","fonts":"undefined",'
+ "fp": '{"os":"1","browser":"Gecko109,0,0,0","fonts":"undefined",'
'"screenInfo":"1920*1080*24","plugins":""}',
}
@@ -203,8 +203,8 @@ class WeiboExtractor(Extractor):
params = {
"a" : "incarnate",
"t" : data["tid"],
- "w" : "2",
- "c" : "{:>03}".format(data["confidence"]),
+ "w" : "3" if data.get("new_tid") else "2",
+ "c" : "{:>03}".format(data.get("confidence") or 100),
"gc" : "",
"cb" : "cross_domain",
"from" : "weibo",