author    Unit 193 <unit193@unit193.net>  2024-10-25 17:27:30 -0400
committer Unit 193 <unit193@unit193.net>  2024-10-25 17:27:30 -0400
commit    fc004701f923bb954a22c7fec2ae8d607e78cb2b
tree      a5bea4ed6447ea43c099131430e3bd6182ee87d7 /gallery_dl/extractor
parent    0db541f524e1774865efebcbe5653e9ad76ea2e8

New upstream version 1.27.7 (tag: upstream/1.27.7)
Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/8chan.py         |  53
-rw-r--r--  gallery_dl/extractor/__init__.py      |   1
-rw-r--r--  gallery_dl/extractor/behance.py       |   1
-rw-r--r--  gallery_dl/extractor/bluesky.py       | 101
-rw-r--r--  gallery_dl/extractor/bunkr.py         | 143
-rw-r--r--  gallery_dl/extractor/civitai.py       | 115
-rw-r--r--  gallery_dl/extractor/cohost.py        |   2
-rw-r--r--  gallery_dl/extractor/common.py        |  39
-rw-r--r--  gallery_dl/extractor/deviantart.py    |   4
-rw-r--r--  gallery_dl/extractor/exhentai.py      |   6
-rw-r--r--  gallery_dl/extractor/foolfuuka.py     |   2
-rw-r--r--  gallery_dl/extractor/lensdump.py      | 109
-rw-r--r--  gallery_dl/extractor/lolisafe.py      |  10
-rw-r--r--  gallery_dl/extractor/mangadex.py      |  22
-rw-r--r--  gallery_dl/extractor/mangakakalot.py  |   6
-rw-r--r--  gallery_dl/extractor/newgrounds.py    |  60
-rw-r--r--  gallery_dl/extractor/nozomi.py        |   3
-rw-r--r--  gallery_dl/extractor/patreon.py       |   7
-rw-r--r--  gallery_dl/extractor/pinterest.py     | 171
-rw-r--r--  gallery_dl/extractor/pixiv.py         |  77
-rw-r--r--  gallery_dl/extractor/postmill.py      |   2
-rw-r--r--  gallery_dl/extractor/reddit.py        |   8
-rw-r--r--  gallery_dl/extractor/scrolller.py     | 227
-rw-r--r--  gallery_dl/extractor/telegraph.py     |   2
-rw-r--r--  gallery_dl/extractor/tsumino.py       |   6
-rw-r--r--  gallery_dl/extractor/urlgalleries.py  |  30
-rw-r--r--  gallery_dl/extractor/vk.py            |   9
-rw-r--r--  gallery_dl/extractor/wikimedia.py     |   5
28 files changed, 887 insertions(+), 334 deletions(-)
diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py
index f81d2a1..ce1c52a 100644
--- a/gallery_dl/extractor/8chan.py
+++ b/gallery_dl/extractor/8chan.py
@@ -27,12 +27,22 @@ class _8chanExtractor(Extractor):
Extractor.__init__(self, match)
def _init(self):
- now = util.datetime_utcnow()
- domain = self.root.rpartition("/")[2]
- self.cookies.set(
- now.strftime("TOS%Y%m%d"), "1", domain=domain)
- self.cookies.set(
- (now - timedelta(1)).strftime("TOS%Y%m%d"), "1", domain=domain)
+ tos = self.cookies_tos_name()
+ self.cookies.set(tos, "1", domain=self.root[8:])
+
+ @memcache()
+ def cookies_tos_name(self):
+ url = self.root + "/.static/pages/confirmed.html"
+ headers = {"Referer": self.root + "/.static/pages/disclaimer.html"}
+ response = self.request(url, headers=headers, allow_redirects=False)
+
+ for cookie in response.cookies:
+ if cookie.name.lower().startswith("tos"):
+ self.log.debug("TOS cookie name: %s", cookie.name)
+ return cookie.name
+
+ self.log.error("Unable to determine TOS cookie name")
+ return "TOS20241009"
@memcache()
def cookies_prepare(self):
@@ -64,16 +74,14 @@ class _8chanThreadExtractor(_8chanExtractor):
"{threadId} {subject[:50]}")
filename_fmt = "{postId}{num:?-//} {filename[:200]}.{extension}"
archive_fmt = "{boardUri}_{postId}_{num}"
- pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)"
+ pattern = BASE_PATTERN + r"/([^/?#]+)/(?:res|last)/(\d+)"
example = "https://8chan.moe/a/res/12345.html"
- def __init__(self, match):
- _8chanExtractor.__init__(self, match)
- _, self.board, self.thread = match.groups()
-
def items(self):
+ _, board, thread = self.groups
+
# fetch thread data
- url = "{}/{}/res/{}.".format(self.root, self.board, self.thread)
+ url = "{}/{}/res/{}.".format(self.root, board, thread)
self.session.headers["Referer"] = url + "html"
thread = self.request(url + "json").json()
thread["postId"] = thread["threadId"]
@@ -106,25 +114,22 @@ class _8chanBoardExtractor(_8chanExtractor):
pattern = BASE_PATTERN + r"/([^/?#]+)/(?:(\d+)\.html)?$"
example = "https://8chan.moe/a/"
- def __init__(self, match):
- _8chanExtractor.__init__(self, match)
- _, self.board, self.page = match.groups()
-
def items(self):
- page = text.parse_int(self.page, 1)
- url = "{}/{}/{}.json".format(self.root, self.board, page)
- board = self.request(url).json()
- threads = board["threads"]
+ _, board, pnum = self.groups
+ pnum = text.parse_int(pnum, 1)
+ url = "{}/{}/{}.json".format(self.root, board, pnum)
+ data = self.request(url).json()
+ threads = data["threads"]
while True:
for thread in threads:
thread["_extractor"] = _8chanThreadExtractor
url = "{}/{}/res/{}.html".format(
- self.root, self.board, thread["threadId"])
+ self.root, board, thread["threadId"])
yield Message.Queue, url, thread
- page += 1
- if page > board["pageCount"]:
+ pnum += 1
+ if pnum > data["pageCount"]:
return
- url = "{}/{}/{}.json".format(self.root, self.board, page)
+ url = "{}/{}/{}.json".format(self.root, board, pnum)
threads = self.request(url).json()["threads"]
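
The hunk above swaps 8chan's hard-coded, date-stamped TOS cookies for runtime discovery: the extractor probes the confirmation page once (memoized via @memcache) and reads whatever cookie name the server actually sets, falling back to a dated default. A standalone sketch of the same probe, using plain requests instead of gallery-dl's Extractor.request (names here are illustrative):

import requests

def discover_tos_cookie_name(root="https://8chan.moe"):
    """Hit the TOS confirmation page and return the cookie name it sets."""
    headers = {"Referer": root + "/.static/pages/disclaimer.html"}
    response = requests.get(root + "/.static/pages/confirmed.html",
                            headers=headers, allow_redirects=False)
    for cookie in response.cookies:
        if cookie.name.lower().startswith("tos"):
            return cookie.name          # e.g. "TOS20241009"
    return "TOS20241009"                # dated fallback; may go stale
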
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 9885195..4e9fa50 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -141,6 +141,7 @@ modules = [
"rule34us",
"sankaku",
"sankakucomplex",
+ "scrolller",
"seiga",
"senmanga",
"sexcom",
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
index 72f9195..14598b7 100644
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@@ -171,6 +171,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
url = text.extr(page, '<source src="', '"')
if text.ext_from_url(url) == "m3u8":
url = "ytdl:" + url
+ module["_ytdl_manifest"] = "hls"
module["extension"] = "mp4"
append((url, module))
continue
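
This one-line addition (repeated in the patreon, pinterest, and reddit hunks below) pairs the ytdl: URL prefix with a _ytdl_manifest hint, so the downloader knows up front which manifest protocol it is fetching instead of sniffing the URL. Roughly, the metadata a queued entry carries after this change (manifest_url is hypothetical):

manifest_url = "https://example.com/video/master.m3u8"   # hypothetical
module = {}
url = "ytdl:" + manifest_url          # route through the ytdl downloader
module["_ytdl_manifest"] = "hls"      # "dash" for reddit's DASH manifests
module["extension"] = "mp4"           # container after remuxing
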
diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py
index 39c5635..a1a488e 100644
--- a/gallery_dl/extractor/bluesky.py
+++ b/gallery_dl/extractor/bluesky.py
@@ -42,62 +42,76 @@ class BlueskyExtractor(Extractor):
self._user = self._user_did = None
self.instance = self.root.partition("://")[2]
self.videos = self.config("videos", True)
+ self.quoted = self.config("quoted", False)
def items(self):
for post in self.posts():
if "post" in post:
post = post["post"]
-
- pid = post["uri"].rpartition("/")[2]
if self._user_did and post["author"]["did"] != self._user_did:
- self.log.debug("Skipping %s (repost)", pid)
- continue
-
- post.update(post["record"])
- del post["record"]
-
- if self._metadata_facets:
- if "facets" in post:
- post["hashtags"] = tags = []
- post["mentions"] = dids = []
- post["uris"] = uris = []
- for facet in post["facets"]:
- features = facet["features"][0]
- if "tag" in features:
- tags.append(features["tag"])
- elif "did" in features:
- dids.append(features["did"])
- elif "uri" in features:
- uris.append(features["uri"])
- else:
- post["hashtags"] = post["mentions"] = post["uris"] = ()
-
- if self._metadata_user:
- post["user"] = self._user or post["author"]
-
- files = self._extract_files(post)
- post["instance"] = self.instance
- post["post_id"] = pid
- post["count"] = len(files)
- post["date"] = text.parse_datetime(
- post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S")
-
- yield Message.Directory, post
-
- if not files:
+ self.log.debug("Skipping %s (repost)", self._pid(post))
continue
-
- base = ("https://bsky.social/xrpc/com.atproto.sync.getBlob"
- "?did={}&cid=".format(post["author"]["did"]))
- for post["num"], file in enumerate(files, 1):
- post.update(file)
- yield Message.Url, base + file["filename"], post
+ embed = post.get("embed")
+ post.update(post.pop("record"))
+
+ while True:
+ self._prepare(post)
+ files = self._extract_files(post)
+
+ yield Message.Directory, post
+ if files:
+ base = ("https://bsky.social/xrpc/com.atproto.sync.getBlob"
+ "?did={}&cid=".format(post["author"]["did"]))
+ for post["num"], file in enumerate(files, 1):
+ post.update(file)
+ yield Message.Url, base + file["filename"], post
+
+ if not self.quoted or not embed or "record" not in embed:
+ break
+
+ quote = embed["record"]
+ if "record" in quote:
+ quote = quote["record"]
+ quote["quote_id"] = self._pid(post)
+ quote["quote_by"] = post["author"]
+ embed = quote.get("embed")
+ quote.update(quote.pop("value"))
+ post = quote
def posts(self):
return ()
+ def _pid(self, post):
+ return post["uri"].rpartition("/")[2]
+
+ def _prepare(self, post):
+ if self._metadata_facets:
+ if "facets" in post:
+ post["hashtags"] = tags = []
+ post["mentions"] = dids = []
+ post["uris"] = uris = []
+ for facet in post["facets"]:
+ features = facet["features"][0]
+ if "tag" in features:
+ tags.append(features["tag"])
+ elif "did" in features:
+ dids.append(features["did"])
+ elif "uri" in features:
+ uris.append(features["uri"])
+ else:
+ post["hashtags"] = post["mentions"] = post["uris"] = ()
+
+ if self._metadata_user:
+ post["user"] = self._user or post["author"]
+
+ post["instance"] = self.instance
+ post["post_id"] = self._pid(post)
+ post["date"] = text.parse_datetime(
+ post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S")
+
def _extract_files(self, post):
if "embed" not in post:
+ post["count"] = 0
return ()
files = []
@@ -111,6 +125,7 @@ class BlueskyExtractor(Extractor):
if "video" in media and self.videos:
files.append(self._extract_media(media, "video"))
+ post["count"] = len(files)
return files
def _extract_media(self, media, key):
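
The restructured loop above no longer handles only the top-level record: when the quoted option is enabled, it keeps unwrapping embed["record"] so each quoted post is emitted with its own directory and files, tagged with quote_id/quote_by. The chain walk, reduced to a sketch over plain dicts (field names as in the AT-Protocol responses used above):

def iter_post_chain(post, quoted=True):
    """Yield a post and, if enabled, every post it transitively quotes."""
    embed = post.get("embed")
    post.update(post.pop("record"))         # lift record fields to top level
    while True:
        yield post
        if not quoted or not embed or "record" not in embed:
            return
        quote = embed["record"]
        if "record" in quote:               # recordWithMedia wraps once more
            quote = quote["record"]
        embed = quote.get("embed")
        quote.update(quote.pop("value"))    # quoted posts nest under "value"
        post = quote
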
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 9022ffc..6c79d0a 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -8,9 +8,10 @@
"""Extractors for https://bunkr.si/"""
+from .common import Extractor
from .lolisafe import LolisafeAlbumExtractor
-from .. import text, config
-
+from .. import text, config, exception
+import random
if config.get(("extractor", "bunkr"), "tlds"):
BASE_PATTERN = (
@@ -21,11 +22,28 @@ else:
BASE_PATTERN = (
r"(?:bunkr:(?:https?://)?([^/?#]+)|"
r"(?:https?://)?(?:app\.)?(bunkr+"
- r"\.(?:s[kiu]|[cf]i|pk|ru|la|is|to|a[cx]"
+ r"\.(?:s[kiu]|[cf]i|p[hks]|ru|la|is|to|a[cx]"
r"|black|cat|media|red|site|ws|org)))"
)
+DOMAINS = [
+ "bunkr.ac",
+ "bunkr.ci",
+ "bunkr.fi",
+ "bunkr.ph",
+ "bunkr.pk",
+ "bunkr.ps",
+ "bunkr.si",
+ "bunkr.sk",
+ "bunkr.ws",
+ "bunkr.black",
+ "bunkr.red",
+ "bunkr.media",
+ "bunkr.site",
+]
LEGACY_DOMAINS = {
+ "bunkr.ax",
+ "bunkr.cat",
"bunkr.ru",
"bunkrr.ru",
"bunkr.su",
@@ -34,6 +52,7 @@ LEGACY_DOMAINS = {
"bunkr.is",
"bunkr.to",
}
+CF_DOMAINS = set()
class BunkrAlbumExtractor(LolisafeAlbumExtractor):
@@ -49,45 +68,96 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
if domain not in LEGACY_DOMAINS:
self.root = "https://" + domain
+ def request(self, url, **kwargs):
+ kwargs["allow_redirects"] = False
+
+ while True:
+ try:
+ response = Extractor.request(self, url, **kwargs)
+ if response.status_code < 300:
+ return response
+
+ # redirect
+ url = response.headers["Location"]
+ root, path = self._split(url)
+ if root not in CF_DOMAINS:
+ continue
+ self.log.debug("Redirect to known CF challenge domain '%s'",
+ root)
+
+ except exception.HttpError as exc:
+ if exc.status != 403:
+ raise
+
+ # CF challenge
+ root, path = self._split(url)
+ CF_DOMAINS.add(root)
+ self.log.debug("Added '%s' to CF challenge domains", root)
+
+ try:
+ DOMAINS.remove(root.rpartition("/")[2])
+ except ValueError:
+ pass
+ else:
+ if not DOMAINS:
+ raise exception.StopExtraction(
+ "All Bunkr domains require solving a CF challenge")
+
+ # select alternative domain
+ root = "https://" + random.choice(DOMAINS)
+ self.log.debug("Trying '%s' as fallback", root)
+ url = root + path
+
def fetch_album(self, album_id):
# album metadata
page = self.request(self.root + "/a/" + self.album_id).text
- info = text.split_html(text.extr(
- page, "<h1", "</div>").partition(">")[2])
- count, _, size = info[1].split(None, 2)
+ title, size = text.split_html(text.extr(
+ page, "<h1", "</span>").partition(">")[2])
- pos = page.index('class="grid-images')
- urls = list(text.extract_iter(page, '<a href="', '"', pos))
-
- return self._extract_files(urls), {
+ items = list(text.extract_iter(page, "<!-- item -->", "<!-- -->"))
+ return self._extract_files(items), {
"album_id" : self.album_id,
- "album_name" : text.unescape(info[0]),
- "album_size" : size[1:-1],
- "count" : len(urls),
- "_http_validate": self._validate,
+ "album_name" : title,
+ "album_size" : text.extr(size, "(", ")"),
+ "count" : len(items),
}
- def _extract_files(self, urls):
- for url in urls:
+ def _extract_files(self, items):
+ for item in items:
try:
- url = self._extract_file(text.unescape(url))
+ url = text.extr(item, ' href="', '"')
+ file = self._extract_file(text.unescape(url))
+
+ info = text.split_html(item)
+ file["name"] = info[0]
+ file["size"] = info[2]
+ file["date"] = text.parse_datetime(
+ info[-1], "%H:%M:%S %d/%m/%Y")
+
+ yield file
+ except exception.StopExtraction:
+ raise
except Exception as exc:
self.log.error("%s: %s", exc.__class__.__name__, exc)
- continue
- yield {"file": text.unescape(url)}
-
- def _extract_file(self, url):
- page = self.request(url).text
- url = (text.extr(page, '<source src="', '"') or
- text.extr(page, '<img src="', '"'))
-
- if not url:
- url_download = text.rextract(
- page, ' href="', '"', page.rindex("Download"))[0]
- page = self.request(text.unescape(url_download)).text
- url = text.unescape(text.rextract(page, ' href="', '"')[0])
-
- return url
+ self.log.debug("", exc_info=exc)
+
+ def _extract_file(self, webpage_url):
+ response = self.request(webpage_url)
+ page = response.text
+ file_url = (text.extr(page, '<source src="', '"') or
+ text.extr(page, '<img src="', '"'))
+
+ if not file_url:
+ webpage_url = text.unescape(text.rextract(
+ page, ' href="', '"', page.rindex("Download"))[0])
+ response = self.request(webpage_url)
+ file_url = text.rextract(response.text, ' href="', '"')[0]
+
+ return {
+ "file" : text.unescape(file_url),
+ "_http_headers" : {"Referer": response.url},
+ "_http_validate": self._validate,
+ }
def _validate(self, response):
if response.history and response.url.endswith("/maintenance-vid.mp4"):
@@ -95,6 +165,10 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
return False
return True
+ def _split(self, url):
+ pos = url.index("/", 8)
+ return url[:pos], url[pos:]
+
class BunkrMediaExtractor(BunkrAlbumExtractor):
"""Extractor for bunkr.si media links"""
@@ -105,16 +179,15 @@ class BunkrMediaExtractor(BunkrAlbumExtractor):
def fetch_album(self, album_id):
try:
- url = self._extract_file(self.root + self.album_id)
+ file = self._extract_file(self.root + album_id)
except Exception as exc:
self.log.error("%s: %s", exc.__class__.__name__, exc)
return (), {}
- return ({"file": text.unescape(url)},), {
+ return (file,), {
"album_id" : "",
"album_name" : "",
"album_size" : -1,
"description": "",
"count" : 1,
- "_http_validate": self._validate,
}
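
The new Bunkr request wrapper treats any 403 as a Cloudflare challenge: the failing mirror is blacklisted in the module-level CF_DOMAINS set, and the same path is retried on a random surviving domain. Redirects are followed manually so a redirect into a known-bad mirror can be rerouted as well. The core strategy as a requests-based sketch (excerpted domain list; error handling simplified):

import random
import requests

DOMAINS = ["bunkr.si", "bunkr.fi", "bunkr.ph", "bunkr.black"]   # excerpt
CF_DOMAINS = set()                  # mirrors known to serve a CF challenge

def _split(url):
    pos = url.index("/", 8)         # first "/" after "https://"
    return url[:pos], url[pos:]

def fetch(path, session=None):
    session = session or requests.Session()
    url = "https://" + random.choice(DOMAINS) + path
    while True:
        response = session.get(url, allow_redirects=False)
        if response.status_code < 300:
            return response
        if response.status_code == 403:             # CF challenge
            root, path = _split(url)
            CF_DOMAINS.add(root)
            try:
                DOMAINS.remove(root.rpartition("/")[2])
            except ValueError:
                pass
            if not DOMAINS:
                raise RuntimeError("all mirrors require a CF challenge")
            url = "https://" + random.choice(DOMAINS) + path
        else:                                       # 3xx: follow by hand
            url = response.headers["Location"]
            root, path = _split(url)
            if root in CF_DOMAINS:                  # redirected into a bad one
                url = "https://" + random.choice(DOMAINS) + path
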
diff --git a/gallery_dl/extractor/civitai.py b/gallery_dl/extractor/civitai.py
index 725af3a..0b1e44a 100644
--- a/gallery_dl/extractor/civitai.py
+++ b/gallery_dl/extractor/civitai.py
@@ -9,7 +9,7 @@
"""Extractors for https://www.civitai.com/"""
from .common import Extractor, Message
-from .. import text, util
+from .. import text, util, exception
import itertools
import time
@@ -23,7 +23,7 @@ class CivitaiExtractor(Extractor):
root = "https://civitai.com"
directory_fmt = ("{category}", "{username|user[username]}", "images")
filename_fmt = "{file[id]|id|filename}.{extension}"
- archive_fmt = "{file[hash]|hash}"
+ archive_fmt = "{file[uuid]|uuid}"
request_interval = (0.5, 1.5)
def _init(self):
@@ -101,9 +101,11 @@ class CivitaiExtractor(Extractor):
def _url(self, image):
url = image["url"]
if "/" in url:
- parts = url.rsplit("/", 2)
- parts[1] = self._image_quality
+ parts = url.rsplit("/", 3)
+ image["uuid"] = parts[1]
+ parts[2] = self._image_quality
return "/".join(parts)
+ image["uuid"] = url
name = image.get("name")
if not name:
@@ -133,8 +135,6 @@ class CivitaiModelExtractor(CivitaiExtractor):
directory_fmt = ("{category}", "{user[username]}",
"{model[id]}{model[name]:? //}",
"{version[id]}{version[name]:? //}")
- filename_fmt = "{file[id]}.{extension}"
- archive_fmt = "{file[hash]}"
pattern = BASE_PATTERN + r"/models/(\d+)(?:/?\?modelVersionId=(\d+))?"
example = "https://civitai.com/models/12345/TITLE"
@@ -195,19 +195,25 @@ class CivitaiModelExtractor(CivitaiExtractor):
)
def _extract_files_model(self, model, version, user):
- return [
- {
+ files = []
+
+ for num, file in enumerate(version["files"], 1):
+ file["uuid"] = "model-{}-{}-{}".format(
+ model["id"], version["id"], file["id"])
+ files.append({
"num" : num,
"file" : file,
"filename" : file["name"],
"extension": "bin",
- "url" : file["downloadUrl"],
+ "url" : file.get("downloadUrl") or
+ "{}/api/download/models/{}".format(
+ self.root, version["id"]),
"_http_headers" : {
"Authorization": self.api.headers.get("Authorization")},
"_http_validate": self._validate_file_model,
- }
- for num, file in enumerate(version["files"], 1)
- ]
+ })
+
+ return files
def _extract_files_image(self, model, version, user):
if "images" in version:
@@ -263,24 +269,14 @@ class CivitaiPostExtractor(CivitaiExtractor):
return ({"id": int(self.groups[0])},)
-class CivitaiTagModelsExtractor(CivitaiExtractor):
- subcategory = "tag-models"
- pattern = BASE_PATTERN + r"/(?:tag/|models\?tag=)([^/?&#]+)"
+class CivitaiTagExtractor(CivitaiExtractor):
+ subcategory = "tag"
+ pattern = BASE_PATTERN + r"/tag/([^/?&#]+)"
example = "https://civitai.com/tag/TAG"
def models(self):
tag = text.unquote(self.groups[0])
- return self.api.models({"tag": tag})
-
-
-class CivitaiTagImagesExtractor(CivitaiExtractor):
- subcategory = "tag-images"
- pattern = BASE_PATTERN + r"/images\?tags=([^&#]+)"
- example = "https://civitai.com/images?tags=12345"
-
- def images(self):
- tag = text.unquote(self.groups[0])
- return self.api.images({"tag": tag})
+ return self.api.models_tag(tag)
class CivitaiSearchExtractor(CivitaiExtractor):
@@ -293,6 +289,26 @@ class CivitaiSearchExtractor(CivitaiExtractor):
return self.api.models(params)
+class CivitaiModelsExtractor(CivitaiExtractor):
+ subcategory = "models"
+ pattern = BASE_PATTERN + r"/models(?:/?\?([^#]+))?(?:$|#)"
+ example = "https://civitai.com/models"
+
+ def models(self):
+ params = text.parse_query(self.groups[0])
+ return self.api.models(params)
+
+
+class CivitaiImagesExtractor(CivitaiExtractor):
+ subcategory = "images"
+ pattern = BASE_PATTERN + r"/images(?:/?\?([^#]+))?(?:$|#)"
+ example = "https://civitai.com/images"
+
+ def images(self):
+ params = text.parse_query(self.groups[0])
+ return self.api.images(params)
+
+
class CivitaiUserExtractor(CivitaiExtractor):
subcategory = "user"
pattern = USER_PATTERN + r"/?(?:$|\?|#)"
@@ -339,11 +355,35 @@ class CivitaiUserImagesExtractor(CivitaiExtractor):
pattern = USER_PATTERN + r"/images/?(?:\?([^#]+))?"
example = "https://civitai.com/user/USER/images"
+ def __init__(self, match):
+ self.params = text.parse_query_list(match.group(2))
+ if self.params.get("section") == "reactions":
+ self.subcategory = "reactions"
+ self.images = self.images_reactions
+ CivitaiExtractor.__init__(self, match)
+
def images(self):
- params = text.parse_query(self.groups[1])
+ params = self.params
params["username"] = text.unquote(self.groups[0])
return self.api.images(params)
+ def images_reactions(self):
+ if "Authorization" not in self.api.headers and \
+ not self.cookies.get(
+ "__Secure-civitai-token", domain=".civitai.com"):
+ raise exception.AuthorizationError("api-key or cookies required")
+
+ params = self.params
+ params["authed"] = True
+ params["useIndex"] = False
+ if "reactions" in params:
+ if isinstance(params["reactions"], str):
+ params["reactions"] = (params["reactions"],)
+ else:
+ params["reactions"] = (
+ "Like", "Dislike", "Heart", "Laugh", "Cry")
+ return self.api.images(params)
+
class CivitaiRestAPI():
"""Interface for the Civitai Public REST API
@@ -396,6 +436,9 @@ class CivitaiRestAPI():
def models(self, params):
return self._pagination("/v1/models", params)
+ def models_tag(self, tag):
+ return self.models({"tag": tag})
+
def _call(self, endpoint, params=None):
if endpoint[0] == "/":
url = self.root + endpoint
@@ -419,14 +462,14 @@ class CivitaiRestAPI():
class CivitaiTrpcAPI():
- """Interface for the Civitai TRPC API"""
+ """Interface for the Civitai tRPC API"""
def __init__(self, extractor):
self.extractor = extractor
self.root = extractor.root + "/api/trpc/"
self.headers = {
"content-type" : "application/json",
- "x-client-version": "5.0.146",
+ "x-client-version": "5.0.185",
"x-client-date" : "",
"x-client" : "web",
"x-fingerprint" : "undefined",
@@ -463,6 +506,7 @@ class CivitaiTrpcAPI():
"include" : ["cosmetics"],
})
+ params = self._type_params(params)
return self._pagination(endpoint, params)
def images_gallery(self, model, version, user):
@@ -516,6 +560,9 @@ class CivitaiTrpcAPI():
return self._pagination(endpoint, params)
+ def models_tag(self, tag):
+ return self.models({"tagname": tag})
+
def post(self, post_id):
endpoint = "post.get"
params = {"id": int(post_id)}
@@ -580,3 +627,13 @@ class CivitaiTrpcAPI():
def _merge_params(self, params_user, params_default):
params_default.update(params_user)
return params_default
+
+ def _type_params(self, params):
+ for key, type in (
+ ("tags" , int),
+ ("modelId" , int),
+ ("modelVersionId", int),
+ ):
+ if key in params:
+ params[key] = type(params[key])
+ return params
diff --git a/gallery_dl/extractor/cohost.py b/gallery_dl/extractor/cohost.py
index 4722a4f..0524239 100644
--- a/gallery_dl/extractor/cohost.py
+++ b/gallery_dl/extractor/cohost.py
@@ -109,7 +109,7 @@ class CohostUserExtractor(CohostExtractor):
"projectHandle": self.groups[0],
"page": 0,
"options": {
- "pinnedPostsAtTop" : bool(self.pinned),
+ "pinnedPostsAtTop" : True if self.pinned else False,
"hideReplies" : not self.replies,
"hideShares" : not self.shares,
"hideAsks" : not self.asks,
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 32c8e67..2146fa6 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -185,7 +185,9 @@ class Extractor():
self._dump_response(response)
if (
code < 400 or
- code < 500 and (not fatal and code != 429 or fatal is None)
+ code < 500 and (
+ not fatal and code != 429 or fatal is None) or
+ fatal is ...
):
if encoding:
response.encoding = encoding
@@ -454,46 +456,49 @@ class Extractor():
cookies = random.choice(cookies)
self.cookies_load(cookies)
- def cookies_load(self, cookies):
- if isinstance(cookies, dict):
- self.cookies_update_dict(cookies, self.cookies_domain)
+ def cookies_load(self, cookies_source):
+ if isinstance(cookies_source, dict):
+ self.cookies_update_dict(cookies_source, self.cookies_domain)
- elif isinstance(cookies, str):
- path = util.expand_path(cookies)
+ elif isinstance(cookies_source, str):
+ path = util.expand_path(cookies_source)
try:
with open(path) as fp:
- util.cookiestxt_load(fp, self.cookies)
+ cookies = util.cookiestxt_load(fp)
except Exception as exc:
self.log.warning("cookies: %s", exc)
else:
- self.log.debug("Loading cookies from '%s'", cookies)
+ self.log.debug("Loading cookies from '%s'", cookies_source)
+ set_cookie = self.cookies.set_cookie
+ for cookie in cookies:
+ set_cookie(cookie)
self.cookies_file = path
- elif isinstance(cookies, (list, tuple)):
- key = tuple(cookies)
- cookiejar = _browser_cookies.get(key)
+ elif isinstance(cookies_source, (list, tuple)):
+ key = tuple(cookies_source)
+ cookies = _browser_cookies.get(key)
- if cookiejar is None:
+ if cookies is None:
from ..cookies import load_cookies
- cookiejar = self.cookies.__class__()
try:
- load_cookies(cookiejar, cookies)
+ cookies = load_cookies(cookies_source)
except Exception as exc:
self.log.warning("cookies: %s", exc)
+ cookies = ()
else:
- _browser_cookies[key] = cookiejar
+ _browser_cookies[key] = cookies
else:
self.log.debug("Using cached cookies from %s", key)
set_cookie = self.cookies.set_cookie
- for cookie in cookiejar:
+ for cookie in cookies:
set_cookie(cookie)
else:
self.log.warning(
"Expected 'dict', 'list', or 'str' value for 'cookies' "
"option, got '%s' (%s)",
- cookies.__class__.__name__, cookies)
+ cookies_source.__class__.__name__, cookies_source)
def cookies_store(self):
"""Store the session's cookies in a cookies.txt file"""
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 836fae7..693def9 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -401,7 +401,7 @@ class DeviantartExtractor(Extractor):
html = content["html"]
markup = html["markup"]
- if not markup.startswith("{"):
+ if not markup or markup[0] != "{":
return markup
if html["type"] == "tiptap":
@@ -1301,7 +1301,7 @@ class DeviantartOAuthAPI():
metadata = extractor.config("metadata", False)
if not metadata:
- metadata = bool(extractor.extra)
+ metadata = True if extractor.extra else False
if metadata:
self.metadata = True
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 01af7a4..3e6d537 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -260,9 +260,9 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
"torrentcount" : extr('>Torrent Download (', ')'),
}
- if data["uploader"].startswith("<"):
- data["uploader"] = text.unescape(text.extr(
- data["uploader"], ">", "<"))
+ uploader = data["uploader"]
+ if uploader and uploader[0] == "<":
+ data["uploader"] = text.unescape(text.extr(uploader, ">", "<"))
f = data["favorites"][0]
if f == "N":
diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py
index 85dd896..44c4542 100644
--- a/gallery_dl/extractor/foolfuuka.py
+++ b/gallery_dl/extractor/foolfuuka.py
@@ -37,7 +37,7 @@ class FoolfuukaExtractor(BaseExtractor):
if not url and "remote_media_link" in media:
url = self.remote(media)
- if url.startswith("/"):
+ if url and url[0] == "/":
url = self.root + url
post["filename"], _, post["extension"] = \
diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py
index 12e8860..72a6453 100644
--- a/gallery_dl/extractor/lensdump.py
+++ b/gallery_dl/extractor/lensdump.py
@@ -17,42 +17,30 @@ class LensdumpBase():
category = "lensdump"
root = "https://lensdump.com"
- def nodes(self, page=None):
- if page is None:
- page = self.request(self.url).text
-
- # go through all pages starting from the oldest
- page_url = text.urljoin(self.root, text.extr(
- text.extr(page, ' id="list-most-oldest-link"', '>'),
- 'href="', '"'))
- while page_url is not None:
- if page_url == self.url:
- current_page = page
- else:
- current_page = self.request(page_url).text
-
- for node in text.extract_iter(
- current_page, ' class="list-item ', '>'):
- yield node
-
- # find url of next page
- page_url = text.extr(
- text.extr(current_page, ' data-pagination="next"', '>'),
- 'href="', '"')
- if page_url is not None and len(page_url) > 0:
- page_url = text.urljoin(self.root, page_url)
- else:
- page_url = None
+ def _pagination(self, page, begin, end):
+ while True:
+ yield from text.extract_iter(page, begin, end)
+
+ next = text.extr(page, ' data-pagination="next"', '>')
+ if not next:
+ return
+
+ url = text.urljoin(self.root, text.extr(next, 'href="', '"'))
+ page = self.request(url).text
class LensdumpAlbumExtractor(LensdumpBase, GalleryExtractor):
subcategory = "album"
- pattern = BASE_PATTERN + r"/(?:((?!\w+/albums|a/|i/)\w+)|a/(\w+))"
+ pattern = BASE_PATTERN + r"/a/(\w+)(?:/?\?([^#]+))?"
example = "https://lensdump.com/a/ID"
def __init__(self, match):
- GalleryExtractor.__init__(self, match, match.string)
- self.gallery_id = match.group(1) or match.group(2)
+ self.gallery_id, query = match.groups()
+ if query:
+ url = "{}/a/{}/?{}".format(self.root, self.gallery_id, query)
+ else:
+ url = "{}/a/{}".format(self.root, self.gallery_id)
+ GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
return {
@@ -62,40 +50,48 @@ class LensdumpAlbumExtractor(LensdumpBase, GalleryExtractor):
}
def images(self, page):
- for node in self.nodes(page):
- # get urls and filenames of images in current page
- json_data = util.json_loads(text.unquote(
- text.extr(node, "data-object='", "'") or
- text.extr(node, 'data-object="', '"')))
- image_id = json_data.get('name')
- image_url = json_data.get('url')
- image_title = json_data.get('title')
+ for image in self._pagination(page, ' class="list-item ', '>'):
+
+ data = util.json_loads(text.unquote(
+ text.extr(image, "data-object='", "'") or
+ text.extr(image, 'data-object="', '"')))
+ image_id = data.get("name")
+ image_url = data.get("url")
+ image_title = data.get("title")
if image_title is not None:
image_title = text.unescape(image_title)
+
yield (image_url, {
- 'id': image_id,
- 'url': image_url,
- 'title': image_title,
- 'name': json_data.get('filename'),
- 'filename': image_id,
- 'extension': json_data.get('extension'),
- 'height': text.parse_int(json_data.get('height')),
- 'width': text.parse_int(json_data.get('width')),
+ "id" : image_id,
+ "url" : image_url,
+ "title" : image_title,
+ "name" : data.get("filename"),
+ "filename" : image_id,
+ "extension": data.get("extension"),
+ "width" : text.parse_int(data.get("width")),
+ "height" : text.parse_int(data.get("height")),
})
class LensdumpAlbumsExtractor(LensdumpBase, Extractor):
"""Extractor for album list from lensdump.com"""
subcategory = "albums"
- pattern = BASE_PATTERN + r"/\w+/albums"
- example = "https://lensdump.com/USER/albums"
+ pattern = BASE_PATTERN + r"/(?![ai]/)([^/?#]+)(?:/?\?([^#]+))?"
+ example = "https://lensdump.com/USER"
def items(self):
- for node in self.nodes():
- album_url = text.urljoin(self.root, text.extr(
- node, 'data-url-short="', '"'))
- yield Message.Queue, album_url, {
- "_extractor": LensdumpAlbumExtractor}
+ user, query = self.groups
+ url = "{}/{}/".format(self.root, user)
+ if query:
+ params = text.parse_query(query)
+ else:
+ params = {"sort": "date_asc", "page": "1"}
+ page = self.request(url, params=params).text
+
+ data = {"_extractor": LensdumpAlbumExtractor}
+ for album_path in self._pagination(page, 'data-url-short="', '"'):
+ album_url = text.urljoin(self.root, album_path)
+ yield Message.Queue, album_url, data
class LensdumpImageExtractor(LensdumpBase, Extractor):
@@ -107,16 +103,13 @@ class LensdumpImageExtractor(LensdumpBase, Extractor):
pattern = r"(?:https?://)?(?:(?:i\d?\.)?lensdump\.com|\w\.l3n\.co)/i/(\w+)"
example = "https://lensdump.com/i/ID"
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.key = match.group(1)
-
def items(self):
- url = "{}/i/{}".format(self.root, self.key)
+ key = self.groups[0]
+ url = "{}/i/{}".format(self.root, key)
extr = text.extract_from(self.request(url).text)
data = {
- "id" : self.key,
+ "id" : key,
"title" : text.unescape(extr(
'property="og:title" content="', '"')),
"url" : extr(
diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py
index 6fc0689..044f4f5 100644
--- a/gallery_dl/extractor/lolisafe.py
+++ b/gallery_dl/extractor/lolisafe.py
@@ -47,7 +47,15 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
url = file["file"]
file.update(data)
text.nameext_from_url(url, file)
- file["name"], sep, file["id"] = file["filename"].rpartition("-")
+
+ if "name" in file:
+ name = file["name"]
+ file["name"] = name.rpartition(".")[0] or name
+ file["id"] = file["filename"].rpartition("-")[2]
+ else:
+ file["name"], sep, file["id"] = \
+ file["filename"].rpartition("-")
+
yield Message.Url, url, file
def fetch_album(self, album_id):
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index bca7e4d..1f24593 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -174,6 +174,20 @@ class MangadexListExtractor(MangadexExtractor):
yield Message.Queue, url, data
+class MangadexAuthorExtractor(MangadexExtractor):
+ """Extractor for mangadex authors"""
+ subcategory = "author"
+ pattern = BASE_PATTERN + r"/author/([0-9a-f-]+)"
+ example = ("https://mangadex.org/author"
+ "/01234567-89ab-cdef-0123-456789abcdef/NAME")
+
+ def items(self):
+ for manga in self.api.manga_author(self.uuid):
+ manga["_extractor"] = MangadexMangaExtractor
+ url = "{}/title/{}".format(self.root, manga["id"])
+ yield Message.Queue, url, manga
+
+
class MangadexAPI():
"""Interface for the MangaDex API v5
@@ -195,6 +209,10 @@ class MangadexAPI():
def athome_server(self, uuid):
return self._call("/at-home/server/" + uuid)
+ def author(self, uuid, manga=False):
+ params = {"includes[]": ("manga",)} if manga else None
+ return self._call("/author/" + uuid, params)["data"]
+
def chapter(self, uuid):
params = {"includes[]": ("scanlation_group",)}
return self._call("/chapter/" + uuid, params)["data"]
@@ -210,6 +228,10 @@ class MangadexAPI():
params = {"includes[]": ("artist", "author")}
return self._call("/manga/" + uuid, params)["data"]
+ def manga_author(self, uuid_author):
+ params = {"authorOrArtist": uuid_author}
+ return self._pagination("/manga", params)
+
def manga_feed(self, uuid):
order = "desc" if self.extractor.config("chapter-reverse") else "asc"
params = {
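
The new author extractor resolves an author UUID to manga by filtering the /manga endpoint with authorOrArtist and queueing each result. Hitting the public MangaDex API v5 directly, that lookup is roughly the following (offset pagination assumed, as the API documents):

import requests

def manga_by_author(author_uuid, limit=100):
    """Yield manga objects for an author/artist UUID (MangaDex API v5)."""
    url = "https://api.mangadex.org/manga"
    params = {"authorOrArtist": author_uuid, "limit": limit, "offset": 0}
    while True:
        data = requests.get(url, params=params).json()
        yield from data["data"]
        params["offset"] += limit
        if params["offset"] >= data["total"]:
            return
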
diff --git a/gallery_dl/extractor/mangakakalot.py b/gallery_dl/extractor/mangakakalot.py
index 0183b25..9fc8681 100644
--- a/gallery_dl/extractor/mangakakalot.py
+++ b/gallery_dl/extractor/mangakakalot.py
@@ -19,7 +19,7 @@ BASE_PATTERN = r"(?:https?://)?(?:ww[\dw]?\.)?mangakakalot\.tv"
class MangakakalotBase():
"""Base class for mangakakalot extractors"""
category = "mangakakalot"
- root = "https://ww6.mangakakalot.tv"
+ root = "https://ww8.mangakakalot.tv"
class MangakakalotChapterExtractor(MangakakalotBase, ChapterExtractor):
@@ -40,7 +40,7 @@ class MangakakalotChapterExtractor(MangakakalotBase, ChapterExtractor):
match = re.match(
r"(?:[Vv]ol\. *(\d+) )?"
r"[Cc]hapter *([^:]*)"
- r"(?:: *(.+))?", info)
+ r"(?:: *(.+))?", info or "")
volume, chapter, title = match.groups() if match else ("", "", info)
chapter, sep, minor = chapter.partition(".")
@@ -86,7 +86,7 @@ class MangakakalotMangaExtractor(MangakakalotBase, MangaExtractor):
data["chapter"] = text.parse_int(chapter)
data["chapter_minor"] = sep + minor
- if url.startswith("/"):
+ if url[0] == "/":
url = self.root + url
results.append((url, data.copy()))
return results
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 2928573..61ffdee 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -14,6 +14,9 @@ from ..cache import cache
import itertools
import re
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?newgrounds\.com"
+USER_PATTERN = r"(?:https?://)?([\w-]+)\.newgrounds\.com"
+
class NewgroundsExtractor(Extractor):
"""Base class for newgrounds extractors"""
@@ -93,7 +96,7 @@ class NewgroundsExtractor(Extractor):
def posts(self):
"""Return URLs of all relevant post pages"""
- return self._pagination(self._path)
+ return self._pagination(self._path, self.groups[1])
def metadata(self):
"""Return general metadata"""
@@ -334,10 +337,10 @@ class NewgroundsExtractor(Extractor):
for fmt in formats:
yield fmt[1][0]["src"]
- def _pagination(self, kind):
+ def _pagination(self, kind, pnum=1):
url = "{}/{}".format(self.user_root, kind)
params = {
- "page": 1,
+ "page": text.parse_int(pnum, 1),
"isAjaxRequest": "1",
}
headers = {
@@ -400,8 +403,7 @@ class NewgroundsImageExtractor(NewgroundsExtractor):
class NewgroundsMediaExtractor(NewgroundsExtractor):
"""Extractor for a media file from newgrounds.com"""
subcategory = "media"
- pattern = (r"(?:https?://)?(?:www\.)?newgrounds\.com"
- r"(/(?:portal/view|audio/listen)/\d+)")
+ pattern = BASE_PATTERN + r"(/(?:portal/view|audio/listen)/\d+)"
example = "https://www.newgrounds.com/portal/view/12345"
def __init__(self, match):
@@ -416,35 +418,35 @@ class NewgroundsMediaExtractor(NewgroundsExtractor):
class NewgroundsArtExtractor(NewgroundsExtractor):
"""Extractor for all images of a newgrounds user"""
subcategory = _path = "art"
- pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/art/?$"
+ pattern = USER_PATTERN + r"/art(?:(?:/page/|/?\?page=)(\d+))?/?$"
example = "https://USER.newgrounds.com/art"
class NewgroundsAudioExtractor(NewgroundsExtractor):
"""Extractor for all audio submissions of a newgrounds user"""
subcategory = _path = "audio"
- pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/audio/?$"
+ pattern = USER_PATTERN + r"/audio(?:(?:/page/|/?\?page=)(\d+))?/?$"
example = "https://USER.newgrounds.com/audio"
class NewgroundsMoviesExtractor(NewgroundsExtractor):
"""Extractor for all movies of a newgrounds user"""
subcategory = _path = "movies"
- pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/movies/?$"
+ pattern = USER_PATTERN + r"/movies(?:(?:/page/|/?\?page=)(\d+))?/?$"
example = "https://USER.newgrounds.com/movies"
class NewgroundsGamesExtractor(NewgroundsExtractor):
"""Extractor for a newgrounds user's games"""
subcategory = _path = "games"
- pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/games/?$"
+ pattern = USER_PATTERN + r"/games(?:(?:/page/|/?\?page=)(\d+))?/?$"
example = "https://USER.newgrounds.com/games"
class NewgroundsUserExtractor(NewgroundsExtractor):
"""Extractor for a newgrounds user profile"""
subcategory = "user"
- pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/?$"
+ pattern = USER_PATTERN + r"/?$"
example = "https://USER.newgrounds.com"
def initialize(self):
@@ -464,25 +466,22 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor):
"""Extractor for posts favorited by a newgrounds user"""
subcategory = "favorite"
directory_fmt = ("{category}", "{user}", "Favorites")
- pattern = (r"(?:https?://)?([\w-]+)\.newgrounds\.com"
- r"/favorites(?!/following)(?:/(art|audio|movies))?/?")
+ pattern = (USER_PATTERN + r"/favorites(?!/following)(?:/(art|audio|movies)"
+ r"(?:(?:/page/|/?\?page=)(\d+))?)?")
example = "https://USER.newgrounds.com/favorites"
- def __init__(self, match):
- NewgroundsExtractor.__init__(self, match)
- self.kind = match.group(2)
-
def posts(self):
- if self.kind:
- return self._pagination(self.kind)
+ _, kind, pnum = self.groups
+ if kind:
+ return self._pagination_favorites(kind, pnum)
return itertools.chain.from_iterable(
- self._pagination(k) for k in ("art", "audio", "movies")
+ self._pagination_favorites(k) for k in ("art", "audio", "movies")
)
- def _pagination(self, kind):
+ def _pagination_favorites(self, kind, pnum=1):
url = "{}/favorites/{}".format(self.user_root, kind)
params = {
- "page": 1,
+ "page": text.parse_int(pnum, 1),
"isAjaxRequest": "1",
}
headers = {
@@ -514,12 +513,13 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor):
class NewgroundsFollowingExtractor(NewgroundsFavoriteExtractor):
"""Extractor for a newgrounds user's favorited users"""
subcategory = "following"
- pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/favorites/(following)"
+ pattern = USER_PATTERN + r"/favorites/(following)"
example = "https://USER.newgrounds.com/favorites/following"
def items(self):
+ _, kind, pnum = self.groups
data = {"_extractor": NewgroundsUserExtractor}
- for url in self._pagination(self.kind):
+ for url in self._pagination_favorites(kind, pnum):
yield Message.Queue, url, data
@staticmethod
@@ -534,13 +534,12 @@ class NewgroundsSearchExtractor(NewgroundsExtractor):
"""Extractor for newgrounds.com search reesults"""
subcategory = "search"
directory_fmt = ("{category}", "search", "{search_tags}")
- pattern = (r"(?:https?://)?(?:www\.)?newgrounds\.com"
- r"/search/conduct/([^/?#]+)/?\?([^#]+)")
+ pattern = BASE_PATTERN + r"/search/conduct/([^/?#]+)/?\?([^#]+)"
example = "https://www.newgrounds.com/search/conduct/art?terms=QUERY"
def __init__(self, match):
NewgroundsExtractor.__init__(self, match)
- self._path, query = match.groups()
+ self._path, query = self.groups
self.query = text.parse_query(query)
def posts(self):
@@ -550,19 +549,20 @@ class NewgroundsSearchExtractor(NewgroundsExtractor):
for s in suitabilities.split(",")}
self.request(self.root + "/suitabilities",
method="POST", data=data)
- return self._pagination("/search/conduct/" + self._path, self.query)
+ return self._pagination_search(
+ "/search/conduct/" + self._path, self.query)
def metadata(self):
return {"search_tags": self.query.get("terms", "")}
- def _pagination(self, path, params):
+ def _pagination_search(self, path, params):
url = self.root + path
+ params["inner"] = "1"
+ params["page"] = text.parse_int(params.get("page"), 1)
headers = {
"Accept": "application/json, text/javascript, */*; q=0.01",
"X-Requested-With": "XMLHttpRequest",
}
- params["inner"] = "1"
- params["page"] = 1
while True:
data = self.request(url, params=params, headers=headers).json()
diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py
index 8c7ffe5..851f663 100644
--- a/gallery_dl/extractor/nozomi.py
+++ b/gallery_dl/extractor/nozomi.py
@@ -63,7 +63,8 @@ class NozomiExtractor(Extractor):
yield Message.Directory, post
for post["num"], image in enumerate(images, 1):
post["filename"] = post["dataid"] = did = image["dataid"]
- post["is_video"] = video = bool(image.get("is_video"))
+ post["is_video"] = video = \
+ True if image.get("is_video") else False
ext = image["type"]
if video:
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index d47ffa2..0b64ea3 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -56,6 +56,7 @@ class PatreonExtractor(Extractor):
text.nameext_from_url(name, post)
if text.ext_from_url(url) == "m3u8":
url = "ytdl:" + url
+ post["_ytdl_manifest"] = "hls"
post["extension"] = "mp4"
yield Message.Url, url, post
else:
@@ -310,7 +311,7 @@ class PatreonCreatorExtractor(PatreonExtractor):
subcategory = "creator"
pattern = (r"(?:https?://)?(?:www\.)?patreon\.com"
r"/(?!(?:home|join|posts|login|signup)(?:$|[/?#]))"
- r"([^/?#]+)(?:/posts)?/?(?:\?([^#]+))?")
+ r"(?:c/)?([^/?#]+)(?:/posts)?/?(?:\?([^#]+))?")
example = "https://www.patreon.com/USER"
def posts(self):
@@ -340,9 +341,9 @@ class PatreonCreatorExtractor(PatreonExtractor):
user_id = query.get("u")
if user_id:
- url = "{}/user/posts?u={}".format(self.root, user_id)
+ url = "{}/user?u={}".format(self.root, user_id)
else:
- url = "{}/{}/posts".format(self.root, creator)
+ url = "{}/{}".format(self.root, creator)
page = self.request(url, notfound="creator").text
try:
diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py
index 8c04ed5..499c579 100644
--- a/gallery_dl/extractor/pinterest.py
+++ b/gallery_dl/extractor/pinterest.py
@@ -18,8 +18,8 @@ BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.[\w.]+"
class PinterestExtractor(Extractor):
"""Base class for pinterest extractors"""
category = "pinterest"
- filename_fmt = "{category}_{id}{media_id:?_//}.{extension}"
- archive_fmt = "{id}{media_id}"
+ filename_fmt = "{category}_{id}{media_id|page_id:?_//}.{extension}"
+ archive_fmt = "{id}{media_id|page_id}"
root = "https://www.pinterest.com"
def _init(self):
@@ -30,12 +30,12 @@ class PinterestExtractor(Extractor):
self.root = text.ensure_http_scheme(domain)
self.api = PinterestAPI(self)
+ self.stories = self.config("stories", True)
+ self.videos = self.config("videos", True)
def items(self):
data = self.metadata()
- videos = self.config("videos", True)
- yield Message.Directory, data
for pin in self.pins():
if isinstance(pin, tuple):
@@ -43,40 +43,35 @@ class PinterestExtractor(Extractor):
yield Message.Queue, url, data
continue
+ try:
+ files = self._extract_files(pin)
+ except Exception as exc:
+ self.log.debug("", exc_info=exc)
+ self.log.warning(
+ "%s: Error when extracting download URLs (%s: %s)",
+ pin.get("id"), exc.__class__.__name__, exc)
+ continue
+
pin.update(data)
+ pin["count"] = len(files)
- carousel_data = pin.get("carousel_data")
- if carousel_data:
- pin["count"] = len(carousel_data["carousel_slots"])
- for num, slot in enumerate(carousel_data["carousel_slots"], 1):
- slot["media_id"] = slot.pop("id")
- pin.update(slot)
- pin["num"] = num
- size, image = next(iter(slot["images"].items()))
- url = image["url"].replace("/" + size + "/", "/originals/")
- yield Message.Url, url, text.nameext_from_url(url, pin)
-
- else:
- try:
- media = self._media_from_pin(pin)
- except Exception:
- self.log.debug("Unable to fetch download URL for pin %s",
- pin.get("id"))
- continue
+ yield Message.Directory, pin
+ for pin["num"], file in enumerate(files, 1):
+ url = file["url"]
+ text.nameext_from_url(url, pin)
+ pin.update(file)
- if videos or media.get("duration") is None:
- pin.update(media)
- pin["num"] = pin["count"] = 1
+ if "media_id" not in file:
pin["media_id"] = ""
+ if "page_id" not in file:
+ pin["page_id"] = ""
- url = media["url"]
- text.nameext_from_url(url, pin)
+ if pin["extension"] == "m3u8":
+ url = "ytdl:" + url
+ pin["_ytdl_manifest"] = "hls"
+ pin["extension"] = "mp4"
- if pin["extension"] == "m3u8":
- url = "ytdl:" + url
- pin["extension"] = "mp4"
-
- yield Message.Url, url, pin
+ yield Message.Url, url, pin
def metadata(self):
"""Return general metadata"""
@@ -84,26 +79,108 @@ class PinterestExtractor(Extractor):
def pins(self):
"""Return all relevant pin objects"""
- @staticmethod
- def _media_from_pin(pin):
+ def _extract_files(self, pin):
+ story_pin_data = pin.get("story_pin_data")
+ if story_pin_data and self.stories:
+ return self._extract_story(pin, story_pin_data)
+
+ carousel_data = pin.get("carousel_data")
+ if carousel_data:
+ return self._extract_carousel(pin, carousel_data)
+
videos = pin.get("videos")
- if videos:
- video_formats = videos["video_list"]
+ if videos and self.videos:
+ return (self._extract_video(videos),)
- for fmt in ("V_HLSV4", "V_HLSV3_WEB", "V_HLSV3_MOBILE"):
- if fmt in video_formats:
- media = video_formats[fmt]
- break
- else:
- media = max(video_formats.values(),
- key=lambda x: x.get("width", 0))
+ try:
+ return (pin["images"]["orig"],)
+ except Exception:
+ self.log.debug("%s: No files found", pin.get("id"))
+ return ()
+
+ def _extract_story(self, pin, story):
+ files = []
+ story_id = story.get("id")
+
+ for page in story["pages"]:
+ page_id = page.get("id")
+
+ for block in page["blocks"]:
+ type = block.get("type")
+
+ if type == "story_pin_image_block":
+ if 1 == len(page["blocks"]) == len(story["pages"]):
+ try:
+ media = pin["images"]["orig"]
+ except Exception:
+ media = self._extract_image(page, block)
+ else:
+ media = self._extract_image(page, block)
+
+ elif type == "story_pin_video_block":
+ video = block["video"]
+ media = self._extract_video(video)
+ media["media_id"] = video.get("id") or ""
+
+ elif type == "story_pin_paragraph_block":
+ media = {"url": "text:" + block["text"],
+ "extension": "txt",
+ "media_id": block.get("id")}
+
+ else:
+ self.log.warning("%s: Unsupported story block '%s'",
+ pin.get("id"), type)
+ continue
- if "V_720P" in video_formats:
- media["_fallback"] = (video_formats["V_720P"]["url"],)
+ media["story_id"] = story_id
+ media["page_id"] = page_id
+ files.append(media)
+
+ return files
+
+ def _extract_carousel(self, pin, carousel_data):
+ files = []
+ for slot in carousel_data["carousel_slots"]:
+ size, image = next(iter(slot["images"].items()))
+ slot["media_id"] = slot.pop("id")
+ slot["url"] = image["url"].replace(
+ "/" + size + "/", "/originals/", 1)
+ files.append(slot)
+ return files
+
+ def _extract_image(self, page, block):
+ sig = block.get("image_signature") or page["image_signature"]
+ url_base = "https://i.pinimg.com/originals/{}/{}/{}/{}.".format(
+ sig[0:2], sig[2:4], sig[4:6], sig)
+ url_jpg = url_base + "jpg"
+ url_png = url_base + "png"
+ url_webp = url_base + "webp"
- return media
+ try:
+ media = block["image"]["images"]["originals"]
+ except Exception:
+ media = {"url": url_jpg, "_fallback": (url_png, url_webp,)}
- return pin["images"]["orig"]
+ if media["url"] == url_jpg:
+ media["_fallback"] = (url_png, url_webp,)
+ else:
+ media["_fallback"] = (url_jpg, url_png, url_webp,)
+ media["media_id"] = sig
+
+ return media
+
+ def _extract_video(self, video):
+ video_formats = video["video_list"]
+ for fmt in ("V_HLSV4", "V_HLSV3_WEB", "V_HLSV3_MOBILE"):
+ if fmt in video_formats:
+ media = video_formats[fmt]
+ break
+ else:
+ media = max(video_formats.values(),
+ key=lambda x: x.get("width", 0))
+ if "V_720P" in video_formats:
+ media["_fallback"] = (video_formats["V_720P"]["url"],)
+ return media
class PinterestPinExtractor(PinterestExtractor):
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index c2d1243..8c6e6d8 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -38,6 +38,7 @@ class PixivExtractor(Extractor):
self.meta_user = self.config("metadata")
self.meta_bookmark = self.config("metadata-bookmark")
self.meta_comments = self.config("comments")
+ self.meta_captions = self.config("captions")
def items(self):
tags = self.config("tags", "japanese")
@@ -76,8 +77,8 @@ class PixivExtractor(Extractor):
detail = self.api.illust_bookmark_detail(work["id"])
work["tags_bookmark"] = [tag["name"] for tag in detail["tags"]
if tag["is_registered"]]
- if self.sanity_workaround and not work.get("caption") and \
- not work.get("_mypixiv"):
+ if self.meta_captions and not work.get("caption") and \
+ not work.get("_mypixiv") and not work.get("_ajax"):
body = self._request_ajax("/illust/" + str(work["id"]))
if body:
work["caption"] = text.unescape(body["illustComment"])
@@ -108,10 +109,10 @@ class PixivExtractor(Extractor):
if self.load_ugoira:
try:
return self._extract_ugoira(work)
- except exception.StopExtraction as exc:
+ except Exception as exc:
self.log.warning(
- "Unable to retrieve Ugoira metatdata (%s - %s)",
- work["id"], exc.message)
+ "%s: Unable to retrieve Ugoira metatdata (%s - %s)",
+ work["id"], exc.__class__.__name__, exc)
elif work["page_count"] == 1:
url = meta_single_page["original_image_url"]
@@ -186,6 +187,7 @@ class PixivExtractor(Extractor):
return None
def _extract_ajax(self, work, body):
+ work["_ajax"] = True
url = self._extract_ajax_url(body)
if not url:
return ()
@@ -243,12 +245,12 @@ class PixivExtractor(Extractor):
original = body["urls"]["original"]
if original:
return original
- except KeyError:
+ except Exception:
pass
try:
square1200 = body["userIllusts"][body["id"]]["url"]
- except KeyError:
+ except Exception:
return
parts = square1200.rpartition("_p0")[0].split("/")
del parts[3:5]
@@ -293,9 +295,6 @@ class PixivExtractor(Extractor):
"x_restrict" : 0,
}
- def _web_to_mobile(self, work):
- return work
-
def works(self):
"""Return an iterable containing all relevant 'work' objects"""
@@ -334,15 +333,17 @@ class PixivUserExtractor(PixivExtractor):
class PixivArtworksExtractor(PixivExtractor):
"""Extractor for artworks of a pixiv user"""
subcategory = "artworks"
+ _warning = True
pattern = (BASE_PATTERN + r"/(?:"
r"(?:en/)?users/(\d+)/(?:artworks|illustrations|manga)"
r"(?:/([^/?#]+))?/?(?:$|[?#])"
r"|member_illust\.php\?id=(\d+)(?:&([^#]+))?)")
example = "https://www.pixiv.net/en/users/12345/artworks"
- def __init__(self, match):
- PixivExtractor.__init__(self, match)
- u1, t1, u2, t2 = match.groups()
+ def _init(self):
+ PixivExtractor._init(self)
+
+ u1, t1, u2, t2 = self.groups
if t1:
t1 = text.unquote(t1)
elif t2:
@@ -350,6 +351,14 @@ class PixivArtworksExtractor(PixivExtractor):
self.user_id = u1 or u2
self.tag = t1 or t2
+ if self.sanity_workaround:
+ self.cookies_domain = d = ".pixiv.net"
+ self._init_cookies()
+ if self._warning and not self.cookies.get("PHPSESSID", domain=d):
+ PixivArtworksExtractor._warning = False
+ self.log.warning("No 'PHPSESSID' cookie set. Can detect only "
+ "non R-18 'sanity_level' works.")
+
def metadata(self):
if self.config("metadata"):
self.api.user_detail(self.user_id)
@@ -358,6 +367,19 @@ class PixivArtworksExtractor(PixivExtractor):
def works(self):
works = self.api.user_illusts(self.user_id)
+ if self.sanity_workaround:
+ body = self._request_ajax(
+ "/user/{}/profile/all".format(self.user_id))
+ try:
+ ajax_ids = list(map(int, body["illusts"]))
+ ajax_ids.extend(map(int, body["manga"]))
+ ajax_ids.sort()
+ except Exception as exc:
+ self.log.warning("Unable to collect artwork IDs using AJAX "
+ "API (%s: %s)", exc.__class__.__name__, exc)
+ else:
+ works = self._extend_sanity(works, ajax_ids)
+
if self.tag:
tag = self.tag.lower()
works = (
@@ -367,6 +389,35 @@ class PixivArtworksExtractor(PixivExtractor):
return works
+ def _extend_sanity(self, works, ajax_ids):
+ user = {"id": 1}
+ index = len(ajax_ids) - 1
+
+ for work in works:
+ while index >= 0:
+ work_id = work["id"]
+ ajax_id = ajax_ids[index]
+
+ if ajax_id == work_id:
+ index -= 1
+ break
+
+ elif ajax_id > work_id:
+ index -= 1
+ self.log.debug("Inserting work %s", ajax_id)
+ yield self._make_work(ajax_id, self.sanity_url, user)
+
+ else: # ajax_id < work_id
+ break
+
+ yield work
+
+ while index >= 0:
+ ajax_id = ajax_ids[index]
+ self.log.debug("Inserting work %s", ajax_id)
+ yield self._make_work(ajax_id, self.sanity_url, user)
+ index -= 1
+
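
_extend_sanity above is a two-stream merge: the mobile API yields works newest-first, while ajax_ids is sorted ascending and consumed from its tail, so both streams run in descending ID order. IDs the AJAX profile lists but the API omits (typically works hidden by the sanity_level filter) get placeholder entries synthesized via _make_work. The merge logic condensed to plain integers:

def merge_desc(api_ids, ajax_ids):
    """Merge newest-first api_ids with ascending ajax_ids, each ID once."""
    index = len(ajax_ids) - 1
    for work_id in api_ids:                     # descending
        while index >= 0 and ajax_ids[index] >= work_id:
            if ajax_ids[index] > work_id:       # known only via AJAX
                yield ajax_ids[index]           # -> placeholder work
            index -= 1
        yield work_id
    while index >= 0:                           # AJAX-only tail (oldest)
        yield ajax_ids[index]
        index -= 1

# merge_desc([10, 7, 3], [3, 5, 7, 12]) -> 12, 10, 7, 5, 3
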
class PixivAvatarExtractor(PixivExtractor):
"""Extractor for pixiv avatars"""
diff --git a/gallery_dl/extractor/postmill.py b/gallery_dl/extractor/postmill.py
index 29b351b..8877175 100644
--- a/gallery_dl/extractor/postmill.py
+++ b/gallery_dl/extractor/postmill.py
@@ -50,7 +50,7 @@ class PostmillExtractor(BaseExtractor):
forum = match.group(1)
id = int(match.group(2))
- is_text_post = url.startswith("/")
+ is_text_post = (url[0] == "/")
is_image_post = self._search_image_tag(page) is not None
data = {
"title": title,
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index ce602f6..8577e74 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -31,6 +31,7 @@ class RedditExtractor(Extractor):
parentdir = self.config("parent-directory")
max_depth = self.config("recursion", 0)
previews = self.config("previews", True)
+ embeds = self.config("embeds", True)
videos = self.config("videos", True)
if videos:
@@ -100,7 +101,7 @@ class RedditExtractor(Extractor):
for comment in comments:
html = comment["body_html"] or ""
href = (' href="' in html)
- media = ("media_metadata" in comment)
+ media = (embeds and "media_metadata" in comment)
if media or href:
comment["date"] = text.parse_timestamp(
@@ -211,8 +212,9 @@ class RedditExtractor(Extractor):
def _extract_video_dash(self, submission):
submission["_ytdl_extra"] = {"title": submission["title"]}
try:
- return (submission["secure_media"]["reddit_video"]["dash_url"] +
- "#__youtubedl_smuggle=%7B%22to_generic%22%3A+1%7D")
+ url = submission["secure_media"]["reddit_video"]["dash_url"]
+ submission["_ytdl_manifest"] = "dash"
+ return url
except Exception:
return submission["url"]
diff --git a/gallery_dl/extractor/scrolller.py b/gallery_dl/extractor/scrolller.py
new file mode 100644
index 0000000..9f9f0c4
--- /dev/null
+++ b/gallery_dl/extractor/scrolller.py
@@ -0,0 +1,227 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://scrolller.com/"""
+
+from .common import Extractor, Message
+from .. import text, util, exception
+from ..cache import cache
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?scrolller\.com"
+
+
+class ScrolllerExtractor(Extractor):
+ """Base class for scrolller extractors"""
+ category = "scrolller"
+ root = "https://scrolller.com"
+ directory_fmt = ("{category}", "{subredditTitle}")
+ filename_fmt = "{id}{title:? //}.{extension}"
+ archive_fmt = "{id}"
+ request_interval = (0.5, 1.5)
+
+ def _init(self):
+ self.auth_token = None
+
+ def items(self):
+ self.login()
+
+ for post in self.posts():
+
+ src = max(post["mediaSources"], key=self._sort_key)
+ post.update(src)
+ url = src["url"]
+ text.nameext_from_url(url, post)
+
+ yield Message.Directory, post
+ yield Message.Url, url, post
+
+ def posts(self):
+ return ()
+
+ def login(self):
+ username, password = self._get_auth_info()
+ if username:
+ self.auth_token = self._login_impl(username, password)
+
+ @cache(maxage=28*86400, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+
+ variables = {
+ "username": username,
+ "password": password,
+ }
+
+ try:
+ data = self._request_graphql("LoginQuery", variables)
+ except exception.HttpError as exc:
+ if exc.status == 403:
+ raise exception.AuthenticationError()
+ raise
+
+ return data["login"]["token"]
+
+ def _request_graphql(self, opname, variables):
+ url = "https://api.scrolller.com/api/v2/graphql"
+ headers = {
+ "Content-Type" : "text/plain;charset=UTF-8",
+ "Origin" : self.root,
+ "Sec-Fetch-Dest": "empty",
+ "Sec-Fetch-Mode": "cors",
+ "Sec-Fetch-Site": "same-site",
+ }
+ data = {
+ "query" : QUERIES[opname],
+ "variables" : variables,
+ "authorization": self.auth_token,
+ }
+ return self.request(
+ url, method="POST", headers=headers, data=util.json_dumps(data),
+ ).json()["data"]
+
+ def _pagination(self, opname, variables):
+ while True:
+ data = self._request_graphql(opname, variables)
+
+ while "items" not in data:
+ data = data.popitem()[1]
+ yield from data["items"]
+
+ if not data["iterator"]:
+ return
+ variables["iterator"] = data["iterator"]
+
+ def _sort_key(self, src):
+ return src["width"], not src["isOptimized"]
+
+
+class ScrolllerSubredditExtractor(ScrolllerExtractor):
+ """Extractor for media from a scrolller subreddit"""
+ subcategory = "subreddit"
+ pattern = BASE_PATTERN + r"(/r/[^/?#]+)(?:/?\?([^#]+))?"
+ example = "https://scrolller.com/r/SUBREDDIT"
+
+ def posts(self):
+ url, query = self.groups
+ filter = None
+
+ if query:
+ params = text.parse_query(query)
+ if "filter" in params:
+ filter = params["filter"].upper().rstrip("S")
+
+ variables = {
+ "url" : url,
+ "iterator" : None,
+ "filter" : filter,
+ "hostsDown": None,
+ }
+ return self._pagination("SubredditQuery", variables)
+
+
+class ScrolllerFollowingExtractor(ScrolllerExtractor):
+ """Extractor for followed scrolller subreddits"""
+ subcategory = "following"
+ pattern = BASE_PATTERN + r"/following"
+ example = "https://scrolller.com/following"
+
+ def items(self):
+ self.login()
+
+ if not self.auth_token:
+ raise exception.AuthorizationError("Login required")
+
+ variables = {
+ "iterator" : None,
+ "hostsDown": None,
+ }
+
+ for subreddit in self._pagination("FollowingQuery", variables):
+ url = self.root + subreddit["url"]
+ subreddit["_extractor"] = ScrolllerSubredditExtractor
+ yield Message.Queue, url, subreddit
+
+
+class ScrolllerPostExtractor(ScrolllerExtractor):
+ """Extractor for media from a single scrolller post"""
+ subcategory = "post"
+ pattern = BASE_PATTERN + r"/(?!r/|following$)([^/?#]+)"
+ example = "https://scrolller.com/title-slug-a1b2c3d4f5"
+
+ def posts(self):
+ url = "{}/{}".format(self.root, self.groups[0])
+ page = self.request(url).text
+ data = util.json_loads(text.extr(
+ page, '<script>window.scrolllerConfig="', '"</script>')
+ .replace('\\"', '"'))
+ return (data["item"],)
+
+
+QUERIES = {
+
+ "SubredditQuery": """\
+query SubredditQuery(
+ $url: String!
+ $filter: SubredditPostFilter
+ $iterator: String
+) {
+ getSubreddit(
+ url: $url
+ ) {
+ children(
+ limit: 50
+ iterator: $iterator
+ filter: $filter
+ disabledHosts: null
+ ) {
+ iterator items {
+ __typename id url title subredditId subredditTitle
+ subredditUrl redditPath isNsfw albumUrl hasAudio
+ fullLengthSource gfycatSource redgifsSource ownerAvatar
+ username displayName isPaid tags isFavorite
+ mediaSources { url width height isOptimized }
+ blurredMediaSources { url width height isOptimized }
+ }
+ }
+ }
+}
+""",
+
+ "FollowingQuery": """\
+query FollowingQuery(
+ $iterator: String
+) {
+ getFollowing(
+ limit: 10
+ iterator: $iterator
+ ) {
+ iterator items {
+ __typename id url title secondaryTitle description createdAt isNsfw
+ subscribers isComplete itemCount videoCount pictureCount albumCount
+ isPaid username tags isFollowing
+ banner { url width height isOptimized }
+ }
+ }
+}
+""",
+
+ "LoginQuery": """\
+query LoginQuery(
+ $username: String!,
+ $password: String!
+) {
+ login(
+ username: $username,
+ password: $password
+ ) {
+ username token expiresAt isAdmin status isPremium
+ }
+}
+""",
+
+}
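
Two pieces of the new extractor above are easy to misread. _pagination() unwraps the GraphQL envelope by popping single-key wrapper objects until it reaches the level carrying "items" and "iterator", and items() picks the widest media source, preferring an un-optimized original on ties. A standalone sketch of both, with a response shape assumed from SubredditQuery:

# Hypothetical page of results; nesting mirrors the SubredditQuery above
# (the surrounding {"data": ...} layer was already stripped by the caller).
data = {"getSubreddit": {"children": {
    "iterator": "abc123",
    "items": [{"id": 1}, {"id": 2}],
}}}

# Descend through single-key wrappers until the paginated payload appears.
while "items" not in data:
    data = data.popitem()[1]
assert data["iterator"] == "abc123"
assert [i["id"] for i in data["items"]] == [1, 2]

# Media-source selection as in ScrolllerExtractor.items()/_sort_key():
# widest wins; at equal width, an un-optimized source sorts higher.
sources = [
    {"url": "a", "width": 720,  "isOptimized": False},
    {"url": "b", "width": 1080, "isOptimized": True},
    {"url": "c", "width": 1080, "isOptimized": False},
]
best = max(sources, key=lambda s: (s["width"], not s["isOptimized"]))
assert best["url"] == "c"
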
diff --git a/gallery_dl/extractor/telegraph.py b/gallery_dl/extractor/telegraph.py
index dd5988f..468840b 100644
--- a/gallery_dl/extractor/telegraph.py
+++ b/gallery_dl/extractor/telegraph.py
@@ -49,7 +49,7 @@ class TelegraphGalleryExtractor(GalleryExtractor):
url, pos = text.extract(figure, 'src="', '"')
if url.startswith("/embed/"):
continue
- elif url.startswith("/"):
+ elif url[0] == "/":
url = self.root + url
caption, pos = text.extract(figure, "<figcaption>", "<", pos)
num += 1
diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py
index bce661a..b196aeb 100644
--- a/gallery_dl/extractor/tsumino.py
+++ b/gallery_dl/extractor/tsumino.py
@@ -148,8 +148,10 @@ class TsuminoSearchExtractor(TsuminoBase, Extractor):
data["PageNumber"] += 1
def _parse(self, query):
+ if not query:
+ return {}
try:
- if query.startswith("?"):
+ if query[0] == "?":
return self._parse_simple(query)
return self._parse_jsurl(query)
except Exception as exc:
@@ -187,8 +189,6 @@ class TsuminoSearchExtractor(TsuminoBase, Extractor):
Example: ~(name~'John*20Doe~age~42~children~(~'Mary~'Bill))
Ref: https://github.com/Sage/jsurl
"""
- if not data:
- return {}
i = 0
imax = len(data)
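
With the empty-string guard moved up into _parse(), both branches below it can safely index query[0], and _parse_jsurl() no longer needs its own check. A sketch of the resulting dispatch, with stand-in parsers whose names and behavior are assumed for illustration:

from urllib.parse import parse_qs

def parse_simple(query):
    # Stand-in for _parse_simple: a plain "?key=value" query string.
    return {k: v[0] for k, v in parse_qs(query[1:]).items()}

def parse_jsurl(query):
    # Stand-in for _parse_jsurl (see github.com/Sage/jsurl).
    raise NotImplementedError

def parse(query):
    if not query:            # guard first: both branches index query[0]
        return {}
    if query[0] == "?":
        return parse_simple(query)
    return parse_jsurl(query)

assert parse("") == {}
assert parse("?Tags=foo") == {"Tags": "foo"}
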
diff --git a/gallery_dl/extractor/urlgalleries.py b/gallery_dl/extractor/urlgalleries.py
index b21709a..f7ce44b 100644
--- a/gallery_dl/extractor/urlgalleries.py
+++ b/gallery_dl/extractor/urlgalleries.py
@@ -7,7 +7,7 @@
"""Extractors for https://urlgalleries.net/"""
from .common import GalleryExtractor, Message
-from .. import text
+from .. import text, exception
 
 
class UrlgalleriesGalleryExtractor(GalleryExtractor):
@@ -16,27 +16,31 @@ class UrlgalleriesGalleryExtractor(GalleryExtractor):
root = "urlgalleries.net"
request_interval = (0.5, 1.0)
pattern = r"(?:https?://)(?:(\w+)\.)?urlgalleries\.net/(?:[\w-]+-)?(\d+)"
- example = "https://blog.urlgalleries.net/gallery-12345/TITLE"
+ example = "https://BLOG.urlgalleries.net/gallery-12345/TITLE"
 
- def __init__(self, match):
- self.blog, self.gallery_id = match.groups()
+ def items(self):
+ blog, self.gallery_id = self.groups
url = "https://{}.urlgalleries.net/porn-gallery-{}/?a=10000".format(
- self.blog, self.gallery_id)
- GalleryExtractor.__init__(self, match, url)
+ blog, self.gallery_id)
+
+ with self.request(url, allow_redirects=False, fatal=...) as response:
+ if 300 <= response.status_code < 500:
+ if response.headers.get("location", "").endswith(
+ "/not_found_adult.php"):
+ raise exception.NotFoundError("gallery")
+ raise exception.HttpError(None, response)
+ page = response.text
- def items(self):
- page = self.request(self.gallery_url).text
imgs = self.images(page)
data = self.metadata(page)
data["count"] = len(imgs)
- del page
- root = "https://{}.urlgalleries.net".format(self.blog)
+ root = "https://{}.urlgalleries.net".format(blog)
yield Message.Directory, data
for data["num"], img in enumerate(imgs, 1):
- response = self.request(
- root + img, method="HEAD", allow_redirects=False)
- yield Message.Queue, response.headers["Location"], data
+ page = self.request(root + img).text
+ url = text.extr(page, "window.location.href = '", "'")
+ yield Message.Queue, url, data
 
def metadata(self, page):
extr = text.extract_from(page)
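
Instead of issuing a HEAD request per image and reading the Location header, the extractor now fetches each image page and pulls the target out of an inline JavaScript redirect. A minimal sketch of that extraction against a hypothetical page body (text.extr returns the substring between two delimiters):

# Hypothetical image-page body; the real page carries more markup.
page = ("<script>window.location.href = "
        "'https://cdn.example/img/123.jpg';</script>")

prefix = "window.location.href = '"
start = page.index(prefix) + len(prefix)
url = page[start:page.index("'", start)]
assert url == "https://cdn.example/img/123.jpg"
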
diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py
index 95eeafe..ea034a7 100644
--- a/gallery_dl/extractor/vk.py
+++ b/gallery_dl/extractor/vk.py
@@ -24,6 +24,13 @@ class VkExtractor(Extractor):
root = "https://vk.com"
request_interval = (0.5, 1.5)
 
+ def _init(self):
+ self.offset = text.parse_int(self.config("offset"))
+
+ def skip(self, num):
+ self.offset += num
+ return num
+
def items(self):
sub = re.compile(r"/imp[fg]/").sub
sizes = "wzyxrqpo"
@@ -75,7 +82,7 @@ class VkExtractor(Extractor):
"al" : "1",
"direction": "1",
"list" : photos_id,
- "offset" : 0,
+ "offset" : self.offset,
}
while True:
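
The new skip() hook folds skipped results directly into the API offset, so skipped items raise the starting position of the first photo-list request instead of being downloaded and discarded. A stand-in sketch of the pattern, detached from the Extractor base class:

class OffsetPager:
    """Stand-in for the offset/skip logic added to VkExtractor."""

    def __init__(self, offset=0):
        self.offset = offset        # seeded from the "offset" option

    def skip(self, num):
        # Skipped items raise the starting offset of the first request.
        self.offset += num
        return num

pager = OffsetPager()
pager.skip(25)
params = {"al": "1", "offset": pager.offset}
assert params["offset"] == 25       # first request starts after the skips
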
diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py
index 116f557..4eae537 100644
--- a/gallery_dl/extractor/wikimedia.py
+++ b/gallery_dl/extractor/wikimedia.py
@@ -193,7 +193,10 @@ class WikimediaArticleExtractor(WikimediaExtractor):
def __init__(self, match):
WikimediaExtractor.__init__(self, match)
- path = match.group(match.lastindex)
+ path = self.groups[-1]
+ if path[2] == "/":
+ self.root = self.root + "/" + path[:2]
+ path = path[3:]
if path.startswith("wiki/"):
path = path[5:]
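
The added branch handles language-prefixed article paths on multi-language wikis: a two-character language code followed by "/" is moved into the root URL before the usual "wiki/" prefix is stripped. A standalone sketch with assumed inputs:

root = "https://wiki.example"          # hypothetical multi-language wiki
path = "en/wiki/Some_Article"

if path[2] == "/":                     # two-letter code => language prefix
    root = root + "/" + path[:2]
    path = path[3:]
if path.startswith("wiki/"):
    path = path[5:]

assert root == "https://wiki.example/en"
assert path == "Some_Article"
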