| author | 2024-10-25 17:27:30 -0400 |
|---|---|
| committer | 2024-10-25 17:27:30 -0400 |
| commit | fc004701f923bb954a22c7fec2ae8d607e78cb2b (patch) |
| tree | a5bea4ed6447ea43c099131430e3bd6182ee87d7 /gallery_dl/extractor |
| parent | 0db541f524e1774865efebcbe5653e9ad76ea2e8 (diff) |
New upstream version 1.27.7 (tag: upstream/1.27.7)
Diffstat (limited to 'gallery_dl/extractor')
28 files changed, 887 insertions, 334 deletions
```diff
diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py
index f81d2a1..ce1c52a 100644
--- a/gallery_dl/extractor/8chan.py
+++ b/gallery_dl/extractor/8chan.py
@@ -27,12 +27,22 @@ class _8chanExtractor(Extractor):
         Extractor.__init__(self, match)
 
     def _init(self):
-        now = util.datetime_utcnow()
-        domain = self.root.rpartition("/")[2]
-        self.cookies.set(
-            now.strftime("TOS%Y%m%d"), "1", domain=domain)
-        self.cookies.set(
-            (now - timedelta(1)).strftime("TOS%Y%m%d"), "1", domain=domain)
+        tos = self.cookies_tos_name()
+        self.cookies.set(tos, "1", domain=self.root[8:])
+
+    @memcache()
+    def cookies_tos_name(self):
+        url = self.root + "/.static/pages/confirmed.html"
+        headers = {"Referer": self.root + "/.static/pages/disclaimer.html"}
+        response = self.request(url, headers=headers, allow_redirects=False)
+
+        for cookie in response.cookies:
+            if cookie.name.lower().startswith("tos"):
+                self.log.debug("TOS cookie name: %s", cookie.name)
+                return cookie.name
+
+        self.log.error("Unable to determine TOS cookie name")
+        return "TOS20241009"
 
     @memcache()
     def cookies_prepare(self):
@@ -64,16 +74,14 @@ class _8chanThreadExtractor(_8chanExtractor):
                      "{threadId} {subject[:50]}")
     filename_fmt = "{postId}{num:?-//} {filename[:200]}.{extension}"
     archive_fmt = "{boardUri}_{postId}_{num}"
-    pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)"
+    pattern = BASE_PATTERN + r"/([^/?#]+)/(?:res|last)/(\d+)"
     example = "https://8chan.moe/a/res/12345.html"
 
-    def __init__(self, match):
-        _8chanExtractor.__init__(self, match)
-        _, self.board, self.thread = match.groups()
-
     def items(self):
+        _, board, thread = self.groups
+
         # fetch thread data
-        url = "{}/{}/res/{}.".format(self.root, self.board, self.thread)
+        url = "{}/{}/res/{}.".format(self.root, board, thread)
         self.session.headers["Referer"] = url + "html"
         thread = self.request(url + "json").json()
         thread["postId"] = thread["threadId"]
@@ -106,25 +114,22 @@ class _8chanBoardExtractor(_8chanExtractor):
     pattern = BASE_PATTERN + r"/([^/?#]+)/(?:(\d+)\.html)?$"
     example = "https://8chan.moe/a/"
 
-    def __init__(self, match):
-        _8chanExtractor.__init__(self, match)
-        _, self.board, self.page = match.groups()
-
     def items(self):
-        page = text.parse_int(self.page, 1)
-        url = "{}/{}/{}.json".format(self.root, self.board, page)
-        board = self.request(url).json()
-        threads = board["threads"]
+        _, board, pnum = self.groups
+        pnum = text.parse_int(pnum, 1)
+        url = "{}/{}/{}.json".format(self.root, board, pnum)
+        data = self.request(url).json()
+        threads = data["threads"]
 
         while True:
             for thread in threads:
                 thread["_extractor"] = _8chanThreadExtractor
                 url = "{}/{}/res/{}.html".format(
-                    self.root, self.board, thread["threadId"])
+                    self.root, board, thread["threadId"])
                 yield Message.Queue, url, thread
 
-            page += 1
-            if page > board["pageCount"]:
+            pnum += 1
+            if pnum > data["pageCount"]:
                 return
-            url = "{}/{}/{}.json".format(self.root, self.board, page)
+            url = "{}/{}/{}.json".format(self.root, board, pnum)
             threads = self.request(url).json()["threads"]
```
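The 8chan change above stops guessing date-based TOS cookie names and instead reads the live name from the Set-Cookie headers of the confirmation page. A minimal standalone sketch of the same idea, using plain `requests` instead of gallery-dl's `Extractor.request` (URL paths and the fallback name are the ones from the hunk):

```python
import requests

def discover_tos_cookie(root="https://8chan.moe"):
    """Fetch the disclaimer-confirmation page without following
    redirects and read the TOS cookie name the server sets."""
    url = root + "/.static/pages/confirmed.html"
    headers = {"Referer": root + "/.static/pages/disclaimer.html"}
    response = requests.get(url, headers=headers, allow_redirects=False)

    for cookie in response.cookies:
        # the name rotates (e.g. "TOS20241009") but keeps its prefix
        if cookie.name.lower().startswith("tos"):
            return cookie.name

    return "TOS20241009"  # last known name, as hard-coded in the diff
```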
```diff
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 9885195..4e9fa50 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -141,6 +141,7 @@ modules = [
     "rule34us",
     "sankaku",
     "sankakucomplex",
+    "scrolller",
     "seiga",
     "senmanga",
     "sexcom",
```

```diff
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
index 72f9195..14598b7 100644
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@@ -171,6 +171,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
                 url = text.extr(page, '<source src="', '"')
                 if text.ext_from_url(url) == "m3u8":
                     url = "ytdl:" + url
+                    module["_ytdl_manifest"] = "hls"
                     module["extension"] = "mp4"
                 append((url, module))
                 continue
```

```diff
diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py
index 39c5635..a1a488e 100644
--- a/gallery_dl/extractor/bluesky.py
+++ b/gallery_dl/extractor/bluesky.py
@@ -42,62 +42,76 @@ class BlueskyExtractor(Extractor):
         self._user = self._user_did = None
         self.instance = self.root.partition("://")[2]
         self.videos = self.config("videos", True)
+        self.quoted = self.config("quoted", False)
 
     def items(self):
         for post in self.posts():
             if "post" in post:
                 post = post["post"]
-
-            pid = post["uri"].rpartition("/")[2]
             if self._user_did and post["author"]["did"] != self._user_did:
-                self.log.debug("Skipping %s (repost)", pid)
-                continue
-
-            post.update(post["record"])
-            del post["record"]
-
-            if self._metadata_facets:
-                if "facets" in post:
-                    post["hashtags"] = tags = []
-                    post["mentions"] = dids = []
-                    post["uris"] = uris = []
-                    for facet in post["facets"]:
-                        features = facet["features"][0]
-                        if "tag" in features:
-                            tags.append(features["tag"])
-                        elif "did" in features:
-                            dids.append(features["did"])
-                        elif "uri" in features:
-                            uris.append(features["uri"])
-                else:
-                    post["hashtags"] = post["mentions"] = post["uris"] = ()
-
-            if self._metadata_user:
-                post["user"] = self._user or post["author"]
-
-            files = self._extract_files(post)
-            post["instance"] = self.instance
-            post["post_id"] = pid
-            post["count"] = len(files)
-            post["date"] = text.parse_datetime(
-                post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S")
-
-            yield Message.Directory, post
-
-            if not files:
+                self.log.debug("Skipping %s (repost)", self._pid(post))
                 continue
-
-            base = ("https://bsky.social/xrpc/com.atproto.sync.getBlob"
-                    "?did={}&cid=".format(post["author"]["did"]))
-            for post["num"], file in enumerate(files, 1):
-                post.update(file)
-                yield Message.Url, base + file["filename"], post
+            embed = post.get("embed")
+            post.update(post.pop("record"))
+
+            while True:
+                self._prepare(post)
+                files = self._extract_files(post)
+
+                yield Message.Directory, post
+                if files:
+                    base = ("https://bsky.social/xrpc/com.atproto.sync.getBlob"
+                            "?did={}&cid=".format(post["author"]["did"]))
+                    for post["num"], file in enumerate(files, 1):
+                        post.update(file)
+                        yield Message.Url, base + file["filename"], post
+
+                if not self.quoted or not embed or "record" not in embed:
+                    break
+
+                quote = embed["record"]
+                if "record" in quote:
+                    quote = quote["record"]
+                quote["quote_id"] = self._pid(post)
+                quote["quote_by"] = post["author"]
+                embed = quote.get("embed")
+                quote.update(quote.pop("value"))
+                post = quote
 
     def posts(self):
         return ()
 
+    def _pid(self, post):
+        return post["uri"].rpartition("/")[2]
+
+    def _prepare(self, post):
+        if self._metadata_facets:
+            if "facets" in post:
+                post["hashtags"] = tags = []
+                post["mentions"] = dids = []
+                post["uris"] = uris = []
+                for facet in post["facets"]:
+                    features = facet["features"][0]
+                    if "tag" in features:
+                        tags.append(features["tag"])
+                    elif "did" in features:
+                        dids.append(features["did"])
+                    elif "uri" in features:
+                        uris.append(features["uri"])
+            else:
+                post["hashtags"] = post["mentions"] = post["uris"] = ()
+
+        if self._metadata_user:
+            post["user"] = self._user or post["author"]
+
+        post["instance"] = self.instance
+        post["post_id"] = self._pid(post)
+        post["date"] = text.parse_datetime(
+            post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S")
+
     def _extract_files(self, post):
         if "embed" not in post:
+            post["count"] = 0
             return ()
 
         files = []
@@ -111,6 +125,7 @@ class BlueskyExtractor(Extractor):
             if "video" in media and self.videos:
                 files.append(self._extract_media(media, "video"))
 
+        post["count"] = len(files)
         return files
 
     def _extract_media(self, media, key):
```

```diff
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 9022ffc..6c79d0a 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -8,9 +8,10 @@
 
 """Extractors for https://bunkr.si/"""
 
+from .common import Extractor
 from .lolisafe import LolisafeAlbumExtractor
-from .. import text, config
-
+from .. import text, config, exception
+import random
 
 if config.get(("extractor", "bunkr"), "tlds"):
     BASE_PATTERN = (
@@ -21,11 +22,28 @@ else:
     BASE_PATTERN = (
         r"(?:bunkr:(?:https?://)?([^/?#]+)|"
         r"(?:https?://)?(?:app\.)?(bunkr+"
-        r"\.(?:s[kiu]|[cf]i|pk|ru|la|is|to|a[cx]"
+        r"\.(?:s[kiu]|[cf]i|p[hks]|ru|la|is|to|a[cx]"
         r"|black|cat|media|red|site|ws|org)))"
     )
 
+DOMAINS = [
+    "bunkr.ac",
+    "bunkr.ci",
+    "bunkr.fi",
+    "bunkr.ph",
+    "bunkr.pk",
+    "bunkr.ps",
+    "bunkr.si",
+    "bunkr.sk",
+    "bunkr.ws",
+    "bunkr.black",
+    "bunkr.red",
+    "bunkr.media",
+    "bunkr.site",
+]
 LEGACY_DOMAINS = {
+    "bunkr.ax",
+    "bunkr.cat",
     "bunkr.ru",
     "bunkrr.ru",
     "bunkr.su",
@@ -34,6 +52,7 @@ LEGACY_DOMAINS = {
     "bunkr.is",
     "bunkr.to",
 }
+CF_DOMAINS = set()
 
 
 class BunkrAlbumExtractor(LolisafeAlbumExtractor):
@@ -49,45 +68,96 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
         if domain not in LEGACY_DOMAINS:
             self.root = "https://" + domain
 
+    def request(self, url, **kwargs):
+        kwargs["allow_redirects"] = False
+
+        while True:
+            try:
+                response = Extractor.request(self, url, **kwargs)
+                if response.status_code < 300:
+                    return response
+
+                # redirect
+                url = response.headers["Location"]
+                root, path = self._split(url)
+                if root not in CF_DOMAINS:
+                    continue
+                self.log.debug("Redirect to known CF challenge domain '%s'",
+                               root)
+
+            except exception.HttpError as exc:
+                if exc.status != 403:
+                    raise
+
+                # CF challenge
+                root, path = self._split(url)
+                CF_DOMAINS.add(root)
+                self.log.debug("Added '%s' to CF challenge domains", root)
+
+                try:
+                    DOMAINS.remove(root.rpartition("/")[2])
+                except ValueError:
+                    pass
+                else:
+                    if not DOMAINS:
+                        raise exception.StopExtraction(
+                            "All Bunkr domains require solving a CF challenge")
+
+            # select alternative domain
+            root = "https://" + random.choice(DOMAINS)
+            self.log.debug("Trying '%s' as fallback", root)
+            url = root + path
+
     def fetch_album(self, album_id):
         # album metadata
         page = self.request(self.root + "/a/" + self.album_id).text
-        info = text.split_html(text.extr(
-            page, "<h1", "</div>").partition(">")[2])
-        count, _, size = info[1].split(None, 2)
+        title, size = text.split_html(text.extr(
+            page, "<h1", "</span>").partition(">")[2])
 
-        pos = page.index('class="grid-images')
-        urls = list(text.extract_iter(page, '<a href="', '"', pos))
-
-        return self._extract_files(urls), {
+        items = list(text.extract_iter(page, "<!-- item -->", "<!-- -->"))
+        return self._extract_files(items), {
             "album_id"  : self.album_id,
-            "album_name": text.unescape(info[0]),
-            "album_size": size[1:-1],
-            "count"     : len(urls),
-            "_http_validate": self._validate,
+            "album_name": title,
+            "album_size": text.extr(size, "(", ")"),
+            "count"     : len(items),
         }
 
-    def _extract_files(self, urls):
-        for url in urls:
+    def _extract_files(self, items):
+        for item in items:
             try:
-                url = self._extract_file(text.unescape(url))
+                url = text.extr(item, ' href="', '"')
+                file = self._extract_file(text.unescape(url))
+
+                info = text.split_html(item)
+                file["name"] = info[0]
+                file["size"] = info[2]
+                file["date"] = text.parse_datetime(
+                    info[-1], "%H:%M:%S %d/%m/%Y")
+
+                yield file
+            except exception.StopExtraction:
+                raise
             except Exception as exc:
                 self.log.error("%s: %s", exc.__class__.__name__, exc)
-                continue
-            yield {"file": text.unescape(url)}
-
-    def _extract_file(self, url):
-        page = self.request(url).text
-        url = (text.extr(page, '<source src="', '"') or
-               text.extr(page, '<img src="', '"'))
-
-        if not url:
-            url_download = text.rextract(
-                page, ' href="', '"', page.rindex("Download"))[0]
-            page = self.request(text.unescape(url_download)).text
-            url = text.unescape(text.rextract(page, ' href="', '"')[0])
-
-        return url
+                self.log.debug("", exc_info=exc)
+
+    def _extract_file(self, webpage_url):
+        response = self.request(webpage_url)
+        page = response.text
+        file_url = (text.extr(page, '<source src="', '"') or
+                    text.extr(page, '<img src="', '"'))
+
+        if not file_url:
+            webpage_url = text.unescape(text.rextract(
+                page, ' href="', '"', page.rindex("Download"))[0])
+            response = self.request(webpage_url)
+            file_url = text.rextract(response.text, ' href="', '"')[0]
+
+        return {
+            "file"          : text.unescape(file_url),
+            "_http_headers" : {"Referer": response.url},
+            "_http_validate": self._validate,
+        }
 
     def _validate(self, response):
         if response.history and response.url.endswith("/maintenance-vid.mp4"):
@@ -95,6 +165,10 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
             return False
         return True
 
+    def _split(self, url):
+        pos = url.index("/", 8)
+        return url[:pos], url[pos:]
+
 
 class BunkrMediaExtractor(BunkrAlbumExtractor):
     """Extractor for bunkr.si media links"""
@@ -105,16 +179,15 @@ class BunkrMediaExtractor(BunkrAlbumExtractor):
 
     def fetch_album(self, album_id):
         try:
-            url = self._extract_file(self.root + self.album_id)
+            file = self._extract_file(self.root + album_id)
         except Exception as exc:
             self.log.error("%s: %s", exc.__class__.__name__, exc)
             return (), {}
 
-        return ({"file": text.unescape(url)},), {
+        return (file,), {
             "album_id"   : "",
             "album_name" : "",
             "album_size" : -1,
             "description": "",
             "count"      : 1,
-            "_http_validate": self._validate,
         }
```
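The bunkr.py change above retries across mirror domains once one of them starts answering with a Cloudflare challenge (HTTP 403). A condensed, self-contained sketch of that fallback logic (domain list abbreviated; `RuntimeError` stands in for gallery-dl's `StopExtraction`):

```python
import random

DOMAINS = ["bunkr.si", "bunkr.fi", "bunkr.ph"]  # abbreviated list
CF_DOMAINS = set()  # roots known to serve a CF challenge

def split(url):
    # "https://host/path" -> ("https://host", "/path")
    pos = url.index("/", 8)
    return url[:pos], url[pos:]

def next_candidate(url):
    """Mark the current domain as challenge-protected and rebuild the
    same path on a randomly chosen domain still believed to work."""
    root, path = split(url)
    CF_DOMAINS.add(root)
    try:
        DOMAINS.remove(root.rpartition("/")[2])
    except ValueError:
        pass  # already removed
    if not DOMAINS:
        raise RuntimeError("all domains require solving a CF challenge")
    return "https://" + random.choice(DOMAINS) + path

print(next_candidate("https://bunkr.si/f/abc123"))
```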
//}") - filename_fmt = "{file[id]}.{extension}" - archive_fmt = "{file[hash]}" pattern = BASE_PATTERN + r"/models/(\d+)(?:/?\?modelVersionId=(\d+))?" example = "https://civitai.com/models/12345/TITLE" @@ -195,19 +195,25 @@ class CivitaiModelExtractor(CivitaiExtractor): ) def _extract_files_model(self, model, version, user): - return [ - { + files = [] + + for num, file in enumerate(version["files"], 1): + file["uuid"] = "model-{}-{}-{}".format( + model["id"], version["id"], file["id"]) + files.append({ "num" : num, "file" : file, "filename" : file["name"], "extension": "bin", - "url" : file["downloadUrl"], + "url" : file.get("downloadUrl") or + "{}/api/download/models/{}".format( + self.root, version["id"]), "_http_headers" : { "Authorization": self.api.headers.get("Authorization")}, "_http_validate": self._validate_file_model, - } - for num, file in enumerate(version["files"], 1) - ] + }) + + return files def _extract_files_image(self, model, version, user): if "images" in version: @@ -263,24 +269,14 @@ class CivitaiPostExtractor(CivitaiExtractor): return ({"id": int(self.groups[0])},) -class CivitaiTagModelsExtractor(CivitaiExtractor): - subcategory = "tag-models" - pattern = BASE_PATTERN + r"/(?:tag/|models\?tag=)([^/?&#]+)" +class CivitaiTagExtractor(CivitaiExtractor): + subcategory = "tag" + pattern = BASE_PATTERN + r"/tag/([^/?&#]+)" example = "https://civitai.com/tag/TAG" def models(self): tag = text.unquote(self.groups[0]) - return self.api.models({"tag": tag}) - - -class CivitaiTagImagesExtractor(CivitaiExtractor): - subcategory = "tag-images" - pattern = BASE_PATTERN + r"/images\?tags=([^&#]+)" - example = "https://civitai.com/images?tags=12345" - - def images(self): - tag = text.unquote(self.groups[0]) - return self.api.images({"tag": tag}) + return self.api.models_tag(tag) class CivitaiSearchExtractor(CivitaiExtractor): @@ -293,6 +289,26 @@ class CivitaiSearchExtractor(CivitaiExtractor): return self.api.models(params) +class CivitaiModelsExtractor(CivitaiExtractor): + subcategory = "models" + pattern = BASE_PATTERN + r"/models(?:/?\?([^#]+))?(?:$|#)" + example = "https://civitai.com/models" + + def models(self): + params = text.parse_query(self.groups[0]) + return self.api.models(params) + + +class CivitaiImagesExtractor(CivitaiExtractor): + subcategory = "images" + pattern = BASE_PATTERN + r"/images(?:/?\?([^#]+))?(?:$|#)" + example = "https://civitai.com/images" + + def images(self): + params = text.parse_query(self.groups[0]) + return self.api.images(params) + + class CivitaiUserExtractor(CivitaiExtractor): subcategory = "user" pattern = USER_PATTERN + r"/?(?:$|\?|#)" @@ -339,11 +355,35 @@ class CivitaiUserImagesExtractor(CivitaiExtractor): pattern = USER_PATTERN + r"/images/?(?:\?([^#]+))?" 
example = "https://civitai.com/user/USER/images" + def __init__(self, match): + self.params = text.parse_query_list(match.group(2)) + if self.params.get("section") == "reactions": + self.subcategory = "reactions" + self.images = self.images_reactions + CivitaiExtractor.__init__(self, match) + def images(self): - params = text.parse_query(self.groups[1]) + params = self.params params["username"] = text.unquote(self.groups[0]) return self.api.images(params) + def images_reactions(self): + if "Authorization" not in self.api.headers and \ + not self.cookies.get( + "__Secure-civitai-token", domain=".civitai.com"): + raise exception.AuthorizationError("api-key or cookies required") + + params = self.params + params["authed"] = True + params["useIndex"] = False + if "reactions" in params: + if isinstance(params["reactions"], str): + params["reactions"] = (params["reactions"],) + else: + params["reactions"] = ( + "Like", "Dislike", "Heart", "Laugh", "Cry") + return self.api.images(params) + class CivitaiRestAPI(): """Interface for the Civitai Public REST API @@ -396,6 +436,9 @@ class CivitaiRestAPI(): def models(self, params): return self._pagination("/v1/models", params) + def models_tag(self, tag): + return self.models({"tag": tag}) + def _call(self, endpoint, params=None): if endpoint[0] == "/": url = self.root + endpoint @@ -419,14 +462,14 @@ class CivitaiRestAPI(): class CivitaiTrpcAPI(): - """Interface for the Civitai TRPC API""" + """Interface for the Civitai tRPC API""" def __init__(self, extractor): self.extractor = extractor self.root = extractor.root + "/api/trpc/" self.headers = { "content-type" : "application/json", - "x-client-version": "5.0.146", + "x-client-version": "5.0.185", "x-client-date" : "", "x-client" : "web", "x-fingerprint" : "undefined", @@ -463,6 +506,7 @@ class CivitaiTrpcAPI(): "include" : ["cosmetics"], }) + params = self._type_params(params) return self._pagination(endpoint, params) def images_gallery(self, model, version, user): @@ -516,6 +560,9 @@ class CivitaiTrpcAPI(): return self._pagination(endpoint, params) + def models_tag(self, tag): + return self.models({"tagname": tag}) + def post(self, post_id): endpoint = "post.get" params = {"id": int(post_id)} @@ -580,3 +627,13 @@ class CivitaiTrpcAPI(): def _merge_params(self, params_user, params_default): params_default.update(params_user) return params_default + + def _type_params(self, params): + for key, type in ( + ("tags" , int), + ("modelId" , int), + ("modelVersionId", int), + ): + if key in params: + params[key] = type(params[key]) + return params diff --git a/gallery_dl/extractor/cohost.py b/gallery_dl/extractor/cohost.py index 4722a4f..0524239 100644 --- a/gallery_dl/extractor/cohost.py +++ b/gallery_dl/extractor/cohost.py @@ -109,7 +109,7 @@ class CohostUserExtractor(CohostExtractor): "projectHandle": self.groups[0], "page": 0, "options": { - "pinnedPostsAtTop" : bool(self.pinned), + "pinnedPostsAtTop" : True if self.pinned else False, "hideReplies" : not self.replies, "hideShares" : not self.shares, "hideAsks" : not self.asks, diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 32c8e67..2146fa6 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -185,7 +185,9 @@ class Extractor(): self._dump_response(response) if ( code < 400 or - code < 500 and (not fatal and code != 429 or fatal is None) + code < 500 and ( + not fatal and code != 429 or fatal is None) or + fatal is ... 
```diff
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 836fae7..693def9 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -401,7 +401,7 @@ class DeviantartExtractor(Extractor):
         html = content["html"]
         markup = html["markup"]
 
-        if not markup.startswith("{"):
+        if not markup or markup[0] != "{":
             return markup
 
         if html["type"] == "tiptap":
@@ -1301,7 +1301,7 @@ class DeviantartOAuthAPI():
 
         metadata = extractor.config("metadata", False)
         if not metadata:
-            metadata = bool(extractor.extra)
+            metadata = True if extractor.extra else False
 
         if metadata:
             self.metadata = True
```

```diff
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 01af7a4..3e6d537 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -260,9 +260,9 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             "torrentcount" : extr('>Torrent Download (', ')'),
         }
 
-        if data["uploader"].startswith("<"):
-            data["uploader"] = text.unescape(text.extr(
-                data["uploader"], ">", "<"))
+        uploader = data["uploader"]
+        if uploader and uploader[0] == "<":
+            data["uploader"] = text.unescape(text.extr(uploader, ">", "<"))
 
         f = data["favorites"][0]
         if f == "N":
```

```diff
diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py
index 85dd896..44c4542 100644
--- a/gallery_dl/extractor/foolfuuka.py
+++ b/gallery_dl/extractor/foolfuuka.py
@@ -37,7 +37,7 @@ class FoolfuukaExtractor(BaseExtractor):
             if not url and "remote_media_link" in media:
                 url = self.remote(media)
-            if url.startswith("/"):
+            if url and url[0] == "/":
                 url = self.root + url
 
             post["filename"], _, post["extension"] = \
```

```diff
diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py
index 12e8860..72a6453 100644
--- a/gallery_dl/extractor/lensdump.py
+++ b/gallery_dl/extractor/lensdump.py
@@ -17,42 +17,30 @@ class LensdumpBase():
     category = "lensdump"
     root = "https://lensdump.com"
 
-    def nodes(self, page=None):
-        if page is None:
-            page = self.request(self.url).text
-
-        # go through all pages starting from the oldest
-        page_url = text.urljoin(self.root, text.extr(
-            text.extr(page, ' id="list-most-oldest-link"', '>'),
-            'href="', '"'))
-        while page_url is not None:
-            if page_url == self.url:
-                current_page = page
-            else:
-                current_page = self.request(page_url).text
-
-            for node in text.extract_iter(
-                    current_page, ' class="list-item ', '>'):
-                yield node
-
-            # find url of next page
-            page_url = text.extr(
-                text.extr(current_page, ' data-pagination="next"', '>'),
-                'href="', '"')
-            if page_url is not None and len(page_url) > 0:
-                page_url = text.urljoin(self.root, page_url)
-            else:
-                page_url = None
+    def _pagination(self, page, begin, end):
+        while True:
+            yield from text.extract_iter(page, begin, end)
+
+            next = text.extr(page, ' data-pagination="next"', '>')
+            if not next:
+                return
+
+            url = text.urljoin(self.root, text.extr(next, 'href="', '"'))
+            page = self.request(url).text
 
 
 class LensdumpAlbumExtractor(LensdumpBase, GalleryExtractor):
     subcategory = "album"
-    pattern = BASE_PATTERN + r"/(?:((?!\w+/albums|a/|i/)\w+)|a/(\w+))"
+    pattern = BASE_PATTERN + r"/a/(\w+)(?:/?\?([^#]+))?"
     example = "https://lensdump.com/a/ID"
 
     def __init__(self, match):
-        GalleryExtractor.__init__(self, match, match.string)
-        self.gallery_id = match.group(1) or match.group(2)
+        self.gallery_id, query = match.groups()
+        if query:
+            url = "{}/a/{}/?{}".format(self.root, self.gallery_id, query)
+        else:
+            url = "{}/a/{}".format(self.root, self.gallery_id)
+        GalleryExtractor.__init__(self, match, url)
 
     def metadata(self, page):
         return {
@@ -62,40 +50,48 @@ class LensdumpAlbumExtractor(LensdumpBase, GalleryExtractor):
         }
 
     def images(self, page):
-        for node in self.nodes(page):
-            # get urls and filenames of images in current page
-            json_data = util.json_loads(text.unquote(
-                text.extr(node, "data-object='", "'") or
-                text.extr(node, 'data-object="', '"')))
-            image_id = json_data.get('name')
-            image_url = json_data.get('url')
-            image_title = json_data.get('title')
+        for image in self._pagination(page, ' class="list-item ', '>'):
+
+            data = util.json_loads(text.unquote(
+                text.extr(image, "data-object='", "'") or
+                text.extr(image, 'data-object="', '"')))
+            image_id = data.get("name")
+            image_url = data.get("url")
+            image_title = data.get("title")
             if image_title is not None:
                 image_title = text.unescape(image_title)
+
             yield (image_url, {
-                'id': image_id,
-                'url': image_url,
-                'title': image_title,
-                'name': json_data.get('filename'),
-                'filename': image_id,
-                'extension': json_data.get('extension'),
-                'height': text.parse_int(json_data.get('height')),
-                'width': text.parse_int(json_data.get('width')),
+                "id"       : image_id,
+                "url"      : image_url,
+                "title"    : image_title,
+                "name"     : data.get("filename"),
+                "filename" : image_id,
+                "extension": data.get("extension"),
+                "width"    : text.parse_int(data.get("width")),
+                "height"   : text.parse_int(data.get("height")),
             })
 
 
 class LensdumpAlbumsExtractor(LensdumpBase, Extractor):
     """Extractor for album list from lensdump.com"""
     subcategory = "albums"
-    pattern = BASE_PATTERN + r"/\w+/albums"
-    example = "https://lensdump.com/USER/albums"
+    pattern = BASE_PATTERN + r"/(?![ai]/)([^/?#]+)(?:/?\?([^#]+))?"
+    example = "https://lensdump.com/USER"
 
     def items(self):
-        for node in self.nodes():
-            album_url = text.urljoin(self.root, text.extr(
-                node, 'data-url-short="', '"'))
-            yield Message.Queue, album_url, {
-                "_extractor": LensdumpAlbumExtractor}
+        user, query = self.groups
+        url = "{}/{}/".format(self.root, user)
+        if query:
+            params = text.parse_query(query)
+        else:
+            params = {"sort": "date_asc", "page": "1"}
+        page = self.request(url, params=params).text
+
+        data = {"_extractor": LensdumpAlbumExtractor}
+        for album_path in self._pagination(page, 'data-url-short="', '"'):
+            album_url = text.urljoin(self.root, album_path)
+            yield Message.Queue, album_url, data
 
 
 class LensdumpImageExtractor(LensdumpBase, Extractor):
@@ -107,16 +103,13 @@ class LensdumpImageExtractor(LensdumpBase, Extractor):
     pattern = r"(?:https?://)?(?:(?:i\d?\.)?lensdump\.com|\w\.l3n\.co)/i/(\w+)"
     example = "https://lensdump.com/i/ID"
 
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.key = match.group(1)
-
     def items(self):
-        url = "{}/i/{}".format(self.root, self.key)
+        key = self.groups[0]
+        url = "{}/i/{}".format(self.root, key)
         extr = text.extract_from(self.request(url).text)
 
         data = {
-            "id"    : self.key,
+            "id"    : key,
             "title" : text.unescape(extr(
                 'property="og:title" content="', '"')),
             "url"   : extr(
```

```diff
diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py
index 6fc0689..044f4f5 100644
--- a/gallery_dl/extractor/lolisafe.py
+++ b/gallery_dl/extractor/lolisafe.py
@@ -47,7 +47,15 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
             url = file["file"]
             file.update(data)
             text.nameext_from_url(url, file)
-            file["name"], sep, file["id"] = file["filename"].rpartition("-")
+
+            if "name" in file:
+                name = file["name"]
+                file["name"] = name.rpartition(".")[0] or name
+                file["id"] = file["filename"].rpartition("-")[2]
+            else:
+                file["name"], sep, file["id"] = \
+                    file["filename"].rpartition("-")
+
             yield Message.Url, url, file
 
     def fetch_album(self, album_id):
```
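The lolisafe change above prefers a server-supplied `name` field (which bunkr now provides) and only falls back to splitting the URL filename on its last `-`. A sketch of both branches with plain dicts standing in for gallery-dl's file objects (filenames are made up):

```python
def file_name_and_id(file):
    """Derive 'name' and 'id' the way the hunk does: lolisafe-style
    filenames look like 'title-abc123', with the server-assigned ID
    after the last '-'."""
    filename = file["filename"]
    if "name" in file:
        name = file["name"]
        # strip a trailing extension, but keep extension-less names intact
        file["name"] = name.rpartition(".")[0] or name
        file["id"] = filename.rpartition("-")[2]
    else:
        file["name"], _, file["id"] = filename.rpartition("-")
    return file

print(file_name_and_id({"filename": "holiday-pics-x7q9z2"}))
# {'filename': 'holiday-pics-x7q9z2', 'name': 'holiday-pics', 'id': 'x7q9z2'}
print(file_name_and_id({"filename": "holiday-pics-x7q9z2",
                        "name": "holiday pics.jpg"}))
# {'filename': ..., 'name': 'holiday pics', 'id': 'x7q9z2'}
```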
{"authorOrArtist": uuid_author} + return self._pagination("/manga", params) + def manga_feed(self, uuid): order = "desc" if self.extractor.config("chapter-reverse") else "asc" params = { diff --git a/gallery_dl/extractor/mangakakalot.py b/gallery_dl/extractor/mangakakalot.py index 0183b25..9fc8681 100644 --- a/gallery_dl/extractor/mangakakalot.py +++ b/gallery_dl/extractor/mangakakalot.py @@ -19,7 +19,7 @@ BASE_PATTERN = r"(?:https?://)?(?:ww[\dw]?\.)?mangakakalot\.tv" class MangakakalotBase(): """Base class for mangakakalot extractors""" category = "mangakakalot" - root = "https://ww6.mangakakalot.tv" + root = "https://ww8.mangakakalot.tv" class MangakakalotChapterExtractor(MangakakalotBase, ChapterExtractor): @@ -40,7 +40,7 @@ class MangakakalotChapterExtractor(MangakakalotBase, ChapterExtractor): match = re.match( r"(?:[Vv]ol\. *(\d+) )?" r"[Cc]hapter *([^:]*)" - r"(?:: *(.+))?", info) + r"(?:: *(.+))?", info or "") volume, chapter, title = match.groups() if match else ("", "", info) chapter, sep, minor = chapter.partition(".") @@ -86,7 +86,7 @@ class MangakakalotMangaExtractor(MangakakalotBase, MangaExtractor): data["chapter"] = text.parse_int(chapter) data["chapter_minor"] = sep + minor - if url.startswith("/"): + if url[0] == "/": url = self.root + url results.append((url, data.copy())) return results diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 2928573..61ffdee 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -14,6 +14,9 @@ from ..cache import cache import itertools import re +BASE_PATTERN = r"(?:https?://)?(?:www\.)?newgrounds\.com" +USER_PATTERN = r"(?:https?://)?([\w-]+)\.newgrounds\.com" + class NewgroundsExtractor(Extractor): """Base class for newgrounds extractors""" @@ -93,7 +96,7 @@ class NewgroundsExtractor(Extractor): def posts(self): """Return URLs of all relevant post pages""" - return self._pagination(self._path) + return self._pagination(self._path, self.groups[1]) def metadata(self): """Return general metadata""" @@ -334,10 +337,10 @@ class NewgroundsExtractor(Extractor): for fmt in formats: yield fmt[1][0]["src"] - def _pagination(self, kind): + def _pagination(self, kind, pnum=1): url = "{}/{}".format(self.user_root, kind) params = { - "page": 1, + "page": text.parse_int(pnum, 1), "isAjaxRequest": "1", } headers = { @@ -400,8 +403,7 @@ class NewgroundsImageExtractor(NewgroundsExtractor): class NewgroundsMediaExtractor(NewgroundsExtractor): """Extractor for a media file from newgrounds.com""" subcategory = "media" - pattern = (r"(?:https?://)?(?:www\.)?newgrounds\.com" - r"(/(?:portal/view|audio/listen)/\d+)") + pattern = BASE_PATTERN + r"(/(?:portal/view|audio/listen)/\d+)" example = "https://www.newgrounds.com/portal/view/12345" def __init__(self, match): @@ -416,35 +418,35 @@ class NewgroundsMediaExtractor(NewgroundsExtractor): class NewgroundsArtExtractor(NewgroundsExtractor): """Extractor for all images of a newgrounds user""" subcategory = _path = "art" - pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/art/?$" + pattern = USER_PATTERN + r"/art(?:(?:/page/|/?\?page=)(\d+))?/?$" example = "https://USER.newgrounds.com/art" class NewgroundsAudioExtractor(NewgroundsExtractor): """Extractor for all audio submissions of a newgrounds user""" subcategory = _path = "audio" - pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/audio/?$" + pattern = USER_PATTERN + r"/audio(?:(?:/page/|/?\?page=)(\d+))?/?$" example = "https://USER.newgrounds.com/audio" class 
NewgroundsMoviesExtractor(NewgroundsExtractor): """Extractor for all movies of a newgrounds user""" subcategory = _path = "movies" - pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/movies/?$" + pattern = USER_PATTERN + r"/movies(?:(?:/page/|/?\?page=)(\d+))?/?$" example = "https://USER.newgrounds.com/movies" class NewgroundsGamesExtractor(NewgroundsExtractor): """Extractor for a newgrounds user's games""" subcategory = _path = "games" - pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/games/?$" + pattern = USER_PATTERN + r"/games(?:(?:/page/|/?\?page=)(\d+))?/?$" example = "https://USER.newgrounds.com/games" class NewgroundsUserExtractor(NewgroundsExtractor): """Extractor for a newgrounds user profile""" subcategory = "user" - pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/?$" + pattern = USER_PATTERN + r"/?$" example = "https://USER.newgrounds.com" def initialize(self): @@ -464,25 +466,22 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor): """Extractor for posts favorited by a newgrounds user""" subcategory = "favorite" directory_fmt = ("{category}", "{user}", "Favorites") - pattern = (r"(?:https?://)?([\w-]+)\.newgrounds\.com" - r"/favorites(?!/following)(?:/(art|audio|movies))?/?") + pattern = (USER_PATTERN + r"/favorites(?!/following)(?:/(art|audio|movies)" + r"(?:(?:/page/|/?\?page=)(\d+))?)?") example = "https://USER.newgrounds.com/favorites" - def __init__(self, match): - NewgroundsExtractor.__init__(self, match) - self.kind = match.group(2) - def posts(self): - if self.kind: - return self._pagination(self.kind) + _, kind, pnum = self.groups + if kind: + return self._pagination_favorites(kind, pnum) return itertools.chain.from_iterable( - self._pagination(k) for k in ("art", "audio", "movies") + self._pagination_favorites(k) for k in ("art", "audio", "movies") ) - def _pagination(self, kind): + def _pagination_favorites(self, kind, pnum=1): url = "{}/favorites/{}".format(self.user_root, kind) params = { - "page": 1, + "page": text.parse_int(pnum, 1), "isAjaxRequest": "1", } headers = { @@ -514,12 +513,13 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor): class NewgroundsFollowingExtractor(NewgroundsFavoriteExtractor): """Extractor for a newgrounds user's favorited users""" subcategory = "following" - pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/favorites/(following)" + pattern = USER_PATTERN + r"/favorites/(following)" example = "https://USER.newgrounds.com/favorites/following" def items(self): + _, kind, pnum = self.groups data = {"_extractor": NewgroundsUserExtractor} - for url in self._pagination(self.kind): + for url in self._pagination_favorites(kind, pnum): yield Message.Queue, url, data @staticmethod @@ -534,13 +534,12 @@ class NewgroundsSearchExtractor(NewgroundsExtractor): """Extractor for newgrounds.com search reesults""" subcategory = "search" directory_fmt = ("{category}", "search", "{search_tags}") - pattern = (r"(?:https?://)?(?:www\.)?newgrounds\.com" - r"/search/conduct/([^/?#]+)/?\?([^#]+)") + pattern = BASE_PATTERN + r"/search/conduct/([^/?#]+)/?\?([^#]+)" example = "https://www.newgrounds.com/search/conduct/art?terms=QUERY" def __init__(self, match): NewgroundsExtractor.__init__(self, match) - self._path, query = match.groups() + self._path, query = self.groups self.query = text.parse_query(query) def posts(self): @@ -550,19 +549,20 @@ class NewgroundsSearchExtractor(NewgroundsExtractor): for s in suitabilities.split(",")} self.request(self.root + "/suitabilities", method="POST", data=data) - return 
self._pagination("/search/conduct/" + self._path, self.query) + return self._pagination_search( + "/search/conduct/" + self._path, self.query) def metadata(self): return {"search_tags": self.query.get("terms", "")} - def _pagination(self, path, params): + def _pagination_search(self, path, params): url = self.root + path + params["inner"] = "1" + params["page"] = text.parse_int(params.get("page"), 1) headers = { "Accept": "application/json, text/javascript, */*; q=0.01", "X-Requested-With": "XMLHttpRequest", } - params["inner"] = "1" - params["page"] = 1 while True: data = self.request(url, params=params, headers=headers).json() diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py index 8c7ffe5..851f663 100644 --- a/gallery_dl/extractor/nozomi.py +++ b/gallery_dl/extractor/nozomi.py @@ -63,7 +63,8 @@ class NozomiExtractor(Extractor): yield Message.Directory, post for post["num"], image in enumerate(images, 1): post["filename"] = post["dataid"] = did = image["dataid"] - post["is_video"] = video = bool(image.get("is_video")) + post["is_video"] = video = \ + True if image.get("is_video") else False ext = image["type"] if video: diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index d47ffa2..0b64ea3 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -56,6 +56,7 @@ class PatreonExtractor(Extractor): text.nameext_from_url(name, post) if text.ext_from_url(url) == "m3u8": url = "ytdl:" + url + post["_ytdl_manifest"] = "hls" post["extension"] = "mp4" yield Message.Url, url, post else: @@ -310,7 +311,7 @@ class PatreonCreatorExtractor(PatreonExtractor): subcategory = "creator" pattern = (r"(?:https?://)?(?:www\.)?patreon\.com" r"/(?!(?:home|join|posts|login|signup)(?:$|[/?#]))" - r"([^/?#]+)(?:/posts)?/?(?:\?([^#]+))?") + r"(?:c/)?([^/?#]+)(?:/posts)?/?(?:\?([^#]+))?") example = "https://www.patreon.com/USER" def posts(self): @@ -340,9 +341,9 @@ class PatreonCreatorExtractor(PatreonExtractor): user_id = query.get("u") if user_id: - url = "{}/user/posts?u={}".format(self.root, user_id) + url = "{}/user?u={}".format(self.root, user_id) else: - url = "{}/{}/posts".format(self.root, creator) + url = "{}/{}".format(self.root, creator) page = self.request(url, notfound="creator").text try: diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index 8c04ed5..499c579 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -18,8 +18,8 @@ BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.[\w.]+" class PinterestExtractor(Extractor): """Base class for pinterest extractors""" category = "pinterest" - filename_fmt = "{category}_{id}{media_id:?_//}.{extension}" - archive_fmt = "{id}{media_id}" + filename_fmt = "{category}_{id}{media_id|page_id:?_//}.{extension}" + archive_fmt = "{id}{media_id|page_id}" root = "https://www.pinterest.com" def _init(self): @@ -30,12 +30,12 @@ class PinterestExtractor(Extractor): self.root = text.ensure_http_scheme(domain) self.api = PinterestAPI(self) + self.stories = self.config("stories", True) + self.videos = self.config("videos", True) def items(self): data = self.metadata() - videos = self.config("videos", True) - yield Message.Directory, data for pin in self.pins(): if isinstance(pin, tuple): @@ -43,40 +43,35 @@ class PinterestExtractor(Extractor): yield Message.Queue, url, data continue + try: + files = self._extract_files(pin) + except Exception as exc: + self.log.debug("", exc_info=exc) + self.log.warning( + "%s: Error when 
```diff
diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py
index 8c04ed5..499c579 100644
--- a/gallery_dl/extractor/pinterest.py
+++ b/gallery_dl/extractor/pinterest.py
@@ -18,8 +18,8 @@ BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.[\w.]+"
 class PinterestExtractor(Extractor):
     """Base class for pinterest extractors"""
     category = "pinterest"
-    filename_fmt = "{category}_{id}{media_id:?_//}.{extension}"
-    archive_fmt = "{id}{media_id}"
+    filename_fmt = "{category}_{id}{media_id|page_id:?_//}.{extension}"
+    archive_fmt = "{id}{media_id|page_id}"
     root = "https://www.pinterest.com"
 
     def _init(self):
@@ -30,12 +30,12 @@ class PinterestExtractor(Extractor):
             self.root = text.ensure_http_scheme(domain)
 
         self.api = PinterestAPI(self)
+        self.stories = self.config("stories", True)
+        self.videos = self.config("videos", True)
 
     def items(self):
         data = self.metadata()
-        videos = self.config("videos", True)
-
         yield Message.Directory, data
         for pin in self.pins():
 
             if isinstance(pin, tuple):
@@ -43,40 +43,35 @@ class PinterestExtractor(Extractor):
                 url, data = pin
                 yield Message.Queue, url, data
                 continue
 
+            try:
+                files = self._extract_files(pin)
+            except Exception as exc:
+                self.log.debug("", exc_info=exc)
+                self.log.warning(
+                    "%s: Error when extracting download URLs (%s: %s)",
+                    pin.get("id"), exc.__class__.__name__, exc)
+                continue
+
             pin.update(data)
+            pin["count"] = len(files)
 
-            carousel_data = pin.get("carousel_data")
-            if carousel_data:
-                pin["count"] = len(carousel_data["carousel_slots"])
-                for num, slot in enumerate(carousel_data["carousel_slots"], 1):
-                    slot["media_id"] = slot.pop("id")
-                    pin.update(slot)
-                    pin["num"] = num
-                    size, image = next(iter(slot["images"].items()))
-                    url = image["url"].replace("/" + size + "/", "/originals/")
-                    yield Message.Url, url, text.nameext_from_url(url, pin)
-
-            else:
-                try:
-                    media = self._media_from_pin(pin)
-                except Exception:
-                    self.log.debug("Unable to fetch download URL for pin %s",
-                                   pin.get("id"))
-                    continue
+            yield Message.Directory, pin
+            for pin["num"], file in enumerate(files, 1):
+                url = file["url"]
+                text.nameext_from_url(url, pin)
+                pin.update(file)
 
-                if videos or media.get("duration") is None:
-                    pin.update(media)
-                    pin["num"] = pin["count"] = 1
+                if "media_id" not in file:
                     pin["media_id"] = ""
+                if "page_id" not in file:
+                    pin["page_id"] = ""
 
-                    url = media["url"]
-                    text.nameext_from_url(url, pin)
+                if pin["extension"] == "m3u8":
+                    url = "ytdl:" + url
+                    pin["_ytdl_manifest"] = "hls"
+                    pin["extension"] = "mp4"
 
-                    if pin["extension"] == "m3u8":
-                        url = "ytdl:" + url
-                        pin["extension"] = "mp4"
-
-                    yield Message.Url, url, pin
+                yield Message.Url, url, pin
 
     def metadata(self):
         """Return general metadata"""
@@ -84,26 +79,108 @@ class PinterestExtractor(Extractor):
     def pins(self):
         """Return all relevant pin objects"""
 
-    @staticmethod
-    def _media_from_pin(pin):
+    def _extract_files(self, pin):
+        story_pin_data = pin.get("story_pin_data")
+        if story_pin_data and self.stories:
+            return self._extract_story(pin, story_pin_data)
+
+        carousel_data = pin.get("carousel_data")
+        if carousel_data:
+            return self._extract_carousel(pin, carousel_data)
+
         videos = pin.get("videos")
-        if videos:
-            video_formats = videos["video_list"]
+        if videos and self.videos:
+            return (self._extract_video(videos),)
 
-            for fmt in ("V_HLSV4", "V_HLSV3_WEB", "V_HLSV3_MOBILE"):
-                if fmt in video_formats:
-                    media = video_formats[fmt]
-                    break
-            else:
-                media = max(video_formats.values(),
-                            key=lambda x: x.get("width", 0))
+        try:
+            return (pin["images"]["orig"],)
+        except Exception:
+            self.log.debug("%s: No files found", pin.get("id"))
+            return ()
+
+    def _extract_story(self, pin, story):
+        files = []
+        story_id = story.get("id")
+
+        for page in story["pages"]:
+            page_id = page.get("id")
+
+            for block in page["blocks"]:
+                type = block.get("type")
+
+                if type == "story_pin_image_block":
+                    if 1 == len(page["blocks"]) == len(story["pages"]):
+                        try:
+                            media = pin["images"]["orig"]
+                        except Exception:
+                            media = self._extract_image(page, block)
+                    else:
+                        media = self._extract_image(page, block)
+
+                elif type == "story_pin_video_block":
+                    video = block["video"]
+                    media = self._extract_video(video)
+                    media["media_id"] = video.get("id") or ""
+
+                elif type == "story_pin_paragraph_block":
+                    media = {"url": "text:" + block["text"],
+                             "extension": "txt",
+                             "media_id": block.get("id")}
+
+                else:
+                    self.log.warning("%s: Unsupported story block '%s'",
+                                     pin.get("id"), type)
+                    continue
 
-            if "V_720P" in video_formats:
-                media["_fallback"] = (video_formats["V_720P"]["url"],)
+                media["story_id"] = story_id
+                media["page_id"] = page_id
+                files.append(media)
+
+        return files
+
+    def _extract_carousel(self, pin, carousel_data):
+        files = []
+        for slot in carousel_data["carousel_slots"]:
+            size, image = next(iter(slot["images"].items()))
+            slot["media_id"] = slot.pop("id")
+            slot["url"] = image["url"].replace(
+                "/" + size + "/", "/originals/", 1)
+            files.append(slot)
+        return files
+
+    def _extract_image(self, page, block):
+        sig = block.get("image_signature") or page["image_signature"]
+        url_base = "https://i.pinimg.com/originals/{}/{}/{}/{}.".format(
+            sig[0:2], sig[2:4], sig[4:6], sig)
+        url_jpg = url_base + "jpg"
+        url_png = url_base + "png"
+        url_webp = url_base + "webp"
 
-            return media
+        try:
+            media = block["image"]["images"]["originals"]
+        except Exception:
+            media = {"url": url_jpg, "_fallback": (url_png, url_webp,)}
 
-        return pin["images"]["orig"]
+        if media["url"] == url_jpg:
+            media["_fallback"] = (url_png, url_webp,)
+        else:
+            media["_fallback"] = (url_jpg, url_png, url_webp,)
+        media["media_id"] = sig
+
+        return media
+
+    def _extract_video(self, video):
+        video_formats = video["video_list"]
+
+        for fmt in ("V_HLSV4", "V_HLSV3_WEB", "V_HLSV3_MOBILE"):
+            if fmt in video_formats:
+                media = video_formats[fmt]
+                break
+        else:
+            media = max(video_formats.values(),
+                        key=lambda x: x.get("width", 0))
+
+        if "V_720P" in video_formats:
+            media["_fallback"] = (video_formats["V_720P"]["url"],)
+
+        return media
 
 
 class PinterestPinExtractor(PinterestExtractor):
```
subcategory = "artworks" + _warning = True pattern = (BASE_PATTERN + r"/(?:" r"(?:en/)?users/(\d+)/(?:artworks|illustrations|manga)" r"(?:/([^/?#]+))?/?(?:$|[?#])" r"|member_illust\.php\?id=(\d+)(?:&([^#]+))?)") example = "https://www.pixiv.net/en/users/12345/artworks" - def __init__(self, match): - PixivExtractor.__init__(self, match) - u1, t1, u2, t2 = match.groups() + def _init(self): + PixivExtractor._init(self) + + u1, t1, u2, t2 = self.groups if t1: t1 = text.unquote(t1) elif t2: @@ -350,6 +351,14 @@ class PixivArtworksExtractor(PixivExtractor): self.user_id = u1 or u2 self.tag = t1 or t2 + if self.sanity_workaround: + self.cookies_domain = d = ".pixiv.net" + self._init_cookies() + if self._warning and not self.cookies.get("PHPSESSID", domain=d): + PixivArtworksExtractor._warning = False + self.log.warning("No 'PHPSESSID' cookie set. Can detect only " + "non R-18 'sanity_level' works.") + def metadata(self): if self.config("metadata"): self.api.user_detail(self.user_id) @@ -358,6 +367,19 @@ class PixivArtworksExtractor(PixivExtractor): def works(self): works = self.api.user_illusts(self.user_id) + if self.sanity_workaround: + body = self._request_ajax( + "/user/{}/profile/all".format(self.user_id)) + try: + ajax_ids = list(map(int, body["illusts"])) + ajax_ids.extend(map(int, body["manga"])) + ajax_ids.sort() + except Exception as exc: + self.log.warning("Unable to collect artwork IDs using AJAX " + "API (%s: %s)", exc.__class__.__name__, exc) + else: + works = self._extend_sanity(works, ajax_ids) + if self.tag: tag = self.tag.lower() works = ( @@ -367,6 +389,35 @@ class PixivArtworksExtractor(PixivExtractor): return works + def _extend_sanity(self, works, ajax_ids): + user = {"id": 1} + index = len(ajax_ids) - 1 + + for work in works: + while index >= 0: + work_id = work["id"] + ajax_id = ajax_ids[index] + + if ajax_id == work_id: + index -= 1 + break + + elif ajax_id > work_id: + index -= 1 + self.log.debug("Inserting work %s", ajax_id) + yield self._make_work(ajax_id, self.sanity_url, user) + + else: # ajax_id < work_id + break + + yield work + + while index >= 0: + ajax_id = ajax_ids[index] + self.log.debug("Inserting work %s", ajax_id) + yield self._make_work(ajax_id, self.sanity_url, user) + index -= 1 + class PixivAvatarExtractor(PixivExtractor): """Extractor for pixiv avatars""" diff --git a/gallery_dl/extractor/postmill.py b/gallery_dl/extractor/postmill.py index 29b351b..8877175 100644 --- a/gallery_dl/extractor/postmill.py +++ b/gallery_dl/extractor/postmill.py @@ -50,7 +50,7 @@ class PostmillExtractor(BaseExtractor): forum = match.group(1) id = int(match.group(2)) - is_text_post = url.startswith("/") + is_text_post = (url[0] == "/") is_image_post = self._search_image_tag(page) is not None data = { "title": title, diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index ce602f6..8577e74 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -31,6 +31,7 @@ class RedditExtractor(Extractor): parentdir = self.config("parent-directory") max_depth = self.config("recursion", 0) previews = self.config("previews", True) + embeds = self.config("embeds", True) videos = self.config("videos", True) if videos: @@ -100,7 +101,7 @@ class RedditExtractor(Extractor): for comment in comments: html = comment["body_html"] or "" href = (' href="' in html) - media = ("media_metadata" in comment) + media = (embeds and "media_metadata" in comment) if media or href: comment["date"] = text.parse_timestamp( @@ -211,8 +212,9 @@ class 
```diff
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index ce602f6..8577e74 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -31,6 +31,7 @@ class RedditExtractor(Extractor):
         parentdir = self.config("parent-directory")
         max_depth = self.config("recursion", 0)
         previews = self.config("previews", True)
+        embeds = self.config("embeds", True)
 
         videos = self.config("videos", True)
         if videos:
@@ -100,7 +101,7 @@ class RedditExtractor(Extractor):
                 for comment in comments:
                     html = comment["body_html"] or ""
                     href = (' href="' in html)
-                    media = ("media_metadata" in comment)
+                    media = (embeds and "media_metadata" in comment)
 
                     if media or href:
                         comment["date"] = text.parse_timestamp(
@@ -211,8 +212,9 @@ class RedditExtractor(Extractor):
     def _extract_video_dash(self, submission):
         submission["_ytdl_extra"] = {"title": submission["title"]}
         try:
-            return (submission["secure_media"]["reddit_video"]["dash_url"] +
-                    "#__youtubedl_smuggle=%7B%22to_generic%22%3A+1%7D")
+            url = submission["secure_media"]["reddit_video"]["dash_url"]
+            submission["_ytdl_manifest"] = "dash"
+            return url
         except Exception:
             return submission["url"]
```

```diff
diff --git a/gallery_dl/extractor/scrolller.py b/gallery_dl/extractor/scrolller.py
new file mode 100644
index 0000000..9f9f0c4
--- /dev/null
+++ b/gallery_dl/extractor/scrolller.py
@@ -0,0 +1,227 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://scrolller.com/"""
+
+from .common import Extractor, Message
+from .. import text, util, exception
+from ..cache import cache
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?scrolller\.com"
+
+
+class ScrolllerExtractor(Extractor):
+    """Base class for scrolller extractors"""
+    category = "scrolller"
+    root = "https://scrolller.com"
+    directory_fmt = ("{category}", "{subredditTitle}")
+    filename_fmt = "{id}{title:? //}.{extension}"
+    archive_fmt = "{id}"
+    request_interval = (0.5, 1.5)
+
+    def _init(self):
+        self.auth_token = None
+
+    def items(self):
+        self.login()
+
+        for post in self.posts():
+
+            src = max(post["mediaSources"], key=self._sort_key)
+            post.update(src)
+            url = src["url"]
+            text.nameext_from_url(url, post)
+
+            yield Message.Directory, post
+            yield Message.Url, url, post
+
+    def posts(self):
+        return ()
+
+    def login(self):
+        username, password = self._get_auth_info()
+        if username:
+            self.auth_token = self._login_impl(username, password)
+
+    @cache(maxage=28*86400, keyarg=1)
+    def _login_impl(self, username, password):
+        self.log.info("Logging in as %s", username)
+
+        variables = {
+            "username": username,
+            "password": password,
+        }
+
+        try:
+            data = self._request_graphql("LoginQuery", variables)
+        except exception.HttpError as exc:
+            if exc.status == 403:
+                raise exception.AuthenticationError()
+            raise
+
+        return data["login"]["token"]
+
+    def _request_graphql(self, opname, variables):
+        url = "https://api.scrolller.com/api/v2/graphql"
+        headers = {
+            "Content-Type"  : "text/plain;charset=UTF-8",
+            "Origin"        : self.root,
+            "Sec-Fetch-Dest": "empty",
+            "Sec-Fetch-Mode": "cors",
+            "Sec-Fetch-Site": "same-site",
+        }
+        data = {
+            "query"        : QUERIES[opname],
+            "variables"    : variables,
+            "authorization": self.auth_token,
+        }
+        return self.request(
+            url, method="POST", headers=headers, data=util.json_dumps(data),
+        ).json()["data"]
+
+    def _pagination(self, opname, variables):
+        while True:
+            data = self._request_graphql(opname, variables)
+
+            while "items" not in data:
+                data = data.popitem()[1]
+            yield from data["items"]
+
+            if not data["iterator"]:
+                return
+            variables["iterator"] = data["iterator"]
+
+    def _sort_key(self, src):
+        return src["width"], not src["isOptimized"]
+
+
+class ScrolllerSubredditExtractor(ScrolllerExtractor):
+    """Extractor for media from a scrolller subreddit"""
+    subcategory = "subreddit"
+    pattern = BASE_PATTERN + r"(/r/[^/?#]+)(?:/?\?([^#]+))?"
+    example = "https://scrolller.com/r/SUBREDDIT"
+
+    def posts(self):
+        url, query = self.groups
+        filter = None
+
+        if query:
+            params = text.parse_query(query)
+            if "filter" in params:
+                filter = params["filter"].upper().rstrip("S")
+
+        variables = {
+            "url"      : url,
+            "iterator" : None,
+            "filter"   : filter,
+            "hostsDown": None,
+        }
+        return self._pagination("SubredditQuery", variables)
+
+
+class ScrolllerFollowingExtractor(ScrolllerExtractor):
+    """Extractor for followed scrolller subreddits"""
+    subcategory = "following"
+    pattern = BASE_PATTERN + r"/following"
+    example = "https://scrolller.com/following"
+
+    def items(self):
+        self.login()
+
+        if not self.auth_token:
+            raise exception.AuthorizationError("Login required")
+
+        variables = {
+            "iterator" : None,
+            "hostsDown": None,
+        }
+
+        for subreddit in self._pagination("FollowingQuery", variables):
+            url = self.root + subreddit["url"]
+            subreddit["_extractor"] = ScrolllerSubredditExtractor
+            yield Message.Queue, url, subreddit
+
+
+class ScrolllerPostExtractor(ScrolllerExtractor):
+    """Extractor for media from a single scrolller post"""
+    subcategory = "post"
+    pattern = BASE_PATTERN + r"/(?!r/|following$)([^/?#]+)"
+    example = "https://scrolller.com/title-slug-a1b2c3d4f5"
+
+    def posts(self):
+        url = "{}/{}".format(self.root, self.groups[0])
+        page = self.request(url).text
+        data = util.json_loads(text.extr(
+            page, '<script>window.scrolllerConfig="', '"</script>')
+            .replace('\\"', '"'))
+        return (data["item"],)
+
+
+QUERIES = {
+
+    "SubredditQuery": """\
+query SubredditQuery(
+    $url: String!
+    $filter: SubredditPostFilter
+    $iterator: String
+) {
+    getSubreddit(
+        url: $url
+    ) {
+        children(
+            limit: 50
+            iterator: $iterator
+            filter: $filter
+            disabledHosts: null
+        ) {
+            iterator items {
+                __typename id url title subredditId subredditTitle
+                subredditUrl redditPath isNsfw albumUrl hasAudio
+                fullLengthSource gfycatSource redgifsSource ownerAvatar
+                username displayName isPaid tags isFavorite
+                mediaSources { url width height isOptimized }
+                blurredMediaSources { url width height isOptimized }
+            }
+        }
+    }
+}
+""",
+
+    "FollowingQuery": """\
+query FollowingQuery(
+    $iterator: String
+) {
+    getFollowing(
+        limit: 10
+        iterator: $iterator
+    ) {
+        iterator items {
+            __typename id url title secondaryTitle description createdAt isNsfw
+            subscribers isComplete itemCount videoCount pictureCount albumCount
+            isPaid username tags isFollowing
+            banner { url width height isOptimized }
+        }
+    }
+}
+""",
+
+    "LoginQuery": """\
+query LoginQuery(
+    $username: String!,
+    $password: String!
+) {
+    login(
+        username: $username,
+        password: $password
+    ) {
+        username token expiresAt isAdmin status isPremium
+    }
+}
+""",
+
+}
```
```diff
diff --git a/gallery_dl/extractor/telegraph.py b/gallery_dl/extractor/telegraph.py
index dd5988f..468840b 100644
--- a/gallery_dl/extractor/telegraph.py
+++ b/gallery_dl/extractor/telegraph.py
@@ -49,7 +49,7 @@ class TelegraphGalleryExtractor(GalleryExtractor):
             url, pos = text.extract(figure, 'src="', '"')
             if url.startswith("/embed/"):
                 continue
-            elif url.startswith("/"):
+            elif url[0] == "/":
                 url = self.root + url
             caption, pos = text.extract(figure, "<figcaption>", "<", pos)
             num += 1
```

```diff
diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py
index bce661a..b196aeb 100644
--- a/gallery_dl/extractor/tsumino.py
+++ b/gallery_dl/extractor/tsumino.py
@@ -148,8 +148,10 @@ class TsuminoSearchExtractor(TsuminoBase, Extractor):
             data["PageNumber"] += 1
 
     def _parse(self, query):
+        if not query:
+            return {}
         try:
-            if query.startswith("?"):
+            if query[0] == "?":
                 return self._parse_simple(query)
             return self._parse_jsurl(query)
         except Exception as exc:
@@ -187,8 +189,6 @@ class TsuminoSearchExtractor(TsuminoBase, Extractor):
         Example: ~(name~'John*20Doe~age~42~children~(~'Mary~'Bill))
         Ref: https://github.com/Sage/jsurl
         """
-        if not data:
-            return {}
         i = 0
         imax = len(data)
```
```diff
diff --git a/gallery_dl/extractor/urlgalleries.py b/gallery_dl/extractor/urlgalleries.py
index b21709a..f7ce44b 100644
--- a/gallery_dl/extractor/urlgalleries.py
+++ b/gallery_dl/extractor/urlgalleries.py
@@ -7,7 +7,7 @@
 """Extractors for https://urlgalleries.net/"""
 
 from .common import GalleryExtractor, Message
-from .. import text
+from .. import text, exception
 
 
 class UrlgalleriesGalleryExtractor(GalleryExtractor):
@@ -16,27 +16,31 @@ class UrlgalleriesGalleryExtractor(GalleryExtractor):
     root = "urlgalleries.net"
     request_interval = (0.5, 1.0)
     pattern = r"(?:https?://)(?:(\w+)\.)?urlgalleries\.net/(?:[\w-]+-)?(\d+)"
-    example = "https://blog.urlgalleries.net/gallery-12345/TITLE"
+    example = "https://BLOG.urlgalleries.net/gallery-12345/TITLE"
 
-    def __init__(self, match):
-        self.blog, self.gallery_id = match.groups()
+    def items(self):
+        blog, self.gallery_id = self.groups
         url = "https://{}.urlgalleries.net/porn-gallery-{}/?a=10000".format(
-            self.blog, self.gallery_id)
-        GalleryExtractor.__init__(self, match, url)
+            blog, self.gallery_id)
+
+        with self.request(url, allow_redirects=False, fatal=...) as response:
+            if 300 <= response.status_code < 500:
+                if response.headers.get("location", "").endswith(
+                        "/not_found_adult.php"):
+                    raise exception.NotFoundError("gallery")
+                raise exception.HttpError(None, response)
+            page = response.text
 
-    def items(self):
-        page = self.request(self.gallery_url).text
         imgs = self.images(page)
         data = self.metadata(page)
         data["count"] = len(imgs)
-        del page
 
-        root = "https://{}.urlgalleries.net".format(self.blog)
+        root = "https://{}.urlgalleries.net".format(blog)
         yield Message.Directory, data
         for data["num"], img in enumerate(imgs, 1):
-            response = self.request(
-                root + img, method="HEAD", allow_redirects=False)
-            yield Message.Queue, response.headers["Location"], data
+            page = self.request(root + img).text
+            url = text.extr(page, "window.location.href = '", "'")
+            yield Message.Queue, url, data
 
     def metadata(self, page):
         extr = text.extract_from(page)
```

```diff
diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py
index 95eeafe..ea034a7 100644
--- a/gallery_dl/extractor/vk.py
+++ b/gallery_dl/extractor/vk.py
@@ -24,6 +24,13 @@ class VkExtractor(Extractor):
     root = "https://vk.com"
     request_interval = (0.5, 1.5)
 
+    def _init(self):
+        self.offset = text.parse_int(self.config("offset"))
+
+    def skip(self, num):
+        self.offset += num
+        return num
+
     def items(self):
         sub = re.compile(r"/imp[fg]/").sub
         sizes = "wzyxrqpo"
@@ -75,7 +82,7 @@ class VkExtractor(Extractor):
             "al"       : "1",
             "direction": "1",
             "list"     : photos_id,
-            "offset"   : 0,
+            "offset"   : self.offset,
         }
 
         while True:
```

```diff
diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py
index 116f557..4eae537 100644
--- a/gallery_dl/extractor/wikimedia.py
+++ b/gallery_dl/extractor/wikimedia.py
@@ -193,7 +193,10 @@ class WikimediaArticleExtractor(WikimediaExtractor):
     def __init__(self, match):
         WikimediaExtractor.__init__(self, match)
 
-        path = match.group(match.lastindex)
+        path = self.groups[-1]
+        if path[2] == "/":
+            self.root = self.root + "/" + path[:2]
+            path = path[3:]
 
         if path.startswith("wiki/"):
             path = path[5:]
```
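The wikimedia change recognizes language-prefixed paths such as `fr/wiki/...` and moves the two-letter code into the site root. A sketch of that path handling (example domain made up; like the hunk, it assumes the path is at least three characters long):

```python
def split_language(root, path):
    """Move a two-letter language prefix into the site root and strip
    a leading 'wiki/' segment, mirroring the hunk above."""
    if path[2] == "/":            # e.g. "fr/wiki/Paris"
        root = root + "/" + path[:2]
        path = path[3:]
    if path.startswith("wiki/"):
        path = path[5:]
    return root, path

print(split_language("https://www.wikimedia.org", "fr/wiki/Paris"))
# ('https://www.wikimedia.org/fr', 'Paris')
```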
