From 0532a387ef5b7fcb4507a9b094dca37a5f635fe1 Mon Sep 17 00:00:00 2001 From: Unit 193 Date: Sun, 12 Jan 2025 21:27:05 -0500 Subject: New upstream version 1.28.4. --- gallery_dl/extractor/__init__.py | 2 + gallery_dl/extractor/bbc.py | 3 +- gallery_dl/extractor/bunkr.py | 36 ++++--- gallery_dl/extractor/cien.py | 7 +- gallery_dl/extractor/common.py | 14 ++- gallery_dl/extractor/e621.py | 2 +- gallery_dl/extractor/imagefap.py | 6 +- gallery_dl/extractor/mangapark.py | 3 +- gallery_dl/extractor/patreon.py | 15 ++- gallery_dl/extractor/pexels.py | 189 ++++++++++++++++++++++++++++++++++++ gallery_dl/extractor/pixiv.py | 21 +++- gallery_dl/extractor/plurk.py | 16 +-- gallery_dl/extractor/slideshare.py | 5 +- gallery_dl/extractor/wallhaven.py | 23 ++++- gallery_dl/extractor/weebcentral.py | 136 ++++++++++++++++++++++++++ gallery_dl/option.py | 8 +- gallery_dl/util.py | 2 +- gallery_dl/version.py | 2 +- 18 files changed, 429 insertions(+), 61 deletions(-) create mode 100644 gallery_dl/extractor/pexels.py create mode 100644 gallery_dl/extractor/weebcentral.py (limited to 'gallery_dl') diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index d003a61..b582c99 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -124,6 +124,7 @@ modules = [ "nsfwalbum", "paheal", "patreon", + "pexels", "philomena", "photovogue", "picarto", @@ -190,6 +191,7 @@ modules = [ "weasyl", "webmshare", "webtoons", + "weebcentral", "weibo", "wikiart", "wikifeet", diff --git a/gallery_dl/extractor/bbc.py b/gallery_dl/extractor/bbc.py index 54aaac4..113a669 100644 --- a/gallery_dl/extractor/bbc.py +++ b/gallery_dl/extractor/bbc.py @@ -26,8 +26,7 @@ class BbcGalleryExtractor(GalleryExtractor): example = "https://www.bbc.co.uk/programmes/PATH" def metadata(self, page): - data = util.json_loads(text.extr( - page, '')) + data = self._extract_jsonld(page) return { "programme": self.gallery_url.split("/")[4], "path": list(util.unique_sequence( diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 3e12452..e1ee50d 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -80,6 +80,9 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): # redirect url = response.headers["Location"] + if url[0] == "/": + url = self.root + url + continue root, path = self._split(url) if root not in CF_DOMAINS: continue @@ -105,37 +108,40 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): "All Bunkr domains require solving a CF challenge") # select alternative domain - root = "https://" + random.choice(DOMAINS) + self.root = root = "https://" + random.choice(DOMAINS) self.log.debug("Trying '%s' as fallback", root) url = root + path def fetch_album(self, album_id): # album metadata - page = self.request(self.root + "/a/" + album_id).text - title, size = text.split_html(text.extr( - page, "").partition(">")[2]) - if "&" in title: - title = title.replace( - "<", "<").replace(">", ">").replace("&", "&") + page = self.request( + self.root + "/a/" + album_id, encoding="utf-8").text + title = text.unescape(text.unescape(text.extr( + page, 'property="og:title" content="', '"'))) # files - items = list(text.extract_iter(page, "", "")) + items = list(text.extract_iter( + page, '
")) + return self._extract_files(items), { "album_id" : album_id, "album_name" : title, - "album_size" : text.extr(size, "(", ")"), + "album_size" : text.extr( + page, '(', ')'), "count" : len(items), } def _extract_files(self, items): for item in items: try: - url = text.extr(item, ' href="', '"') - file = self._extract_file(text.unescape(url)) + url = text.unescape(text.extr(item, ' href="', '"')) + if url[0] == "/": + url = self.root + url + file = self._extract_file(url) info = text.split_html(item) - file["name"] = info[0] - file["size"] = info[2] + file["name"] = info[-3] + file["size"] = info[-2] file["date"] = text.parse_datetime( info[-1], "%H:%M:%S %d/%m/%Y") @@ -179,8 +185,8 @@ class BunkrMediaExtractor(BunkrAlbumExtractor): """Extractor for bunkr.si media links""" subcategory = "media" directory_fmt = ("{category}",) - pattern = BASE_PATTERN + r"(/[vid]/[^/?#]+)" - example = "https://bunkr.si/v/FILENAME" + pattern = BASE_PATTERN + r"(/[fvid]/[^/?#]+)" + example = "https://bunkr.si/f/FILENAME" def fetch_album(self, album_id): try: diff --git a/gallery_dl/extractor/cien.py b/gallery_dl/extractor/cien.py index 378365e..27d50e7 100644 --- a/gallery_dl/extractor/cien.py +++ b/gallery_dl/extractor/cien.py @@ -9,7 +9,7 @@ """Extractors for https://ci-en.net/""" from .common import Extractor, Message -from .. import text, util +from .. import text BASE_PATTERN = r"(?:https?://)?ci-en\.(?:net|dlsite\.com)" @@ -56,11 +56,8 @@ class CienArticleExtractor(CienExtractor): self.root, self.groups[0], self.groups[1]) page = self.request(url, notfound="article").text - post = util.json_loads(text.extr( - page, ''))[0] - files = self._extract_files(page) - + post = self._extract_jsonld(page)[0] post["post_url"] = url post["post_id"] = text.parse_int(self.groups[1]) post["count"] = len(files) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 5ada030..13fd88a 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -587,6 +587,14 @@ class Extractor(): return True return False + def _extract_jsonld(self, page): + return util.json_loads(text.extr( + page, '")) + + def _extract_nextdata(self, page): + return util.json_loads(text.extr( + page, ' id="__NEXT_DATA__" type="application/json">', "")) + def _prepare_ddosguard_cookies(self): if not self.cookies.get("__ddg2", domain=self.cookies_domain): self.cookies.set( @@ -772,7 +780,11 @@ class MangaExtractor(Extractor): def items(self): self.login() - page = self.request(self.manga_url).text + + if self.manga_url: + page = self.request(self.manga_url, notfound=self.subcategory).text + else: + page = None chapters = self.chapters(page) if self.reverse: diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py index 4a6624d..33e6ba8 100644 --- a/gallery_dl/extractor/e621.py +++ b/gallery_dl/extractor/e621.py @@ -90,7 +90,7 @@ BASE_PATTERN = E621Extractor.update({ class E621TagExtractor(E621Extractor, danbooru.DanbooruTagExtractor): """Extractor for e621 posts from tag searches""" - pattern = BASE_PATTERN + r"/posts?(?:\?.*?tags=|/index/\d+/)([^&#]+)" + pattern = BASE_PATTERN + r"/posts?(?:\?[^#]*?tags=|/index/\d+/)([^&#]*)" example = "https://e621.net/posts?tags=TAG" diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index 28590fc..dd5220d 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -9,7 +9,7 @@ """Extractors for https://www.imagefap.com/""" from .common import Extractor, Message -from .. import text, util, exception +from .. import text, exception BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?imagefap\.com" @@ -129,13 +129,11 @@ class ImagefapImageExtractor(ImagefapExtractor): url, pos = text.extract( page, 'original="', '"') - info, pos = text.extract( - page, '', pos) image_id, pos = text.extract( page, 'id="imageid_input" value="', '"', pos) gallery_id, pos = text.extract( page, 'id="galleryid_input" value="', '"', pos) - info = util.json_loads(info) + info = self._extract_jsonld(page) return url, text.nameext_from_url(url, { "title": text.unescape(info["name"]), diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py index 63aaf91..6f7a238 100644 --- a/gallery_dl/extractor/mangapark.py +++ b/gallery_dl/extractor/mangapark.py @@ -43,8 +43,7 @@ class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor): ChapterExtractor.__init__(self, match, url) def metadata(self, page): - data = util.json_loads(text.extr( - page, 'id="__NEXT_DATA__" type="application/json">', '<')) + data = self._extract_nextdata(page) chapter = (data["props"]["pageProps"]["dehydratedState"] ["queries"][0]["state"]["data"]["data"]) manga = chapter["comicNode"]["data"] diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index e4a5985..866e93a 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -286,15 +286,12 @@ class PatreonExtractor(Extractor): return [genmap[ft] for ft in filetypes] def _extract_bootstrap(self, page): - data = text.extr( - page, 'id="__NEXT_DATA__" type="application/json">', '= pagination["total_pages"]: + return + params["page"] = pagination["current_page"] + 1 diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 6207bf7..d3e40ee 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -111,6 +111,7 @@ class PixivExtractor(Extractor): { "url" : img["image_urls"]["original"], "suffix": "_p{:02}".format(num), + "_fallback": self._fallback_image(img), } for num, img in enumerate(meta_pages) ] @@ -128,7 +129,7 @@ class PixivExtractor(Extractor): self.log.warning("%s: 'My pixiv' locked", work["id"]) elif work["type"] != "ugoira": - return ({"url": url},) + return ({"url": url, "_fallback": self._fallback_image(url)},) elif self.load_ugoira: try: @@ -269,6 +270,24 @@ class PixivExtractor(Extractor): except exception.HttpError: pass + def _fallback_image(self, src): + if isinstance(src, str): + urls = None + orig = src + else: + urls = src["image_urls"] + orig = urls["original"] + + base = orig.rpartition(".")[0] + yield base.replace("-original/", "-master/", 1) + "_master1200.jpg" + + if urls is None: + return + + for fmt in ("large", "medium", "square_medium"): + if fmt in urls: + yield urls[fmt] + @staticmethod def _date_from_url(url, offset=timedelta(hours=9)): try: diff --git a/gallery_dl/extractor/plurk.py b/gallery_dl/extractor/plurk.py index be0dbde..0bacd54 100644 --- a/gallery_dl/extractor/plurk.py +++ b/gallery_dl/extractor/plurk.py @@ -104,16 +104,16 @@ class PlurkPostExtractor(PlurkExtractor): pattern = r"(?:https?://)?(?:www\.)?plurk\.com/p/(\w+)" example = "https://www.plurk.com/p/12345" - def __init__(self, match): - PlurkExtractor.__init__(self, match) - self.plurk_id = match.group(1) - def plurks(self): - url = "{}/p/{}".format(self.root, self.plurk_id) + url = "{}/p/{}".format(self.root, self.groups[0]) page = self.request(url).text - user, pos = text.extract(page, " GLOBAL = ", "\n") - data, pos = text.extract(page, "plurk = ", ";\n", pos) + user, pos = text.extract(page, " GLOBAL=", "\n") + data, pos = text.extract(page, "plurk =", ";\n", pos) data = self._load(data) - data["user"] = self._load(user)["page_user"] + try: + data["user"] = self._load(user)["page_user"] + except Exception: + self.log.warning("%s: Failed to extract 'user' data", + self.groups[0]) return (data,) diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py index e5e7a6b..0722d23 100644 --- a/gallery_dl/extractor/slideshare.py +++ b/gallery_dl/extractor/slideshare.py @@ -10,7 +10,7 @@ """Extractors for https://www.slideshare.net/""" from .common import GalleryExtractor -from .. import text, util +from .. import text class SlidesharePresentationExtractor(GalleryExtractor): @@ -31,8 +31,7 @@ class SlidesharePresentationExtractor(GalleryExtractor): GalleryExtractor.__init__(self, match, url) def metadata(self, page): - data = util.json_loads(text.extr( - page, 'id="__NEXT_DATA__" type="application/json">', '')) + data = self._extract_nextdata(page) self.slideshow = slideshow = data["props"]["pageProps"]["slideshow"] return { diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py index 479e8a8..e5b764a 100644 --- a/gallery_dl/extractor/wallhaven.py +++ b/gallery_dl/extractor/wallhaven.py @@ -54,7 +54,7 @@ class WallhavenExtractor(Extractor): class WallhavenSearchExtractor(WallhavenExtractor): """Extractor for search results on wallhaven.cc""" subcategory = "search" - directory_fmt = ("{category}", "{search[q]}") + directory_fmt = ("{category}", "{search[tags]}") archive_fmt = "s_{search[q]}_{id}" pattern = r"(?:https?://)?wallhaven\.cc/search(?:/?\?([^#]+))?" example = "https://wallhaven.cc/search?q=QUERY" @@ -64,7 +64,7 @@ class WallhavenSearchExtractor(WallhavenExtractor): self.params = text.parse_query(match.group(1)) def wallpapers(self): - return self.api.search(self.params.copy()) + return self.api.search(self.params) def metadata(self): return {"search": self.params} @@ -141,7 +141,7 @@ class WallhavenUploadsExtractor(WallhavenExtractor): def wallpapers(self): params = {"q": "@" + self.username} - return self.api.search(params.copy()) + return self.api.search(params) def metadata(self): return {"username": self.username} @@ -215,20 +215,35 @@ class WallhavenAPI(): def _pagination(self, endpoint, params=None, metadata=None): if params is None: + params_ptr = None params = {} + else: + params_ptr = params + params = params.copy() if metadata is None: metadata = self.extractor.config("metadata") while True: data = self._call(endpoint, params) + meta = data.get("meta") + if params_ptr is not None: + if meta and "query" in meta: + query = meta["query"] + if isinstance(query, dict): + params_ptr["tags"] = query.get("tag") + params_ptr["tag_id"] = query.get("id") + else: + params_ptr["tags"] = query + params_ptr["tag_id"] = 0 + params_ptr = None + if metadata: for wp in data["data"]: yield self.info(str(wp["id"])) else: yield from data["data"] - meta = data.get("meta") if not meta or meta["current_page"] >= meta["last_page"]: return params["page"] = meta["current_page"] + 1 diff --git a/gallery_dl/extractor/weebcentral.py b/gallery_dl/extractor/weebcentral.py new file mode 100644 index 0000000..39f998a --- /dev/null +++ b/gallery_dl/extractor/weebcentral.py @@ -0,0 +1,136 @@ +# -*- coding: utf-8 -*- + +# Copyright 2025 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://weebcentral.com/""" + +from .common import ChapterExtractor, MangaExtractor +from .. import text +from ..cache import memcache + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?weebcentral\.com" + + +class WeebcentralBase(): + category = "weebcentral" + root = "https://weebcentral.com" + request_interval = (0.5, 1.5) + + @memcache(keyarg=1) + def _extract_manga_data(self, manga_id): + url = "{}/series/{}".format(self.root, manga_id) + page = self.request(url).text + extr = text.extract_from(page) + + return { + "manga_id": manga_id, + "lang" : "en", + "language": "English", + "manga" : text.unescape(extr("", " | Weeb Central")), + "author" : text.split_html(extr("<strong>Author", "</li>"))[1::2], + "tags" : text.split_html(extr("<strong>Tag", "</li>"))[1::2], + "type" : text.remove_html(extr("<strong>Type: ", "</li>")), + "status" : text.remove_html(extr("<strong>Status: ", "</li>")), + "release" : text.remove_html(extr("<strong>Released: ", "</li>")), + "official": ">Yes" in extr("<strong>Official Translatio", "</li>"), + "description": text.unescape(text.remove_html(extr( + "<strong>Description", "</li>"))), + } + + +class WeebcentralChapterExtractor(WeebcentralBase, ChapterExtractor): + """Extractor for manga chapters from weebcentral.com""" + pattern = BASE_PATTERN + r"(/chapters/(\w+))" + example = "https://weebcentral.com/chapters/01JHABCDEFGHIJKLMNOPQRSTUV" + + def metadata(self, page): + extr = text.extract_from(page) + manga_id = extr("'series_id': '", "'") + + data = self._extract_manga_data(manga_id) + data["chapter_id"] = self.groups[1] + data["chapter_type"] = extr("'chapter_type': '", "'") + + chapter, sep, minor = extr("'number': '", "'").partition(".") + data["chapter"] = text.parse_int(chapter) + data["chapter_minor"] = sep + minor + + return data + + def images(self, page): + referer = self.gallery_url + url = referer + "/images" + params = { + "is_prev" : "False", + "current_page" : "1", + "reading_style": "long_strip", + } + headers = { + "Accept" : "*/*", + "Referer" : referer, + "HX-Request" : "true", + "HX-Current-URL": referer, + } + page = self.request(url, params=params, headers=headers).text + extr = text.extract_from(page) + + results = [] + while True: + src = extr(' src="', '"') + if not src: + break + results.append((src, { + "width" : text.parse_int(extr(' width="' , '"')), + "height": text.parse_int(extr(' height="', '"')), + })) + return results + + +class WeebcentralMangaExtractor(WeebcentralBase, MangaExtractor): + """Extractor for manga from weebcentral.com""" + chapterclass = WeebcentralChapterExtractor + pattern = BASE_PATTERN + r"/series/(\w+)" + example = "https://weebcentral.com/series/01J7ABCDEFGHIJKLMNOPQRSTUV/TITLE" + + def __init__(self, match): + MangaExtractor.__init__(self, match, False) + + def chapters(self, _): + manga_id = self.groups[0] + referer = "{}/series/{}".format(self.root, manga_id) + url = referer + "/full-chapter-list" + headers = { + "Accept" : "*/*", + "Referer" : referer, + "HX-Request" : "true", + "HX-Target" : "chapter-list", + "HX-Current-URL": referer, + } + page = self.request(url, headers=headers).text + extr = text.extract_from(page) + data = self._extract_manga_data(manga_id) + base = self.root + "/chapters/" + + results = [] + while True: + chapter_id = extr("/chapters/", '"') + if not chapter_id: + break + type, _, chapter = extr('<span class="">', "<").partition(" ") + chapter, sep, minor = chapter.partition(".") + + chapter = { + "chapter_id" : chapter_id, + "chapter" : text.parse_int(chapter), + "chapter_minor": sep + minor, + "chapter_type" : type, + "date" : text.parse_datetime( + extr(' datetime="', '"')[:-5], "%Y-%m-%dT%H:%M:%S"), + } + chapter.update(data) + results.append((base + chapter_id, chapter)) + return results diff --git a/gallery_dl/option.py b/gallery_dl/option.py index a3f78e5..222679a 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -323,7 +323,7 @@ def build_parser(): input.add_argument( "--no-input", dest="input", nargs=0, action=ConfigConstAction, const=False, - help=("Do not prompt for passwords/tokens"), + help="Do not prompt for passwords/tokens", ) output = parser.add_argument_group("Output Options") @@ -406,7 +406,7 @@ def build_parser(): ) output.add_argument( "--list-extractors", - dest="list_extractors", metavar="CATEGORIES", nargs="*", + dest="list_extractors", metavar="[CATEGORIES]", nargs="*", help=("Print a list of extractor classes " "with description, (sub)category and example URL"), ) @@ -430,12 +430,12 @@ def build_parser(): output.add_argument( "--print-traffic", dest="print_traffic", action="store_true", - help=("Display sent and read HTTP traffic"), + help="Display sent and read HTTP traffic", ) output.add_argument( "--no-colors", dest="colors", action="store_false", - help=("Do not emit ANSI color codes in output"), + help="Do not emit ANSI color codes in output", ) networking = parser.add_argument_group("Networking Options") diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 72ec98e..2302088 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -83,7 +83,7 @@ def unique_sequence(iterable): def contains(values, elements, separator=" "): """Returns True if at least one of 'elements' is contained in 'values'""" - if isinstance(values, str): + if isinstance(values, str) and (separator or separator is None): values = values.split(separator) if not isinstance(elements, (tuple, list)): diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 4b28924..6bceebd 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,5 +6,5 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.28.3" +__version__ = "1.28.4" __variant__ = None -- cgit v1.2.3