From 0532a387ef5b7fcb4507a9b094dca37a5f635fe1 Mon Sep 17 00:00:00 2001
From: Unit 193
Date: Sun, 12 Jan 2025 21:27:05 -0500
Subject: New upstream version 1.28.4.

---
 CHANGELOG.md                        |  32 +++---
 PKG-INFO                            |  22 ++++-
 README.rst                          |   4 +-
 data/completion/_gallery-dl         |   2 +-
 data/man/gallery-dl.1               |   4 +-
 data/man/gallery-dl.conf.5          |   7 +-
 docs/gallery-dl.conf                |  15 ++-
 gallery_dl.egg-info/PKG-INFO        |  22 ++++-
 gallery_dl.egg-info/SOURCES.txt     |   2 +
 gallery_dl/extractor/__init__.py    |   2 +
 gallery_dl/extractor/bbc.py         |   3 +-
 gallery_dl/extractor/bunkr.py       |  36 ++++---
 gallery_dl/extractor/cien.py        |   7 +-
 gallery_dl/extractor/common.py      |  14 ++-
 gallery_dl/extractor/e621.py        |   2 +-
 gallery_dl/extractor/imagefap.py    |   6 +-
 gallery_dl/extractor/mangapark.py   |   3 +-
 gallery_dl/extractor/patreon.py     |  15 ++-
 gallery_dl/extractor/pexels.py      | 189 ++++++++++++++++++++++++++++++++++++
 gallery_dl/extractor/pixiv.py       |  21 +++-
 gallery_dl/extractor/plurk.py       |  16 +--
 gallery_dl/extractor/slideshare.py  |   5 +-
 gallery_dl/extractor/wallhaven.py   |  23 ++++-
 gallery_dl/extractor/weebcentral.py | 136 ++++++++++++++++++++++++++
 gallery_dl/option.py                |   8 +-
 gallery_dl/util.py                  |   2 +-
 gallery_dl/version.py               |   2 +-
 test/test_util.py                   |   9 ++
 28 files changed, 511 insertions(+), 98 deletions(-)
 create mode 100644 gallery_dl/extractor/pexels.py
 create mode 100644 gallery_dl/extractor/weebcentral.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7fc97ba..2c7e627 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,26 +1,16 @@
-## 1.28.3 - 2025-01-04
+## 1.28.4 - 2025-01-12
 ### Extractors
 #### Additions
-- [civitai] add `user-videos` extractor ([#6644](https://github.com/mikf/gallery-dl/issues/6644))
-- [szurubooru] support `visuabusters.com/booru` ([#6729](https://github.com/mikf/gallery-dl/issues/6729))
+- [pexels] add support ([#2286](https://github.com/mikf/gallery-dl/issues/2286), [#4214](https://github.com/mikf/gallery-dl/issues/4214), [#6769](https://github.com/mikf/gallery-dl/issues/6769))
+- [weebcentral] add support ([#6778](https://github.com/mikf/gallery-dl/issues/6778))
 #### Fixes
-- [8muses] skip albums without valid `permalink` ([#6717](https://github.com/mikf/gallery-dl/issues/6717))
-- [batoto] update domains ([#6714](https://github.com/mikf/gallery-dl/issues/6714))
-- [deviantart:tiptap] fix deviation embeds without `token`
-- [hitomi] fix searches ([#6713](https://github.com/mikf/gallery-dl/issues/6713))
-- [instagram:reels] fix `pinned` values ([#6719](https://github.com/mikf/gallery-dl/issues/6719))
-- [kemonoparty] handle `discord` favorites ([#6706](https://github.com/mikf/gallery-dl/issues/6706))
-- [piczel] fix extraction ([#6735](https://github.com/mikf/gallery-dl/issues/6735))
-- [poipiku] fix downloads when post has a warning ([#6736](https://github.com/mikf/gallery-dl/issues/6736))
-- [sankaku] support alphanumeric book/pool IDs ([#6757](https://github.com/mikf/gallery-dl/issues/6757))
-- [subscribestar] fix attachment downloads ([#6721](https://github.com/mikf/gallery-dl/issues/6721), [#6724](https://github.com/mikf/gallery-dl/issues/6724), [#6758](https://github.com/mikf/gallery-dl/issues/6758))
-- [subscribestar] improve `content` metadata extraction ([#6761](https://github.com/mikf/gallery-dl/issues/6761))
-- [tapas] fix `TypeError` for locked episodes ([#6700](https://github.com/mikf/gallery-dl/issues/6700))
+- [bunkr] update to new site layout ([#6798](https://github.com/mikf/gallery-dl/issues/6798), [#6805](https://github.com/mikf/gallery-dl/issues/6805))
+- [bunkr] fix `ValueError` on relative redirects ([#6790](https://github.com/mikf/gallery-dl/issues/6790))
+- [plurk] fix `user` data extraction and make it non-fatal ([#6742](https://github.com/mikf/gallery-dl/issues/6742))
 #### Improvements
-- [boosty] support `file` post attachments ([#6760](https://github.com/mikf/gallery-dl/issues/6760))
-- [deviantart:tiptap] support more content block types ([#6686](https://github.com/mikf/gallery-dl/issues/6686))
-- [directlink] use domain as `subcategory` ([#6703](https://github.com/mikf/gallery-dl/issues/6703))
-- [hitomi] provide `search_tags` metadata for `tag` and `search` results ([#6756](https://github.com/mikf/gallery-dl/issues/6756))
-- [subscribestar] support `audio` files ([#6758](https://github.com/mikf/gallery-dl/issues/6758))
+- [bunkr] support `/f/` media URLs
+- [e621] accept `tag` search URLs with empty tag ([#6783](https://github.com/mikf/gallery-dl/issues/6783))
+- [pixiv] provide fallback URLs ([#6762](https://github.com/mikf/gallery-dl/issues/6762))
+- [wallhaven] extract `search[tags]` and `search[tag_id]` metadata ([#6772](https://github.com/mikf/gallery-dl/issues/6772))
 ### Miscellaneous
-- [workflows:executables] build with Python 3.13
+- [util] support not splitting `value` argument when calling `contains()` ([#6773](https://github.com/mikf/gallery-dl/issues/6773))
diff --git a/PKG-INFO b/PKG-INFO
index ecc3fc2..2d2156a 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: gallery_dl
-Version: 1.28.3
+Version: 1.28.4
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -38,6 +38,20 @@ License-File: LICENSE
 Requires-Dist: requests>=2.11.0
 Provides-Extra: video
 Requires-Dist: youtube-dl; extra == "video"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: download-url
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: maintainer
+Dynamic: maintainer-email
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 ==========
 gallery-dl
@@ -117,9 +131,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.28.3/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.28.4/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.28.3/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.28.4/gallery-dl.bin>`__
 
 
 Nightly Builds
diff --git a/README.rst b/README.rst
index 6ed729b..2a1a3c2 100644
--- a/README.rst
+++ b/README.rst
@@ -76,9 +76,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.28.3/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.28.4/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.28.3/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.28.4/gallery-dl.bin>`__
 
 
 Nightly Builds
diff --git a/data/completion/_gallery-dl b/data/completion/_gallery-dl
index 073ac05..99fb8ad 100644
--- a/data/completion/_gallery-dl
+++ b/data/completion/_gallery-dl
@@ -32,7 +32,7 @@ _arguments -s -S \
 {-N,--print}'[Write FORMAT during EVENT (default '\''prepare'\'') to standard output. Examples: '\''id'\'' or '\''post:{md5\[:8\]}'\'']':'<[event:]format>' \
 --print-to-file'[Append FORMAT during EVENT to FILE]':'<[event:]format file>' \
 --list-modules'[Print a list of available extractor modules]' \
---list-extractors'[Print a list of extractor classes with description, (sub)category and example URL]':'<categories>' \
+--list-extractors'[Print a list of extractor classes with description, (sub)category and example URL]':'<[categories]>' \
 --write-log'[Write logging output to FILE]':'<file>':_files \
 --write-unsupported'[Write URLs, which get emitted by other extractors but cannot be handled, to FILE]':'<file>':_files \
 --write-pages'[Write downloaded intermediary pages to files in the current directory to debug problems]' \
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index f4791df..ff83690 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2025-01-04" "1.28.3" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2025-01-12" "1.28.4" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
 
@@ -98,7 +98,7 @@ Append FORMAT during EVENT to FILE
 .B "\-\-list\-modules"
 Print a list of available extractor modules
 .TP
-.B "\-\-list\-extractors" \f[I]CATEGORIES\f[]
+.B "\-\-list\-extractors" \f[I][CATEGORIES]\f[]
 Print a list of extractor classes with description, (sub)category and example URL
 .TP
 .B "\-\-write\-log" \f[I]FILE\f[]
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index 7028b7a..9ed6d97 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2025-01-04" "1.28.3" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2025-01-12" "1.28.4" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
 .\" disable justification (adjust text to left margin only)
@@ -471,13 +471,18 @@ response before \f[I]retrying\f[] the request.
 \f[I]soundgasm\f[],
 \f[I]urlgalleries\f[],
 \f[I]vk\f[],
+\f[I]weebcentral\f[],
 \f[I]zerochan\f[]
 .br
 * \f[I]"1.0-2.0"\f[]
 \f[I]flickr\f[],
+\f[I]pexels\f[],
 \f[I]weibo\f[],
 \f[I][wikimedia]\f[]
 .br
+* \f[I]"1.4"\f[]
+\f[I]wallhaven\f[]
+.br
 * \f[I]"2.0-4.0"\f[]
 \f[I]behance\f[],
 \f[I]imagefap\f[],
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index 3d73869..0d0c412 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -128,7 +128,7 @@
     },
     "bilibili":
     {
-        "sleep-request": "2.0-4.0"
+        "sleep-request": "3.0-6.0"
     },
     "bluesky":
     {
@@ -435,7 +435,12 @@
     {
         "cookies": null,
-        "files"  : ["images", "image_large", "attachments", "postfile", "content"]
+        "files"  : ["images", "image_large", "attachments", "postfile", "content"],
+        "format-images": "download_url"
+    },
+    "pexels":
+    {
+        "sleep-request": "1.0-2.0"
     },
     "pillowfort":
     {
@@ -691,6 +696,8 @@
     "wallhaven":
     {
         "api-key" : null,
+        "sleep-request": "1.4",
+        "include" : ["uploads"],
         "metadata": false
     },
@@ -699,6 +706,10 @@
         "api-key" : null,
         "metadata": false
     },
+    "weebcentral":
+    {
+        "sleep-request": "0.5-1.5"
+    },
     "weibo":
     {
         "sleep-request": "1.0-2.0",
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index ecc3fc2..2d2156a 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: gallery_dl
-Version: 1.28.3
+Version: 1.28.4
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -38,6 +38,20 @@ License-File: LICENSE
 Requires-Dist: requests>=2.11.0
 Provides-Extra: video
 Requires-Dist: youtube-dl; extra == "video"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: download-url
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: maintainer
+Dynamic: maintainer-email
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 ==========
 gallery-dl
@@ -117,9 +131,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.28.3/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.28.4/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.28.3/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.28.4/gallery-dl.bin>`__
 
 
 Nightly Builds
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index 42dd483..2656948 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -171,6 +171,7 @@ gallery_dl/extractor/nsfwalbum.py
 gallery_dl/extractor/oauth.py
 gallery_dl/extractor/paheal.py
 gallery_dl/extractor/patreon.py
+gallery_dl/extractor/pexels.py
 gallery_dl/extractor/philomena.py
 gallery_dl/extractor/photovogue.py
 gallery_dl/extractor/picarto.py
@@ -239,6 +240,7 @@ gallery_dl/extractor/warosu.py
 gallery_dl/extractor/weasyl.py
 gallery_dl/extractor/webmshare.py
 gallery_dl/extractor/webtoons.py
+gallery_dl/extractor/weebcentral.py
 gallery_dl/extractor/weibo.py
 gallery_dl/extractor/wikiart.py
 gallery_dl/extractor/wikifeet.py
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index d003a61..b582c99 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -124,6 +124,7 @@ modules = [
     "nsfwalbum",
     "paheal",
     "patreon",
+    "pexels",
     "philomena",
     "photovogue",
     "picarto",
@@ -190,6 +191,7 @@ modules = [
     "weasyl",
     "webmshare",
     "webtoons",
+    "weebcentral",
     "weibo",
     "wikiart",
"wikifeet", diff --git a/gallery_dl/extractor/bbc.py b/gallery_dl/extractor/bbc.py index 54aaac4..113a669 100644 --- a/gallery_dl/extractor/bbc.py +++ b/gallery_dl/extractor/bbc.py @@ -26,8 +26,7 @@ class BbcGalleryExtractor(GalleryExtractor): example = "https://www.bbc.co.uk/programmes/PATH" def metadata(self, page): - data = util.json_loads(text.extr( - page, '')) + data = self._extract_jsonld(page) return { "programme": self.gallery_url.split("/")[4], "path": list(util.unique_sequence( diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 3e12452..e1ee50d 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -80,6 +80,9 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): # redirect url = response.headers["Location"] + if url[0] == "/": + url = self.root + url + continue root, path = self._split(url) if root not in CF_DOMAINS: continue @@ -105,37 +108,40 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): "All Bunkr domains require solving a CF challenge") # select alternative domain - root = "https://" + random.choice(DOMAINS) + self.root = root = "https://" + random.choice(DOMAINS) self.log.debug("Trying '%s' as fallback", root) url = root + path def fetch_album(self, album_id): # album metadata - page = self.request(self.root + "/a/" + album_id).text - title, size = text.split_html(text.extr( - page, "").partition(">")[2]) - if "&" in title: - title = title.replace( - "<", "<").replace(">", ">").replace("&", "&") + page = self.request( + self.root + "/a/" + album_id, encoding="utf-8").text + title = text.unescape(text.unescape(text.extr( + page, 'property="og:title" content="', '"'))) # files - items = list(text.extract_iter(page, "", "")) + items = list(text.extract_iter( + page, '
")) + return self._extract_files(items), { "album_id" : album_id, "album_name" : title, - "album_size" : text.extr(size, "(", ")"), + "album_size" : text.extr( + page, '(', ')'), "count" : len(items), } def _extract_files(self, items): for item in items: try: - url = text.extr(item, ' href="', '"') - file = self._extract_file(text.unescape(url)) + url = text.unescape(text.extr(item, ' href="', '"')) + if url[0] == "/": + url = self.root + url + file = self._extract_file(url) info = text.split_html(item) - file["name"] = info[0] - file["size"] = info[2] + file["name"] = info[-3] + file["size"] = info[-2] file["date"] = text.parse_datetime( info[-1], "%H:%M:%S %d/%m/%Y") @@ -179,8 +185,8 @@ class BunkrMediaExtractor(BunkrAlbumExtractor): """Extractor for bunkr.si media links""" subcategory = "media" directory_fmt = ("{category}",) - pattern = BASE_PATTERN + r"(/[vid]/[^/?#]+)" - example = "https://bunkr.si/v/FILENAME" + pattern = BASE_PATTERN + r"(/[fvid]/[^/?#]+)" + example = "https://bunkr.si/f/FILENAME" def fetch_album(self, album_id): try: diff --git a/gallery_dl/extractor/cien.py b/gallery_dl/extractor/cien.py index 378365e..27d50e7 100644 --- a/gallery_dl/extractor/cien.py +++ b/gallery_dl/extractor/cien.py @@ -9,7 +9,7 @@ """Extractors for https://ci-en.net/""" from .common import Extractor, Message -from .. import text, util +from .. import text BASE_PATTERN = r"(?:https?://)?ci-en\.(?:net|dlsite\.com)" @@ -56,11 +56,8 @@ class CienArticleExtractor(CienExtractor): self.root, self.groups[0], self.groups[1]) page = self.request(url, notfound="article").text - post = util.json_loads(text.extr( - page, ''))[0] - files = self._extract_files(page) - + post = self._extract_jsonld(page)[0] post["post_url"] = url post["post_id"] = text.parse_int(self.groups[1]) post["count"] = len(files) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 5ada030..13fd88a 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -587,6 +587,14 @@ class Extractor(): return True return False + def _extract_jsonld(self, page): + return util.json_loads(text.extr( + page, '")) + + def _extract_nextdata(self, page): + return util.json_loads(text.extr( + page, ' id="__NEXT_DATA__" type="application/json">', "")) + def _prepare_ddosguard_cookies(self): if not self.cookies.get("__ddg2", domain=self.cookies_domain): self.cookies.set( @@ -772,7 +780,11 @@ class MangaExtractor(Extractor): def items(self): self.login() - page = self.request(self.manga_url).text + + if self.manga_url: + page = self.request(self.manga_url, notfound=self.subcategory).text + else: + page = None chapters = self.chapters(page) if self.reverse: diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py index 4a6624d..33e6ba8 100644 --- a/gallery_dl/extractor/e621.py +++ b/gallery_dl/extractor/e621.py @@ -90,7 +90,7 @@ BASE_PATTERN = E621Extractor.update({ class E621TagExtractor(E621Extractor, danbooru.DanbooruTagExtractor): """Extractor for e621 posts from tag searches""" - pattern = BASE_PATTERN + r"/posts?(?:\?.*?tags=|/index/\d+/)([^&#]+)" + pattern = BASE_PATTERN + r"/posts?(?:\?[^#]*?tags=|/index/\d+/)([^&#]*)" example = "https://e621.net/posts?tags=TAG" diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index 28590fc..dd5220d 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -9,7 +9,7 @@ """Extractors for https://www.imagefap.com/""" from .common import Extractor, Message -from .. 
import text, util, exception +from .. import text, exception BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?imagefap\.com" @@ -129,13 +129,11 @@ class ImagefapImageExtractor(ImagefapExtractor): url, pos = text.extract( page, 'original="', '"') - info, pos = text.extract( - page, '', pos) image_id, pos = text.extract( page, 'id="imageid_input" value="', '"', pos) gallery_id, pos = text.extract( page, 'id="galleryid_input" value="', '"', pos) - info = util.json_loads(info) + info = self._extract_jsonld(page) return url, text.nameext_from_url(url, { "title": text.unescape(info["name"]), diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py index 63aaf91..6f7a238 100644 --- a/gallery_dl/extractor/mangapark.py +++ b/gallery_dl/extractor/mangapark.py @@ -43,8 +43,7 @@ class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor): ChapterExtractor.__init__(self, match, url) def metadata(self, page): - data = util.json_loads(text.extr( - page, 'id="__NEXT_DATA__" type="application/json">', '<')) + data = self._extract_nextdata(page) chapter = (data["props"]["pageProps"]["dehydratedState"] ["queries"][0]["state"]["data"]["data"]) manga = chapter["comicNode"]["data"] diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index e4a5985..866e93a 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -286,15 +286,12 @@ class PatreonExtractor(Extractor): return [genmap[ft] for ft in filetypes] def _extract_bootstrap(self, page): - data = text.extr( - page, 'id="__NEXT_DATA__" type="application/json">', '= pagination["total_pages"]: + return + params["page"] = pagination["current_page"] + 1 diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 6207bf7..d3e40ee 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -111,6 +111,7 @@ class PixivExtractor(Extractor): { "url" : img["image_urls"]["original"], "suffix": "_p{:02}".format(num), + "_fallback": self._fallback_image(img), } for num, img in enumerate(meta_pages) ] @@ -128,7 +129,7 @@ class PixivExtractor(Extractor): self.log.warning("%s: 'My pixiv' locked", work["id"]) elif work["type"] != "ugoira": - return ({"url": url},) + return ({"url": url, "_fallback": self._fallback_image(url)},) elif self.load_ugoira: try: @@ -269,6 +270,24 @@ class PixivExtractor(Extractor): except exception.HttpError: pass + def _fallback_image(self, src): + if isinstance(src, str): + urls = None + orig = src + else: + urls = src["image_urls"] + orig = urls["original"] + + base = orig.rpartition(".")[0] + yield base.replace("-original/", "-master/", 1) + "_master1200.jpg" + + if urls is None: + return + + for fmt in ("large", "medium", "square_medium"): + if fmt in urls: + yield urls[fmt] + @staticmethod def _date_from_url(url, offset=timedelta(hours=9)): try: diff --git a/gallery_dl/extractor/plurk.py b/gallery_dl/extractor/plurk.py index be0dbde..0bacd54 100644 --- a/gallery_dl/extractor/plurk.py +++ b/gallery_dl/extractor/plurk.py @@ -104,16 +104,16 @@ class PlurkPostExtractor(PlurkExtractor): pattern = r"(?:https?://)?(?:www\.)?plurk\.com/p/(\w+)" example = "https://www.plurk.com/p/12345" - def __init__(self, match): - PlurkExtractor.__init__(self, match) - self.plurk_id = match.group(1) - def plurks(self): - url = "{}/p/{}".format(self.root, self.plurk_id) + url = "{}/p/{}".format(self.root, self.groups[0]) page = self.request(url).text - user, pos = text.extract(page, " GLOBAL = ", "\n") - data, pos = text.extract(page, "plurk 
= ", ";\n", pos) + user, pos = text.extract(page, " GLOBAL=", "\n") + data, pos = text.extract(page, "plurk =", ";\n", pos) data = self._load(data) - data["user"] = self._load(user)["page_user"] + try: + data["user"] = self._load(user)["page_user"] + except Exception: + self.log.warning("%s: Failed to extract 'user' data", + self.groups[0]) return (data,) diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py index e5e7a6b..0722d23 100644 --- a/gallery_dl/extractor/slideshare.py +++ b/gallery_dl/extractor/slideshare.py @@ -10,7 +10,7 @@ """Extractors for https://www.slideshare.net/""" from .common import GalleryExtractor -from .. import text, util +from .. import text class SlidesharePresentationExtractor(GalleryExtractor): @@ -31,8 +31,7 @@ class SlidesharePresentationExtractor(GalleryExtractor): GalleryExtractor.__init__(self, match, url) def metadata(self, page): - data = util.json_loads(text.extr( - page, 'id="__NEXT_DATA__" type="application/json">', '')) + data = self._extract_nextdata(page) self.slideshow = slideshow = data["props"]["pageProps"]["slideshow"] return { diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py index 479e8a8..e5b764a 100644 --- a/gallery_dl/extractor/wallhaven.py +++ b/gallery_dl/extractor/wallhaven.py @@ -54,7 +54,7 @@ class WallhavenExtractor(Extractor): class WallhavenSearchExtractor(WallhavenExtractor): """Extractor for search results on wallhaven.cc""" subcategory = "search" - directory_fmt = ("{category}", "{search[q]}") + directory_fmt = ("{category}", "{search[tags]}") archive_fmt = "s_{search[q]}_{id}" pattern = r"(?:https?://)?wallhaven\.cc/search(?:/?\?([^#]+))?" example = "https://wallhaven.cc/search?q=QUERY" @@ -64,7 +64,7 @@ class WallhavenSearchExtractor(WallhavenExtractor): self.params = text.parse_query(match.group(1)) def wallpapers(self): - return self.api.search(self.params.copy()) + return self.api.search(self.params) def metadata(self): return {"search": self.params} @@ -141,7 +141,7 @@ class WallhavenUploadsExtractor(WallhavenExtractor): def wallpapers(self): params = {"q": "@" + self.username} - return self.api.search(params.copy()) + return self.api.search(params) def metadata(self): return {"username": self.username} @@ -215,20 +215,35 @@ class WallhavenAPI(): def _pagination(self, endpoint, params=None, metadata=None): if params is None: + params_ptr = None params = {} + else: + params_ptr = params + params = params.copy() if metadata is None: metadata = self.extractor.config("metadata") while True: data = self._call(endpoint, params) + meta = data.get("meta") + if params_ptr is not None: + if meta and "query" in meta: + query = meta["query"] + if isinstance(query, dict): + params_ptr["tags"] = query.get("tag") + params_ptr["tag_id"] = query.get("id") + else: + params_ptr["tags"] = query + params_ptr["tag_id"] = 0 + params_ptr = None + if metadata: for wp in data["data"]: yield self.info(str(wp["id"])) else: yield from data["data"] - meta = data.get("meta") if not meta or meta["current_page"] >= meta["last_page"]: return params["page"] = meta["current_page"] + 1 diff --git a/gallery_dl/extractor/weebcentral.py b/gallery_dl/extractor/weebcentral.py new file mode 100644 index 0000000..39f998a --- /dev/null +++ b/gallery_dl/extractor/weebcentral.py @@ -0,0 +1,136 @@ +# -*- coding: utf-8 -*- + +# Copyright 2025 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# 
published by the Free Software Foundation. + +"""Extractors for https://weebcentral.com/""" + +from .common import ChapterExtractor, MangaExtractor +from .. import text +from ..cache import memcache + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?weebcentral\.com" + + +class WeebcentralBase(): + category = "weebcentral" + root = "https://weebcentral.com" + request_interval = (0.5, 1.5) + + @memcache(keyarg=1) + def _extract_manga_data(self, manga_id): + url = "{}/series/{}".format(self.root, manga_id) + page = self.request(url).text + extr = text.extract_from(page) + + return { + "manga_id": manga_id, + "lang" : "en", + "language": "English", + "manga" : text.unescape(extr("", " | Weeb Central")), + "author" : text.split_html(extr("<strong>Author", "</li>"))[1::2], + "tags" : text.split_html(extr("<strong>Tag", "</li>"))[1::2], + "type" : text.remove_html(extr("<strong>Type: ", "</li>")), + "status" : text.remove_html(extr("<strong>Status: ", "</li>")), + "release" : text.remove_html(extr("<strong>Released: ", "</li>")), + "official": ">Yes" in extr("<strong>Official Translatio", "</li>"), + "description": text.unescape(text.remove_html(extr( + "<strong>Description", "</li>"))), + } + + +class WeebcentralChapterExtractor(WeebcentralBase, ChapterExtractor): + """Extractor for manga chapters from weebcentral.com""" + pattern = BASE_PATTERN + r"(/chapters/(\w+))" + example = "https://weebcentral.com/chapters/01JHABCDEFGHIJKLMNOPQRSTUV" + + def metadata(self, page): + extr = text.extract_from(page) + manga_id = extr("'series_id': '", "'") + + data = self._extract_manga_data(manga_id) + data["chapter_id"] = self.groups[1] + data["chapter_type"] = extr("'chapter_type': '", "'") + + chapter, sep, minor = extr("'number': '", "'").partition(".") + data["chapter"] = text.parse_int(chapter) + data["chapter_minor"] = sep + minor + + return data + + def images(self, page): + referer = self.gallery_url + url = referer + "/images" + params = { + "is_prev" : "False", + "current_page" : "1", + "reading_style": "long_strip", + } + headers = { + "Accept" : "*/*", + "Referer" : referer, + "HX-Request" : "true", + "HX-Current-URL": referer, + } + page = self.request(url, params=params, headers=headers).text + extr = text.extract_from(page) + + results = [] + while True: + src = extr(' src="', '"') + if not src: + break + results.append((src, { + "width" : text.parse_int(extr(' width="' , '"')), + "height": text.parse_int(extr(' height="', '"')), + })) + return results + + +class WeebcentralMangaExtractor(WeebcentralBase, MangaExtractor): + """Extractor for manga from weebcentral.com""" + chapterclass = WeebcentralChapterExtractor + pattern = BASE_PATTERN + r"/series/(\w+)" + example = "https://weebcentral.com/series/01J7ABCDEFGHIJKLMNOPQRSTUV/TITLE" + + def __init__(self, match): + MangaExtractor.__init__(self, match, False) + + def chapters(self, _): + manga_id = self.groups[0] + referer = "{}/series/{}".format(self.root, manga_id) + url = referer + "/full-chapter-list" + headers = { + "Accept" : "*/*", + "Referer" : referer, + "HX-Request" : "true", + "HX-Target" : "chapter-list", + "HX-Current-URL": referer, + } + page = self.request(url, headers=headers).text + extr = text.extract_from(page) + data = self._extract_manga_data(manga_id) + base = self.root + "/chapters/" + + results = [] + while True: + chapter_id = extr("/chapters/", '"') + if not chapter_id: + break + type, _, chapter = extr('<span class="">', "<").partition(" ") + chapter, sep, minor = chapter.partition(".") + + chapter = { + "chapter_id" : 
chapter_id, + "chapter" : text.parse_int(chapter), + "chapter_minor": sep + minor, + "chapter_type" : type, + "date" : text.parse_datetime( + extr(' datetime="', '"')[:-5], "%Y-%m-%dT%H:%M:%S"), + } + chapter.update(data) + results.append((base + chapter_id, chapter)) + return results diff --git a/gallery_dl/option.py b/gallery_dl/option.py index a3f78e5..222679a 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -323,7 +323,7 @@ def build_parser(): input.add_argument( "--no-input", dest="input", nargs=0, action=ConfigConstAction, const=False, - help=("Do not prompt for passwords/tokens"), + help="Do not prompt for passwords/tokens", ) output = parser.add_argument_group("Output Options") @@ -406,7 +406,7 @@ def build_parser(): ) output.add_argument( "--list-extractors", - dest="list_extractors", metavar="CATEGORIES", nargs="*", + dest="list_extractors", metavar="[CATEGORIES]", nargs="*", help=("Print a list of extractor classes " "with description, (sub)category and example URL"), ) @@ -430,12 +430,12 @@ def build_parser(): output.add_argument( "--print-traffic", dest="print_traffic", action="store_true", - help=("Display sent and read HTTP traffic"), + help="Display sent and read HTTP traffic", ) output.add_argument( "--no-colors", dest="colors", action="store_false", - help=("Do not emit ANSI color codes in output"), + help="Do not emit ANSI color codes in output", ) networking = parser.add_argument_group("Networking Options") diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 72ec98e..2302088 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -83,7 +83,7 @@ def unique_sequence(iterable): def contains(values, elements, separator=" "): """Returns True if at least one of 'elements' is contained in 'values'""" - if isinstance(values, str): + if isinstance(values, str) and (separator or separator is None): values = values.split(separator) if not isinstance(elements, (tuple, list)): diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 4b28924..6bceebd 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,5 +6,5 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.28.3" +__version__ = "1.28.4" __variant__ = None diff --git a/test/test_util.py b/test/test_util.py index fa16c44..27f78ec 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -459,6 +459,15 @@ class TestOther(unittest.TestCase): self.assertFalse(util.contains(s, "tag1")) self.assertFalse(util.contains(s, ["tag1", "tag2", "tag3"])) + self.assertTrue(util.contains(s, "(+)", "")) + self.assertTrue(util.contains(s, ["(-)", "(+)"], "")) + self.assertTrue(util.contains(s, "(+)", 0)) + self.assertTrue(util.contains(s, "(+)", False)) + + self.assertFalse(util.contains(s, "(+)", None)) + self.assertTrue(util.contains(s, "y(+)c", None)) + self.assertTrue(util.contains(s, ["(-)", "(+)", "bar"], None)) + s = "1, 2, 3, asd, qwe, y(+)c, f(+)(-), bar" self.assertTrue(util.contains(s, "y(+)c", ", ")) self.assertTrue(util.contains(s, ["sdf", "dfg", "qwe"], ", ")) -- cgit v1.2.3