From a6e995c093de8aae2e91a0787281bb34c0b871eb Mon Sep 17 00:00:00 2001
From: Unit 193
Date: Thu, 31 Jul 2025 01:22:01 -0400
Subject: New upstream version 1.30.2.

---
 gallery_dl/extractor/2ch.py             |  25 +-
 gallery_dl/extractor/2chan.py           |  16 +-
 gallery_dl/extractor/2chen.py           |   6 +-
 gallery_dl/extractor/35photo.py         |  23 +-
 gallery_dl/extractor/4archive.py        |  17 +-
 gallery_dl/extractor/4chan.py           |  33 +-
 gallery_dl/extractor/4chanarchives.py   |  13 +-
 gallery_dl/extractor/500px.py           |  12 +-
 gallery_dl/extractor/8chan.py           |  19 +-
 gallery_dl/extractor/8muses.py          |  18 +-
 gallery_dl/extractor/__init__.py        |  28 +-
 gallery_dl/extractor/adultempire.py     |   6 +-
 gallery_dl/extractor/agnph.py           |  19 +-
 gallery_dl/extractor/ao3.py             |  19 +-
 gallery_dl/extractor/arcalive.py        |  40 +-
 gallery_dl/extractor/architizer.py      |  11 +-
 gallery_dl/extractor/artstation.py      | 167 ++++---
 gallery_dl/extractor/aryion.py          |  24 +-
 gallery_dl/extractor/batoto.py          |  18 +-
 gallery_dl/extractor/bbc.py             |  11 +-
 gallery_dl/extractor/behance.py         |  57 +--
 gallery_dl/extractor/bilibili.py        |  76 ++--
 gallery_dl/extractor/blogger.py         |  58 ++-
 gallery_dl/extractor/bluesky.py         |  60 ++-
 gallery_dl/extractor/booru.py           |   3 +-
 gallery_dl/extractor/boosty.py          |  20 +-
 gallery_dl/extractor/bunkr.py           |  17 +-
 gallery_dl/extractor/catbox.py          |   2 +-
 gallery_dl/extractor/chevereto.py       |   4 +-
 gallery_dl/extractor/cien.py            |  15 +-
 gallery_dl/extractor/civitai.py         | 387 +++++++++++++-----
 gallery_dl/extractor/comick.py          | 198 ++++++++++
 gallery_dl/extractor/comicvine.py       |   7 +-
 gallery_dl/extractor/common.py          | 562 +++++++++++++++++---------
 gallery_dl/extractor/cyberdrop.py       |   8 +-
 gallery_dl/extractor/danbooru.py        |  53 +--
 gallery_dl/extractor/dankefuerslesen.py | 120 ++++++
 gallery_dl/extractor/desktopography.py  |   8 +-
 gallery_dl/extractor/deviantart.py      | 162 ++++----
 gallery_dl/extractor/directlink.py      |   4 +-
 gallery_dl/extractor/discord.py         |  89 ++---
 gallery_dl/extractor/dynastyscans.py    |  66 +++-
 gallery_dl/extractor/e621.py            |  26 +-
 gallery_dl/extractor/erome.py           | 106 ++---
 gallery_dl/extractor/everia.py          |  16 +-
 gallery_dl/extractor/exhentai.py        | 208 ++++----
 gallery_dl/extractor/facebook.py        | 179 +++++----
 gallery_dl/extractor/fanbox.py          | 130 +++---
 gallery_dl/extractor/fantia.py          |   8 +-
 gallery_dl/extractor/fapachi.py         |  14 +-
 gallery_dl/extractor/fapello.py         |  19 +-
 gallery_dl/extractor/flickr.py          |  14 +-
 gallery_dl/extractor/foolfuuka.py       |  85 ++--
 gallery_dl/extractor/foolslide.py       |  13 +-
 gallery_dl/extractor/furaffinity.py     |  66 ++--
 gallery_dl/extractor/furry34.py         |  12 +-
 gallery_dl/extractor/fuskator.py        |  17 +-
 gallery_dl/extractor/gelbooru.py        |  18 +-
 gallery_dl/extractor/gelbooru_v01.py    |  32 +-
 gallery_dl/extractor/gelbooru_v02.py    |  73 ++--
 gallery_dl/extractor/generic.py         |  29 +-
 gallery_dl/extractor/girlsreleased.py   |  76 ++++
 gallery_dl/extractor/girlswithmuscle.py | 177 +++++++++
 gallery_dl/extractor/gofile.py          |  11 +-
 gallery_dl/extractor/hatenablog.py      |  21 +-
 gallery_dl/extractor/hentai2read.py     |   8 +-
 gallery_dl/extractor/hentaicosplays.py  |   4 +-
 gallery_dl/extractor/hentaifoundry.py   |  42 +-
 gallery_dl/extractor/hentaihand.py      |  16 +-
 gallery_dl/extractor/hentaihere.py      |  23 +-
 gallery_dl/extractor/hentainexus.py     |  26 +-
 gallery_dl/extractor/hiperdex.py        |  21 +-
 gallery_dl/extractor/hitomi.py          | 157 ++++---
 gallery_dl/extractor/hotleak.py         |  11 +-
 gallery_dl/extractor/idolcomplex.py     |   9 +-
 gallery_dl/extractor/imagebam.py        |  18 +-
 gallery_dl/extractor/imagechest.py      |  13 +-
 gallery_dl/extractor/imagefap.py        |  40 +-
 gallery_dl/extractor/imagehosts.py      |  38 +-
 gallery_dl/extractor/imgbb.py           |  16 +-
 gallery_dl/extractor/imgbox.py          |  29 +-
 gallery_dl/extractor/imgth.py           |  11 +-
 gallery_dl/extractor/imgur.py           |  32 +-
 gallery_dl/extractor/imhentai.py        |   4 +-
 gallery_dl/extractor/inkbunny.py        |  38 +-
 gallery_dl/extractor/instagram.py       | 147 +++----
 gallery_dl/extractor/issuu.py           |  15 +-
 gallery_dl/extractor/itaku.py           | 299 ++++++++++----
 gallery_dl/extractor/itchio.py          |  12 +-
 gallery_dl/extractor/iwara.py           | 440 +++++++++++++++++++++
 gallery_dl/extractor/jschan.py          |  28 +-
 gallery_dl/extractor/kabeuchi.py        |  20 +-
 gallery_dl/extractor/keenspot.py        |  33 +-
 gallery_dl/extractor/kemono.py          | 680 ++++++++++++++++++++++++++++++++
 gallery_dl/extractor/kemonoparty.py     | 625 -----------------------------
 gallery_dl/extractor/khinsider.py       |   4 +-
 gallery_dl/extractor/koharu.py          | 251 ------------
 gallery_dl/extractor/komikcast.py       |  37 +-
 gallery_dl/extractor/leakgallery.py     | 141 +++++++
 gallery_dl/extractor/lensdump.py        |   8 +-
 gallery_dl/extractor/lexica.py          |  10 +-
 gallery_dl/extractor/lightroom.py       |   2 +-
 gallery_dl/extractor/livedoor.py        |  14 +-
 gallery_dl/extractor/lofter.py          |   2 +-
 gallery_dl/extractor/lolisafe.py        |   8 +-
 gallery_dl/extractor/luscious.py        |  14 +-
 gallery_dl/extractor/lynxchan.py        |  25 +-
 gallery_dl/extractor/madokami.py        |  93 +++++
 gallery_dl/extractor/mangadex.py        |  48 ++-
 gallery_dl/extractor/mangafox.py        |   4 +-
 gallery_dl/extractor/mangahere.py       |  18 +-
 gallery_dl/extractor/manganelo.py       |   4 +-
 gallery_dl/extractor/mangapark.py       |  35 +-
 gallery_dl/extractor/mangaread.py       |  20 +-
 gallery_dl/extractor/mangasee.py        | 117 ------
 gallery_dl/extractor/mangoxo.py         |  16 +-
 gallery_dl/extractor/mastodon.py        |  27 +-
 gallery_dl/extractor/misskey.py         | 107 ++++-
 gallery_dl/extractor/moebooru.py        |  53 +--
 gallery_dl/extractor/motherless.py      | 140 +++++--
 gallery_dl/extractor/myhentaigallery.py |   6 +-
 gallery_dl/extractor/myportfolio.py     |   8 +-
 gallery_dl/extractor/naver.py           | 174 --------
 gallery_dl/extractor/naverblog.py       | 173 ++++++++
 gallery_dl/extractor/naverchzzk.py      |  81 ++++
 gallery_dl/extractor/naverwebtoon.py    |  24 +-
 gallery_dl/extractor/nekohouse.py       |   5 +-
 gallery_dl/extractor/newgrounds.py      |  71 ++--
 gallery_dl/extractor/nhentai.py         |   7 +-
 gallery_dl/extractor/nijie.py           |  47 +--
 gallery_dl/extractor/nitter.py          |  35 +-
 gallery_dl/extractor/nozomi.py          |  25 +-
 gallery_dl/extractor/nsfwalbum.py       |  16 +-
 gallery_dl/extractor/nudostar.py        |  71 ++++
 gallery_dl/extractor/oauth.py           |  65 ++-
 gallery_dl/extractor/paheal.py          |  28 +-
 gallery_dl/extractor/patreon.py         | 206 +++++----
 gallery_dl/extractor/pexels.py          |   8 +-
 gallery_dl/extractor/philomena.py       |  36 +-
 gallery_dl/extractor/photovogue.py      |   4 +-
 gallery_dl/extractor/picarto.py         |   6 +-
 gallery_dl/extractor/pictoa.py          |   4 +-
 gallery_dl/extractor/piczel.py          |  10 +-
 gallery_dl/extractor/pillowfort.py      |  19 +-
 gallery_dl/extractor/pinterest.py       |  63 +--
 gallery_dl/extractor/pixeldrain.py      |  33 +-
 gallery_dl/extractor/pixiv.py           | 267 +++++++------
 gallery_dl/extractor/pixnet.py          |  18 +-
 gallery_dl/extractor/plurk.py           |  22 +-
 gallery_dl/extractor/poipiku.py         |   6 +-
 gallery_dl/extractor/poringa.py         |   4 +-
 gallery_dl/extractor/pornhub.py         |  40 +-
 gallery_dl/extractor/pornpics.py        |  10 +-
 gallery_dl/extractor/postmill.py        |  19 +-
 gallery_dl/extractor/rawkuma.py         |  83 ++++
 gallery_dl/extractor/reactor.py         |  12 +-
 gallery_dl/extractor/readcomiconline.py |  45 +--
 gallery_dl/extractor/realbooru.py       |  11 +-
 gallery_dl/extractor/recursive.py       |   9 +-
 gallery_dl/extractor/redbust.py         | 186 +++++++++
 gallery_dl/extractor/reddit.py          |  44 +--
 gallery_dl/extractor/redgifs.py         |  39 +-
 gallery_dl/extractor/rule34us.py        |  16 +-
 gallery_dl/extractor/rule34vault.py     |  12 +-
 gallery_dl/extractor/rule34xyz.py       |  40 +-
 gallery_dl/extractor/saint.py           |   2 +-
 gallery_dl/extractor/sankaku.py         |  45 +--
 gallery_dl/extractor/sankakucomplex.py  |  24 +-
 gallery_dl/extractor/schalenetwork.py   | 246 ++++++++++++
 gallery_dl/extractor/scrolller.py       |   6 +-
 gallery_dl/extractor/seiga.py           |  16 +-
 gallery_dl/extractor/senmanga.py        |   2 +-
 gallery_dl/extractor/sexcom.py          | 138 +++++--
 gallery_dl/extractor/shimmie2.py        |  27 +-
 gallery_dl/extractor/shopify.py         |  16 +-
 gallery_dl/extractor/simplyhentai.py    |  17 +-
 gallery_dl/extractor/skeb.py            |  32 +-
 gallery_dl/extractor/slickpic.py        |  12 +-
 gallery_dl/extractor/slideshare.py      |  12 +-
 gallery_dl/extractor/smugmug.py         |  18 +-
 gallery_dl/extractor/soundgasm.py       |   6 +-
 gallery_dl/extractor/speakerdeck.py     |  22 +-
 gallery_dl/extractor/steamgriddb.py     |  24 +-
 gallery_dl/extractor/subscribestar.py   |  32 +-
 gallery_dl/extractor/szurubooru.py      |  17 +-
 gallery_dl/extractor/tapas.py           |  19 +-
 gallery_dl/extractor/tcbscans.py        |   4 +-
 gallery_dl/extractor/telegraph.py       |   6 +-
 gallery_dl/extractor/tenor.py           |  29 +-
 gallery_dl/extractor/tiktok.py          |  29 +-
 gallery_dl/extractor/tmohentai.py       |   9 +-
 gallery_dl/extractor/toyhouse.py        |   8 +-
 gallery_dl/extractor/tsumino.py         |  49 ++-
 gallery_dl/extractor/tumblr.py          |  68 ++--
 gallery_dl/extractor/tumblrgallery.py   |  23 +-
 gallery_dl/extractor/twibooru.py        |  18 +-
 gallery_dl/extractor/twitter.py         | 245 ++++------
 gallery_dl/extractor/unsplash.py        |  23 +-
 gallery_dl/extractor/uploadir.py        |   6 +-
 gallery_dl/extractor/urlgalleries.py    |   7 +-
 gallery_dl/extractor/urlshortener.py    |   2 +-
 gallery_dl/extractor/vanillarock.py     |   4 +-
 gallery_dl/extractor/vichan.py          |  57 ++-
 gallery_dl/extractor/vipergirls.py      |  20 +-
 gallery_dl/extractor/vk.py              |  82 ++--
 gallery_dl/extractor/vsco.py            |  58 ++-
 gallery_dl/extractor/wallhaven.py       |  39 +-
 gallery_dl/extractor/wallpapercave.py   |   9 +-
 gallery_dl/extractor/warosu.py          |  19 +-
 gallery_dl/extractor/weasyl.py          |  27 +-
 gallery_dl/extractor/webmshare.py       |   6 +-
 gallery_dl/extractor/webtoons.py        | 116 +++---
 gallery_dl/extractor/weebcentral.py     |   9 +-
 gallery_dl/extractor/weibo.py           |  65 ++-
 gallery_dl/extractor/wikiart.py         |  37 +-
 gallery_dl/extractor/wikifeet.py        |  13 +-
 gallery_dl/extractor/wikimedia.py       |  29 +-
 gallery_dl/extractor/xfolio.py          |  20 +-
 gallery_dl/extractor/xhamster.py        |   6 +-
 gallery_dl/extractor/xvideos.py         |  15 +-
 gallery_dl/extractor/yiffverse.py       |  12 +-
 gallery_dl/extractor/ytdl.py            |  19 +-
 gallery_dl/extractor/zerochan.py        |  16 +-
 gallery_dl/extractor/zzup.py            |  15 +-
 224 files changed, 7062 insertions(+), 4902 deletions(-)
 create mode 100644 gallery_dl/extractor/comick.py
 create mode 100644 gallery_dl/extractor/dankefuerslesen.py
 create mode 100644 gallery_dl/extractor/girlsreleased.py
 create mode 100644 gallery_dl/extractor/girlswithmuscle.py
 create mode 100644 gallery_dl/extractor/iwara.py
 create mode 100644 gallery_dl/extractor/kemono.py
 delete mode 100644 gallery_dl/extractor/kemonoparty.py
 delete mode 100644 gallery_dl/extractor/koharu.py
 create mode 100644 gallery_dl/extractor/leakgallery.py
 create mode 100644 gallery_dl/extractor/madokami.py
 delete mode 100644 gallery_dl/extractor/mangasee.py
 delete mode 100644 gallery_dl/extractor/naver.py
 create mode 100644 gallery_dl/extractor/naverblog.py
 create mode 100644 gallery_dl/extractor/naverchzzk.py
 create mode 100644 gallery_dl/extractor/nudostar.py
 create mode 100644 gallery_dl/extractor/rawkuma.py
 create mode 100644 gallery_dl/extractor/redbust.py
 create mode 100644 gallery_dl/extractor/schalenetwork.py
diff --git a/gallery_dl/extractor/2ch.py b/gallery_dl/extractor/2ch.py
index dbbf21b..f5bb7b7 100644
--- a/gallery_dl/extractor/2ch.py
+++ b/gallery_dl/extractor/2ch.py
@@ -26,8 +26,8 @@ class _2chThreadExtractor(Extractor):
         self.board, self.thread = match.groups()
 
     def items(self):
-        url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread)
-        posts = self.request(url).json()["threads"][0]["posts"]
+        url = f"{self.root}/{self.board}/res/{self.thread}.json"
+        posts = self.request_json(url)["threads"][0]["posts"]
 
         op = posts[0]
         title = op.get("subject") or text.remove_html(op["comment"])
@@ -40,8 +40,7 @@ class _2chThreadExtractor(Extractor):
         yield Message.Directory, thread
         for post in posts:
-            files = post.get("files")
-            if files:
+            if files := post.get("files"):
                 post["post_name"] = post["name"]
                 post["date"] = text.parse_timestamp(post["timestamp"])
                 del post["files"]
@@ -68,24 +67,24 @@ class _2chBoardExtractor(Extractor):
 
     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.board = match.group(1)
+        self.board = match[1]
 
     def items(self):
+        base = f"{self.root}/{self.board}"
+
         # index page
-        url = "{}/{}/index.json".format(self.root, self.board)
-        index = self.request(url).json()
+        url = f"{base}/index.json"
+        index = self.request_json(url)
         index["_extractor"] = _2chThreadExtractor
         for thread in index["threads"]:
-            url = "{}/{}/res/{}.html".format(
-                self.root, self.board, thread["thread_num"])
+            url = f"{base}/res/{thread['thread_num']}.html"
             yield Message.Queue, url, index
 
         # pages 1..n
         for n in util.advance(index["pages"], 1):
-            url = "{}/{}/{}.json".format(self.root, self.board, n)
-            page = self.request(url).json()
+            url = f"{base}/{n}.json"
+            page = self.request_json(url)
             page["_extractor"] = _2chThreadExtractor
             for thread in page["threads"]:
-                url = "{}/{}/res/{}.html".format(
-                    self.root, self.board, thread["thread_num"])
+                url = f"{base}/res/{thread['thread_num']}.html"
                 yield Message.Queue, url, page
diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py
index 337ba48..9927b5a 100644
--- a/gallery_dl/extractor/2chan.py
+++ b/gallery_dl/extractor/2chan.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2017-2023 Mike Fährmann
+# Copyright 2017-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -19,7 +19,6 @@ class _2chanThreadExtractor(Extractor):
     directory_fmt = ("{category}", "{board_name}", "{thread}")
     filename_fmt = "{tim}.{extension}"
     archive_fmt = "{board}_{thread}_{tim}"
-    url_fmt = "https://{server}.2chan.net/{board}/src/{filename}"
     pattern = r"(?:https?://)?([\w-]+)\.2chan\.net/([^/?#]+)/res/(\d+)"
     example = "https://dec.2chan.net/12/res/12345.htm"
 
@@ -28,8 +27,8 @@ class _2chanThreadExtractor(Extractor):
         self.server, self.board, self.thread = match.groups()
 
     def items(self):
-        url = "https://{}.2chan.net/{}/res/{}.htm".format(
-            self.server, self.board, self.thread)
+        url = (f"https://{self.server}.2chan.net"
+               f"/{self.board}/res/{self.thread}.htm")
         page = self.request(url).text
         data = self.metadata(page)
         yield Message.Directory, data
@@ -37,7 +36,8 @@ class _2chanThreadExtractor(Extractor):
             if "filename" not in post:
                 continue
             post.update(data)
-            url = self.url_fmt.format_map(post)
+            url = (f"https://{post['server']}.2chan.net"
+                   f"/{post['board']}/src/{post['filename']}")
             yield Message.Url, url, post
 
     def metadata(self, page):
@@ -74,8 +74,7 @@ class _2chanThreadExtractor(Extractor):
         data["ext"] = "." + data["extension"]
         return data
 
-    @staticmethod
-    def _extract_post(post):
+    def _extract_post(self, post):
         return text.extract_all(post, (
             ("post", 'class="csb">' , '<'),
             ("name", 'class="cnm">' , '<'),
@@ -85,8 +84,7 @@ class _2chanThreadExtractor(Extractor):
             ("com" , '>', ''),
         ))[0]
 
-    @staticmethod
-    def _extract_image(post, data):
+    def _extract_image(self, post, data):
         text.extract_all(post, (
             (None , '_blank', ''),
             ("filename", '>', '<'),
diff --git a/gallery_dl/extractor/2chen.py b/gallery_dl/extractor/2chen.py
index 0c97889..ee3510c 100644
--- a/gallery_dl/extractor/2chen.py
+++ b/gallery_dl/extractor/2chen.py
@@ -28,7 +28,7 @@ class _2chenThreadExtractor(Extractor):
         self.board, self.thread = match.groups()
 
     def items(self):
-        url = "{}/{}/{}".format(self.root, self.board, self.thread)
+        url = f"{self.root}/{self.board}/{self.thread}"
         page = self.request(url, encoding="utf-8", notfound="thread").text
         data = self.metadata(page)
         yield Message.Directory, data
@@ -86,10 +86,10 @@ class _2chenBoardExtractor(Extractor):
 
     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.board = match.group(1)
+        self.board = match[1]
 
     def items(self):
-        url = "{}/{}/catalog".format(self.root, self.board)
+        url = f"{self.root}/{self.board}/catalog"
         page = self.request(url, notfound="board").text
         data = {"_extractor": _2chenThreadExtractor}
         for thread in text.extract_iter(
diff --git a/gallery_dl/extractor/35photo.py b/gallery_dl/extractor/35photo.py
index 773116e..ec5f0cb 100644
--- a/gallery_dl/extractor/35photo.py
+++ b/gallery_dl/extractor/35photo.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -49,14 +49,14 @@ class _35photoExtractor(Extractor):
         if extra_ids:
             yield from extra_ids
         while params["lastId"]:
-            data = self.request(url, headers=headers, params=params).json()
+            data = self.request_json(url, headers=headers, params=params)
             yield from self._photo_ids(data["data"])
             params["lastId"] = data["lastId"]
 
     def _photo_data(self, photo_id):
         params = {"method": "photo.getData", "photoId": photo_id}
-        data = self.request(
-            "https://api.35photo.pro/", params=params).json()["data"][photo_id]
+        data = self.request_json(
+            "https://api.35photo.pro/", params=params)["data"][photo_id]
         info = {
             "url" : data["src"],
             "id"  : data["photo_id"],
@@ -83,8 +83,7 @@ class _35photoExtractor(Extractor):
             info["num"] = 1
             yield info
 
-    @staticmethod
-    def _photo_ids(page):
+    def _photo_ids(self, page):
         """Extract unique photo IDs and return them as sorted list"""
         # searching for photo-id="..." doesn't always work (see unit tests)
         if not page:
@@ -105,11 +104,11 @@ class _35photoUserExtractor(_35photoExtractor):
 
     def __init__(self, match):
         _35photoExtractor.__init__(self, match)
-        self.user = match.group(1)
+        self.user = match[1]
         self.user_id = 0
 
     def metadata(self):
-        url = "{}/{}/".format(self.root, self.user)
+        url = f"{self.root}/{self.user}/"
         page = self.request(url).text
         self.user_id = text.parse_int(text.extr(page, "/user_", ".xml"))
         return {
@@ -134,7 +133,7 @@ class _35photoTagExtractor(_35photoExtractor):
 
     def __init__(self, match):
         _35photoExtractor.__init__(self, match)
-        self.tag = match.group(1)
+        self.tag = match[1]
 
     def metadata(self):
         return {"search_tag": text.unquote(self.tag).lower()}
@@ -143,7 +142,7 @@ class _35photoTagExtractor(_35photoExtractor):
         num = 1
 
         while True:
-            url = "{}/tags/{}/list_{}/".format(self.root, self.tag, num)
+            url = f"{self.root}/tags/{self.tag}/list_{num}/"
             page = self.request(url).text
             prev = None
@@ -171,7 +170,7 @@ class _35photoGenreExtractor(_35photoExtractor):
         self.photo_ids = None
 
     def metadata(self):
-        url = "{}/genre_{}{}".format(self.root, self.genre_id, self.new or "/")
+        url = f"{self.root}/genre_{self.genre_id}{self.new or '/'}"
         page = self.request(url).text
         self.photo_ids = self._photo_ids(text.extr(
             page, ' class="photo', '\n'))
@@ -199,7 +198,7 @@ class _35photoImageExtractor(_35photoExtractor):
 
     def __init__(self, match):
         _35photoExtractor.__init__(self, match)
-        self.photo_id = match.group(1)
+        self.photo_id = match[1]
 
     def photos(self):
         return (self.photo_id,)
diff --git a/gallery_dl/extractor/4archive.py b/gallery_dl/extractor/4archive.py
index d198369..c9be2a4 100644
--- a/gallery_dl/extractor/4archive.py
+++ b/gallery_dl/extractor/4archive.py
@@ -27,8 +27,7 @@ class _4archiveThreadExtractor(Extractor):
         self.board, self.thread = match.groups()
 
     def items(self):
-        url = "{}/board/{}/thread/{}".format(
-            self.root, self.board, self.thread)
+        url = f"{self.root}/board/{self.board}/thread/{self.thread}"
        page = self.request(url).text
         data = self.metadata(page)
         posts = self.posts(page)
@@ -58,15 +57,14 @@ class _4archiveThreadExtractor(Extractor):
             for post in page.split('class="postContainer')[1:]
         ]
 
-    @staticmethod
-    def parse(post):
+    def parse(self, post):
         extr = text.extract_from(post)
         data = {
             "name": extr('class="name">', "<"),
             "date": text.parse_datetime(
                 extr('class="dateTime postNum" >', "<").strip(),
                 "%Y-%m-%d %H:%M:%S"),
-            "no"  : text.parse_int(extr('href="#p', '"')),
+            "no"  : text.parse_int(extr(">Post No.", "<")),
         }
         if 'class="file"' in post:
                 extr('class="fileText"', ">File: ")[2]),
         }
 
-    @staticmethod
-    def _extract_file(html, post):
+    def _extract_file(self, html, post):
         extr = text.extract_from(html, html.index(">File: <"))
         post["url"] = extr('href="', '"')
         post["filename"] = text.unquote(extr(">", "<").rpartition(".")[0])
diff --git a/gallery_dl/extractor/4chanarchives.py b/gallery_dl/extractor/4chanarchives.py
@@ -106,7 +103,7 @@ class _4chanarchivesBoardExtractor(Extractor):
                 data["pageCount"]:
             return
-        url = "{}/{}/{}.json".format(self.root, board, pnum)
-        threads = self.request(url).json()["threads"]
+        url = f"{self.root}/{board}/{pnum}.json"
+        threads = self.request_json(url)["threads"]
diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py
index 68b906e..120cd8a 100644
--- a/gallery_dl/extractor/8muses.py
+++ b/gallery_dl/extractor/8muses.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -26,8 +26,8 @@ class _8musesAlbumExtractor(Extractor):
 
     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.path = match.group(1)
-        self.params = match.group(2) or ""
+        self.path = match[1]
+        self.params = match[2] or ""
 
     def items(self):
         url = self.root + self.path + self.params
@@ -37,8 +37,7 @@ class _8musesAlbumExtractor(Extractor):
             self.request(url).text,
             'id="ractive-public" type="text/plain">', ''))
 
-        images = data.get("pictures")
-        if images:
+        if images := data.get("pictures"):
             count = len(images)
             album = self._make_album(data["album"])
             yield Message.Directory, {"album": album, "count": count}
@@ -54,8 +53,7 @@ class _8musesAlbumExtractor(Extractor):
                 }
                 yield Message.Url, url, img
 
-        albums = data.get("albums")
-        if albums:
+        if albums := data.get("albums"):
             for album in albums:
                 permalink = album.get("permalink")
                 if not permalink:
@@ -74,8 +72,7 @@ class _8musesAlbumExtractor(Extractor):
                 return
             path, _, num = self.path.rstrip("/").rpartition("/")
             path = path if num.isdecimal() else self.path
-            url = "{}{}/{}{}".format(
-                self.root, path, data["page"] + 1, self.params)
+            url = f"{self.root}{path}/{data['page'] + 1}{self.params}"
 
     def _make_album(self, album):
         return {
@@ -92,8 +89,7 @@ class _8musesAlbumExtractor(Extractor):
                 album["updatedAt"], "%Y-%m-%dT%H:%M:%S.%fZ"),
         }
 
-    @staticmethod
-    def _unobfuscate(data):
+    def _unobfuscate(self, data):
         return util.json_loads("".join([
             chr(33 + (ord(c) + 14) % 94) if "!" <= c <= "~" else c
             for c in text.unescape(data.strip("\t\n\r !"))
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 2da471e..688f0a0 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -1,13 +1,13 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
 import sys
-from ..util import re_compile
+from ..text import re_compile
 
 modules = [
     "2ch",
@@ -40,9 +40,11 @@ modules = [
     "chevereto",
     "cien",
     "civitai",
+    "comick",
     "comicvine",
     "cyberdrop",
     "danbooru",
+    "dankefuerslesen",
     "desktopography",
     "deviantart",
     "discord",
@@ -63,6 +65,8 @@ modules = [
     "gelbooru",
     "gelbooru_v01",
     "gelbooru_v02",
+    "girlsreleased",
+    "girlswithmuscle",
     "gofile",
     "hatenablog",
     "hentai2read",
@@ -88,13 +92,14 @@ modules = [
     "issuu",
     "itaku",
     "itchio",
+    "iwara",
     "jschan",
     "kabeuchi",
     "keenspot",
-    "kemonoparty",
+    "kemono",
     "khinsider",
-    "koharu",
     "komikcast",
+    "leakgallery",
     "lensdump",
     "lexica",
     "lightroom",
@@ -102,19 +107,20 @@ modules = [
     "lofter",
     "luscious",
     "lynxchan",
+    "madokami",
     "mangadex",
     "mangafox",
     "mangahere",
     "manganelo",
     "mangapark",
     "mangaread",
-    "mangasee",
     "mangoxo",
     "misskey",
     "motherless",
     "myhentaigallery",
     "myportfolio",
-    "naver",
+    "naverblog",
+    "naverchzzk",
     "naverwebtoon",
     "nekohouse",
     "newgrounds",
@@ -123,6 +129,7 @@ modules = [
     "nitter",
     "nozomi",
     "nsfwalbum",
+    "nudostar",
     "paheal",
     "patreon",
     "pexels",
@@ -142,9 +149,11 @@ modules = [
     "pornhub",
     "pornpics",
     "postmill",
+    "rawkuma",
     "reactor",
     "readcomiconline",
     "realbooru",
+    "redbust",
     "reddit",
     "redgifs",
     "rule34us",
@@ -153,6 +162,7 @@ modules = [
     "saint",
     "sankaku",
     "sankakucomplex",
+    "schalenetwork",
     "scrolller",
     "seiga",
     "senmanga",
@@ -226,8 +236,7 @@ modules = [
 def find(url):
     """Find a suitable extractor for the given URL"""
     for cls in _list_classes():
-        match = cls.pattern.match(url)
-        if match:
+        if match := cls.pattern.match(url):
             return cls(match)
     return None
@@ -242,8 +251,7 @@ def add(cls):
 
 def add_module(module):
     """Add all extractors in 'module' to the list of available extractors"""
-    classes = _get_classes(module)
-    if classes:
+    if classes := _get_classes(module):
         if isinstance(classes[0].pattern, str):
             for cls in classes:
                 cls.pattern = re_compile(cls.pattern)
diff --git a/gallery_dl/extractor/adultempire.py b/gallery_dl/extractor/adultempire.py
index c891b17..3249ae6 100644
--- a/gallery_dl/extractor/adultempire.py
+++ b/gallery_dl/extractor/adultempire.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -22,7 +22,7 @@ class AdultempireGalleryExtractor(GalleryExtractor):
 
     def __init__(self, match):
         GalleryExtractor.__init__(self, match)
-        self.gallery_id = match.group(2)
+        self.gallery_id = match[2]
 
     def _init(self):
         self.cookies.set("ageConfirmed", "true", domain="www.adultempire.com")
@@ -48,4 +48,4 @@ class AdultempireGalleryExtractor(GalleryExtractor):
             if len(urls) < 24:
                 return
             params["page"] += 1
-            page = self.request(self.gallery_url, params=params).text
+            page = self.request(self.page_url, params=params).text
diff --git a/gallery_dl/extractor/agnph.py b/gallery_dl/extractor/agnph.py
index 653b73f..5bb1835 100644
--- a/gallery_dl/extractor/agnph.py
+++ b/gallery_dl/extractor/agnph.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2024 Mike Fährmann
+# Copyright 2024-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -9,11 +9,8 @@
 """Extractors for https://agn.ph/"""
 
 from . import booru
-from .. import text
-
-from xml.etree import ElementTree
+from .. import text, util
 import collections
-import re
 
 BASE_PATTERN = r"(?:https?://)?agn\.ph"
@@ -52,8 +49,7 @@ class AgnphExtractor(booru.BooruExtractor):
         params["page"] = self.page_start
 
         while True:
-            data = self.request(url, params=params).text
-            root = ElementTree.fromstring(data)
+            root = self.request_xml(url, params=params)
 
             yield from map(self._xml_to_dict, root)
@@ -64,7 +60,7 @@ class AgnphExtractor(booru.BooruExtractor):
             params["page"] += 1
 
     def _html(self, post):
-        url = "{}/gallery/post/show/{}/".format(self.root, post["id"])
+        url = f"{self.root}/gallery/post/show/{post['id']}/"
         return self.request(url).text
 
     def _tags(self, post, page):
@@ -74,7 +70,7 @@ class AgnphExtractor(booru.BooruExtractor):
             return
 
         tags = collections.defaultdict(list)
-        pattern = re.compile(r'class="(.)typetag">([^<]+)')
+        pattern = util.re(r'class="(.)typetag">([^<]+)')
         for tag_type, tag_name in pattern.findall(tag_container):
             tags[tag_type].append(text.unquote(tag_name).replace(" ", "_"))
         for key, value in tags.items():
@@ -107,7 +103,6 @@ class AgnphPostExtractor(AgnphExtractor):
     example = "https://agn.ph/gallery/post/show/12345/"
 
     def posts(self):
-        url = "{}/gallery/post/show/{}/?api=xml".format(
-            self.root, self.groups[0])
-        post = ElementTree.fromstring(self.request(url).text)
+        url = f"{self.root}/gallery/post/show/{self.groups[0]}/?api=xml"
+        post = self.request_xml(url)
         return (self._xml_to_dict(post),)
diff --git a/gallery_dl/extractor/ao3.py b/gallery_dl/extractor/ao3.py
index d3ab846..2652acb 100644
--- a/gallery_dl/extractor/ao3.py
+++ b/gallery_dl/extractor/ao3.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2024 Mike Fährmann
+# Copyright 2024-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@
 
 """Extractors for https://archiveofourown.org/"""
 
-from .common import Extractor, Message
+from .common import Extractor, Message, Dispatch
 from .. import text, util, exception
 from ..cache import cache
 
@@ -135,7 +135,7 @@ class Ao3WorkExtractor(Ao3Extractor):
         self.login()
 
         work_id = self.groups[0]
-        url = "{}/works/{}".format(self.root, work_id)
+        url = f"{self.root}/works/{work_id}"
         response = self.request(url, notfound="work")
 
         if response.url.endswith("/users/login?restricted=true"):
@@ -144,7 +144,7 @@ class Ao3WorkExtractor(Ao3Extractor):
         page = response.text
         if len(page) < 20000 and \
                 'Adult Content Warning' in page:
diff --git a/gallery_dl/extractor/arcalive.py b/gallery_dl/extractor/arcalive.py
+        for video, media in util.re(
+                r"<(?:img|vide(o)) ([^>]+)").findall(
+                post["content"]):
             if not self.emoticons and 'class="arca-emoticon"' in media:
                 continue
 
@@ -75,36 +74,37 @@ class ArcalivePostExtractor(ArcaliveExtractor):
             if not src:
                 continue
 
-            src = text.unescape(src.partition("?")[0])
+            src, _, query = text.unescape(src).partition("?")
             if src[0] == "/":
                 if src[1] == "/":
-                    url = "https:" + src
+                    url = "https:" + src.replace(
+                        "//ac-p.namu", "//ac-o.namu", 1)
                 else:
                     url = self.root + src
             else:
                 url = src
 
             fallback = ()
-            orig = text.extr(media, 'data-orig="', '"')
-            if orig:
+            query = f"?type=orig&{query}"
+            if orig := text.extr(media, 'data-orig="', '"'):
                 path, _, ext = url.rpartition(".")
                 if ext != orig:
-                    fallback = (url + "?type=orig",)
+                    fallback = (url + query,)
                     url = path + "." + orig
             elif video and self.gifs:
                 url_gif = url.rpartition(".")[0] + ".gif"
                 if self.gifs_fallback:
-                    fallback = (url + "?type=orig",)
+                    fallback = (url + query,)
                     url = url_gif
                 else:
                     response = self.request(
-                        url_gif + "?type=orig", method="HEAD", fatal=False)
+                        url_gif + query, method="HEAD", fatal=False)
                     if response.status_code < 400:
-                        fallback = (url + "?type=orig",)
+                        fallback = (url + query,)
                         url = url_gif
 
             files.append({
-                "url"   : url + "?type=orig",
+                "url"   : url + query,
                 "width" : text.parse_int(text.extr(media, 'width="', '"')),
                 "height": text.parse_int(text.extr(media, 'height="', '"')),
                 "_fallback": fallback,
@@ -112,11 +112,6 @@ class ArcalivePostExtractor(ArcaliveExtractor):
 
         return files
 
-    def _extract_media(self, content):
-        ArcalivePostExtractor._extract_media = extr = re.compile(
-            r"<(?:img|vide(o)) ([^>]+)").findall
-        return extr(content)
-
 
 class ArcaliveBoardExtractor(ArcaliveExtractor):
     """Extractor for an arca.live board's posts"""
@@ -175,9 +170,8 @@ class ArcaliveAPI():
             return data
 
         self.log.debug("Server response: %s", data)
-        msg = data.get("message")
-        raise exception.StopExtraction(
-            "API request failed%s", ": " + msg if msg else "")
+        msg = f": {msg}" if (msg := data.get("message")) else ""
+        raise exception.AbortExtraction(f"API request failed{msg}")
 
     def _pagination(self, endpoint, params, key):
         while True:
diff --git a/gallery_dl/extractor/architizer.py b/gallery_dl/extractor/architizer.py
index 911753b..e39d3d2 100644
--- a/gallery_dl/extractor/architizer.py
+++ b/gallery_dl/extractor/architizer.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -24,7 +24,7 @@ class ArchitizerProjectExtractor(GalleryExtractor):
     example = "https://architizer.com/projects/NAME/"
 
     def __init__(self, match):
-        url = "{}/projects/{}/".format(self.root, match.group(1))
+        url = f"{self.root}/projects/{match[1]}/"
         GalleryExtractor.__init__(self, match, url)
 
     def metadata(self, page):
@@ -68,15 +68,14 @@ class ArchitizerFirmExtractor(Extractor):
 
     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.firm = match.group(1)
+        self.firm = match[1]
 
     def items(self):
-        url = url = "{}/firms/{}/?requesting_merlin=pages".format(
-            self.root, self.firm)
+        url = url = f"{self.root}/firms/{self.firm}/?requesting_merlin=pages"
         page = self.request(url).text
         data = {"_extractor": ArchitizerProjectExtractor}
         for project in text.extract_iter(page, '
diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py
         if response.status_code >= 400:
@@ -141,9 +140,9 @@ class AryionExtractor(Extractor):
         # fix 'Last-Modified' header
         lmod = headers["last-modified"]
         if lmod[22] != ":":
-            lmod = "{}:{} GMT".format(lmod[:22], lmod[22:24])
+            lmod = f"{lmod[:22]}:{lmod[22:24]} GMT"
 
-        post_url = "{}/g4/view/{}".format(self.root, post_id)
+        post_url = f"{self.root}/g4/view/{post_id}"
         extr = text.extract_from(self.request(post_url).text)
 
         title, _, artist = text.unescape(extr(
@@ -195,10 +194,10 @@ class AryionGalleryExtractor(AryionExtractor):
 
     def posts(self):
         if self.recursive:
-            url = "{}/g4/gallery/{}".format(self.root, self.user)
+            url = f"{self.root}/g4/gallery/{self.user}"
             return self._pagination_params(url)
         else:
-            url = "{}/g4/latest.php?name={}".format(self.root, self.user)
+            url = f"{self.root}/g4/latest.php?name={self.user}"
             return util.advance(self._pagination_next(url), self.offset)
@@ -212,9 +211,8 @@ class AryionFavoriteExtractor(AryionExtractor):
     example = "https://aryion.com/g4/favorites/USER"
 
     def posts(self):
-        url = "{}/g4/favorites/{}".format(self.root, self.user)
-        return self._pagination_params(
-            url, None, "class='gallery-item favorite' id='")
+        url = f"{self.root}/g4/favorites/{self.user}"
+        return self._pagination_params(url, None, "data-item-id='")
 
 
 class AryionTagExtractor(AryionExtractor):
diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py
index a1ad3ae..50e0c5d 100644
--- a/gallery_dl/extractor/batoto.py
+++ b/gallery_dl/extractor/batoto.py
@@ -7,8 +7,7 @@
 """Extractors for https://bato.to/"""
 
 from .common import Extractor, ChapterExtractor, MangaExtractor
-from .. import text, exception
-import re
+from .. import text, util
 
 BASE_PATTERN = (r"(?:https?://)?("
                 r"(?:ba|d|f|h|j|m|w)to\.to|"
@@ -87,7 +86,7 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
         ChapterExtractor.__init__(self, match, False)
         self._init_root()
         self.chapter_id = self.groups[1]
-        self.gallery_url = "{}/title/0/{}".format(self.root, self.chapter_id)
+        self.page_url = f"{self.root}/title/0/{self.chapter_id}"
 
     def metadata(self, page):
         extr = text.extract_from(page)
@@ -104,9 +103,9 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
         info = text.remove_html(extr('link-hover">', "<"))
-        if warning:
-            raise exception.StopExtraction("'%s'", text.remove_html(warning))
+        if warning := extr(' class="alert alert-warning">', ""):
+            self.log.warning("'%s'", text.remove_html(warning))
 
         data = {
             "manga_id": text.parse_int(self.manga_id),
@@ -178,6 +176,6 @@ class BatotoMangaExtractor(BatotoBase, MangaExtractor):
             data["date"] = text.parse_datetime(
                 extr('time="', '"'), "%Y-%m-%dT%H:%M:%S.%fZ")
 
-            url = "{}/title/{}".format(self.root, href)
+            url = f"{self.root}/title/{href}"
             results.append((url, data.copy()))
         return results
diff --git a/gallery_dl/extractor/bbc.py b/gallery_dl/extractor/bbc.py
index b398152..8efb3db 100644
--- a/gallery_dl/extractor/bbc.py
+++ b/gallery_dl/extractor/bbc.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2021-2023 Mike Fährmann
+# Copyright 2021-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -33,7 +33,7 @@ class BbcGalleryExtractor(GalleryExtractor):
             page, "<h1>", "</h1>").rpartition("</span>")[2]),
         "description": text.unescape(text.extr(
             page, 'property="og:description" content="', '"')),
-        "programme": self.gallery_url.split("/")[4],
+        "programme": self.page_url.split("/")[4],
         "path": list(util.unique_sequence(
             element["name"]
             for element in data["itemListElement"]
@@ -43,7 +43,7 @@ class BbcGalleryExtractor(GalleryExtractor):
     def images(self, page):
         width = self.config("width")
         width = width - width % 16 if width else 1920
-        dimensions = "/{}xn/".format(width)
+        dimensions = f"/{width}xn/"
 
         results = []
         for img in text.extract_iter(page, 'class="gallery__thumbnail', ">"):
@@ -60,12 +60,11 @@ class BbcGalleryExtractor(GalleryExtractor):
             ))
         return results
 
-    @staticmethod
-    def _fallback_urls(src, max_width):
+    def _fallback_urls(self, src, max_width):
         front, _, back = src.partition("/320x180_b/")
         for width in (1920, 1600, 1280, 976):
             if width < max_width:
-                yield "{}/{}xn/{}".format(front, width, back)
+                yield f"{front}/{width}xn/{back}"
 
 
 class BbcProgrammeExtractor(Extractor):
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
index 14598b7..4a7c074 100644
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2018-2023 Mike Fährmann
+# Copyright 2018-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -17,6 +17,8 @@ class BehanceExtractor(Extractor):
     category = "behance"
     root = "https://www.behance.net"
     request_interval = (2.0, 4.0)
+    browser = "firefox"
+    tls12 = False
 
     def _init(self):
         self._bcp = self.cookies.get("bcp", domain="www.behance.net")
@@ -44,15 +46,15 @@ class BehanceExtractor(Extractor):
             "variables": variables,
         }
 
-        return self.request(url, method="POST", headers=headers,
-                            json=data).json()["data"]
+        return self.request_json(
+            url, method="POST", headers=headers, json=data)["data"]
 
     def _update(self, data):
         # compress data to simple lists
-        if data.get("fields") and isinstance(data["fields"][0], dict):
+        if (fields := data.get("fields")) and isinstance(fields[0], dict):
             data["fields"] = [
                 field.get("name") or field.get("label")
-                for field in data["fields"]
+                for field in fields
             ]
 
         data["owners"] = [
@@ -68,6 +70,9 @@ class BehanceExtractor(Extractor):
         data["date"] = text.parse_timestamp(
             data.get("publishedOn") or data.get("conceived_on") or 0)
 
+        if creator := data.get("creator"):
+            creator["name"] = creator["url"].rpartition("/")[2]
+
         # backwards compatibility
         data["gallery_id"] = data["id"]
         data["title"] = data["name"]
@@ -87,13 +92,12 @@ class BehanceGalleryExtractor(BehanceExtractor):
 
     def __init__(self, match):
         BehanceExtractor.__init__(self, match)
-        self.gallery_id = match.group(1)
+        self.gallery_id = match[1]
 
     def _init(self):
         BehanceExtractor._init(self)
 
-        modules = self.config("modules")
-        if modules:
+        if modules := self.config("modules"):
             if isinstance(modules, str):
                 modules = modules.split(",")
             self.modules = set(modules)
@@ -114,12 +118,15 @@ class BehanceGalleryExtractor(BehanceExtractor):
 
     def get_gallery_data(self):
         """Collect gallery info dict"""
-        url = "{}/gallery/{}/a".format(self.root, self.gallery_id)
+        url = f"{self.root}/gallery/{self.gallery_id}/a"
         cookies = {
-            "gki": '{"feature_project_view":false,'
-                   '"feature_discover_login_prompt":false,'
-                   '"feature_project_login_prompt":false}',
+            "gk_suid": "14118261",
+            "gki": "feature_3_in_1_checkout_test:false,hire_browse_get_quote_c"
+                   "ta_ab_test:false,feature_hire_dashboard_services_ab_test:f"
+                   "alse,feature_show_details_jobs_row_ab_test:false,feature_a"
+                   "i_freelance_project_create_flow:false,",
             "ilo0": "true",
+            "originalReferrer": "",
         }
         page = self.request(url, cookies=cookies).text
@@ -141,9 +148,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
                 raise exception.AuthorizationError()
             return ()
 
-        result = []
-        append = result.append
-
+        results = []
         for module in data["modules"]:
             mtype = module["__typename"][:-6].lower()
@@ -161,7 +166,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
                     sizes.get("fs") or
                     sizes.get("hd") or
                     sizes.get("disp"))
-                append((size["url"], module))
+                results.append((size["url"], module))
 
             elif mtype == "video":
                 try:
@@ -173,7 +178,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
                         url = "ytdl:" + url
                         module["_ytdl_manifest"] = "hls"
                         module["extension"] = "mp4"
-                        append((url, module))
+                        results.append((url, module))
                         continue
                 except Exception as exc:
                     self.log.debug("%s: %s", exc.__class__.__name__, exc)
@@ -194,7 +199,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
                     self.log.debug("%s: %s", exc.__class__.__name__, exc)
 
                 url = "ytdl:" + renditions[-1]["url"]
-                append((url, module))
+                results.append((url, module))
 
             elif mtype == "mediacollection":
                 for component in module["components"]:
@@ -202,21 +207,21 @@ class BehanceGalleryExtractor(BehanceExtractor):
                     if size:
                         parts = size["url"].split("/")
                         parts[4] = "source"
-                        append(("/".join(parts), module))
+                        results.append(("/".join(parts), module))
                         break
 
             elif mtype == "embed":
-                embed = module.get("originalEmbed") or module.get("fluidEmbed")
-                if embed:
+                if embed := (module.get("originalEmbed") or
+                             module.get("fluidEmbed")):
                     embed = text.unescape(text.extr(embed, 'src="', '"'))
                     module["extension"] = "mp4"
-                    append(("ytdl:" + embed, module))
+                    results.append(("ytdl:" + embed, module))
 
             elif mtype == "text":
                 module["extension"] = "txt"
-                append(("text:" + module["text"], module))
+                results.append(("text:" + module["text"], module))
 
-        return result
+        return results
 
 
 class BehanceUserExtractor(BehanceExtractor):
@@ -228,7 +233,7 @@ class BehanceUserExtractor(BehanceExtractor):
 
     def __init__(self, match):
         BehanceExtractor.__init__(self, match)
-        self.user = match.group(1)
+        self.user = match[1]
 
     def galleries(self):
         endpoint = "GetProfileProjects"
@@ -256,7 +261,7 @@ class BehanceCollectionExtractor(BehanceExtractor):
 
     def __init__(self, match):
         BehanceExtractor.__init__(self, match)
-        self.collection_id = match.group(1)
+        self.collection_id = match[1]
 
     def galleries(self):
         endpoint = "GetMoodboardItemsAndRecommendations"
diff --git a/gallery_dl/extractor/bilibili.py b/gallery_dl/extractor/bilibili.py
index 597ec40..3f0acff 100644
--- a/gallery_dl/extractor/bilibili.py
+++ b/gallery_dl/extractor/bilibili.py
@@ -19,20 +19,15 @@ class BilibiliExtractor(Extractor):
     def _init(self):
         self.api = BilibiliAPI(self)
 
-
-class BilibiliUserArticlesExtractor(BilibiliExtractor):
-    """Extractor for a bilibili user's articles"""
-    subcategory = "user-articles"
-    pattern = (r"(?:https?://)?space\.bilibili\.com/(\d+)"
-               r"/(?:article|upload/opus)")
-    example = "https://space.bilibili.com/12345/article"
-
     def items(self):
-        for article in self.api.user_articles(self.groups[0]):
+        for article in self.articles():
             article["_extractor"] = BilibiliArticleExtractor
-            url = "{}/opus/{}".format(self.root, article["opus_id"])
+            url = f"{self.root}/opus/{article['opus_id']}"
             yield Message.Queue, url, article
 
+    def articles(self):
+        return ()
+
 
 class BilibiliArticleExtractor(BilibiliExtractor):
     """Extractor for a bilibili article"""
@@ -45,12 +40,16 @@ class BilibiliArticleExtractor(BilibiliExtractor):
     archive_fmt = "{id}_{num}"
 
     def items(self):
-        article = self.api.article(self.groups[0])
+        article_id = self.groups[0]
+        article = self.api.article(article_id)
 
         # Flatten modules list
         modules = {}
         for module in article["detail"]["modules"]:
-            del module['module_type']
+            if module["module_type"] == "MODULE_TYPE_BLOCKED":
+                self.log.warning("%s: Blocked Article\n%s", article_id,
+                                 module["module_blocked"].get("hint_message"))
+            del module["module_type"]
             modules.update(module)
         article["detail"]["modules"] = modules
@@ -64,14 +63,15 @@ class BilibiliArticleExtractor(BilibiliExtractor):
         except Exception:
             pass
 
-        for paragraph in modules['module_content']['paragraphs']:
-            if "pic" not in paragraph:
-                continue
+        if "module_content" in modules:
+            for paragraph in modules["module_content"]["paragraphs"]:
+                if "pic" not in paragraph:
+                    continue
 
-            try:
-                pics.extend(paragraph["pic"]["pics"])
-            except Exception:
-                pass
+                try:
+                    pics.extend(paragraph["pic"]["pics"])
+                except Exception:
+                    pass
 
         article["count"] = len(pics)
         yield Message.Directory, article
@@ -81,6 +81,17 @@ class BilibiliArticleExtractor(BilibiliExtractor):
             yield Message.Url, url, text.nameext_from_url(url, article)
 
 
+class BilibiliUserArticlesExtractor(BilibiliExtractor):
+    """Extractor for a bilibili user's articles"""
+    subcategory = "user-articles"
+    pattern = (r"(?:https?://)?space\.bilibili\.com/(\d+)"
+               r"/(?:article|upload/opus)")
+    example = "https://space.bilibili.com/12345/article"
+
+    def articles(self):
+        return self.api.user_articles(self.groups[0])
+
+
 class BilibiliUserArticlesFavoriteExtractor(BilibiliExtractor):
     subcategory = "user-articles-favorite"
     pattern = (r"(?:https?://)?space\.bilibili\.com"
               r"/(\d+)/favlist\?fid=opus")
     example = "https://space.bilibili.com/12345/favlist?fid=opus"
     _warning = True
 
-    def _init(self):
-        BilibiliExtractor._init(self)
+    def articles(self):
         if self._warning:
             if not self.cookies_check(("SESSDATA",)):
                 self.log.error("'SESSDATA' cookie required")
             BilibiliUserArticlesFavoriteExtractor._warning = False
-
-    def items(self):
-        for article in self.api.user_favlist():
-            article["_extractor"] = BilibiliArticleExtractor
-            url = "{}/opus/{}".format(self.root, article["opus_id"])
-            yield Message.Queue, url, article
+        return self.api.user_favlist()
 
 
 class BilibiliAPI():
@@ -108,11 +113,11 @@ class BilibiliAPI():
 
     def _call(self, endpoint, params):
         url = "https://api.bilibili.com/x/polymer/web-dynamic/v1" + endpoint
-        data = self.extractor.request(url, params=params).json()
+        data = self.extractor.request_json(url, params=params)
 
-        if data["code"] != 0:
+        if data["code"]:
             self.extractor.log.debug("Server response: %s", data)
-            raise exception.StopExtraction("API request failed")
+            raise exception.AbortExtraction("API request failed")
 
         return data
@@ -140,8 +145,8 @@ class BilibiliAPI():
                 page, "window.__INITIAL_STATE__=", "};") + "}")
         except Exception:
             if "window._riskdata_" not in page:
-                raise exception.StopExtraction(
-                    "%s: Unable to extract INITIAL_STATE data", article_id)
+                raise exception.AbortExtraction(
+                    f"{article_id}: Unable to extract INITIAL_STATE data")
             self.extractor.wait(seconds=300)
 
     def user_favlist(self):
@@ -159,12 +164,13 @@ class BilibiliAPI():
 
     def login_user_id(self):
         url = "https://api.bilibili.com/x/space/v2/myinfo"
-        data = self.extractor.request(url).json()
+        data = self.extractor.request_json(url)
 
         if data["code"] != 0:
             self.extractor.log.debug("Server response: %s", data)
-            raise exception.StopExtraction("API request failed,Are you login?")
+            raise exception.AbortExtraction(
+                "API request failed. Are you logged in?")
         try:
             return data["data"]["profile"]["mid"]
         except Exception:
-            raise exception.StopExtraction("API request failed")
+            raise exception.AbortExtraction("API request failed")
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index ef117da..796d9d1 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2019-2023 Mike Fährmann
+# Copyright 2019-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -10,7 +10,12 @@
 
 from .common import BaseExtractor, Message
 from .. import text, util
-import re
+
+
+def original(url):
+    return (util.re(r"(/|=)(?:[sw]\d+|w\d+-h\d+)(?=/|$)")
+            .sub(r"\1s0", url)
+            .replace("http:", "https:", 1))
 
 
 class BloggerExtractor(BaseExtractor):
@@ -33,13 +38,12 @@ class BloggerExtractor(BaseExtractor):
         blog["date"] = text.parse_datetime(blog["published"])
         del blog["selfLink"]
 
-        sub = re.compile(r"(/|=)(?:[sw]\d+|w\d+-h\d+)(?=/|$)").sub
-        findall_image = re.compile(
+        findall_image = util.re(
             r'src="(https?://(?:'
             r'blogger\.googleusercontent\.com/img|'
             r'lh\d+(?:-\w+)?\.googleusercontent\.com|'
             r'\d+\.bp\.blogspot\.com)/[^"]+)').findall
-        findall_video = re.compile(
+        findall_video = util.re(
             r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall
 
         metadata = self.metadata()
@@ -48,7 +52,7 @@ class BloggerExtractor(BaseExtractor):
             files = findall_image(content)
             for idx, url in enumerate(files):
-                files[idx] = sub(r"\1s0", url).replace("http:", "https:", 1)
+                files[idx] = original(url)
 
             if self.videos and 'id="BLOG_video-' in content:
                 page = self.request(post["url"]).text
@@ -98,12 +102,8 @@ class BloggerPostExtractor(BloggerExtractor):
     pattern = BASE_PATTERN + r"(/\d\d\d\d/\d\d/[^/?#]+\.html)"
     example = "https://BLOG.blogspot.com/1970/01/TITLE.html"
 
-    def __init__(self, match):
-        BloggerExtractor.__init__(self, match)
-        self.path = match.group(match.lastindex)
-
     def posts(self, blog):
-        return (self.api.post_by_path(blog["id"], self.path),)
+        return (self.api.post_by_path(blog["id"], self.groups[-1]),)
 
 
 class BloggerBlogExtractor(BloggerExtractor):
@@ -122,16 +122,13 @@ class BloggerSearchExtractor(BloggerExtractor):
     pattern = BASE_PATTERN + r"/search/?\?q=([^&#]+)"
     example = "https://BLOG.blogspot.com/search?q=QUERY"
 
-    def __init__(self, match):
-        BloggerExtractor.__init__(self, match)
-        self.query = text.unquote(match.group(match.lastindex))
+    def metadata(self):
+        self.query = query = text.unquote(self.groups[-1])
+        return {"query": query}
 
     def posts(self, blog):
         return self.api.blog_search(blog["id"], self.query)
 
-    def metadata(self):
-        return {"query": self.query}
-
 
 class BloggerLabelExtractor(BloggerExtractor):
     """Extractor for Blogger posts by label"""
@@ -139,21 +136,18 @@ class BloggerLabelExtractor(BloggerExtractor):
     pattern = BASE_PATTERN + r"/search/label/([^/?#]+)"
     example = "https://BLOG.blogspot.com/search/label/LABEL"
 
-    def __init__(self, match):
-        BloggerExtractor.__init__(self, match)
-        self.label = text.unquote(match.group(match.lastindex))
+    def metadata(self):
+        self.label = label = text.unquote(self.groups[-1])
+        return {"label": label}
 
     def posts(self, blog):
         return self.api.blog_posts(blog["id"], self.label)
self.api.blog_posts(blog["id"], self.label) - def metadata(self): - return {"label": self.label} - class BloggerAPI(): - """Minimal interface for the Blogger v3 API + """Minimal interface for the Blogger API v3 - Ref: https://developers.google.com/blogger + https://developers.google.com/blogger """ API_KEY = "AIzaSyCN9ax34oMMyM07g_M-5pjeDp_312eITK8" @@ -162,27 +156,27 @@ class BloggerAPI(): self.api_key = extractor.config("api-key") or self.API_KEY def blog_by_url(self, url): - return self._call("blogs/byurl", {"url": url}, "blog") + return self._call("/blogs/byurl", {"url": url}, "blog") def blog_posts(self, blog_id, label=None): - endpoint = "blogs/{}/posts".format(blog_id) + endpoint = f"/blogs/{blog_id}/posts" params = {"labels": label} return self._pagination(endpoint, params) def blog_search(self, blog_id, query): - endpoint = "blogs/{}/posts/search".format(blog_id) + endpoint = f"/blogs/{blog_id}/posts/search" params = {"q": query} return self._pagination(endpoint, params) def post_by_path(self, blog_id, path): - endpoint = "blogs/{}/posts/bypath".format(blog_id) + endpoint = f"/blogs/{blog_id}/posts/bypath" return self._call(endpoint, {"path": path}, "post") def _call(self, endpoint, params, notfound=None): - url = "https://www.googleapis.com/blogger/v3/" + endpoint + url = "https://www.googleapis.com/blogger/v3" + endpoint params["key"] = self.api_key - return self.extractor.request( - url, params=params, notfound=notfound).json() + return self.extractor.request_json( + url, params=params, notfound=notfound) def _pagination(self, endpoint, params): while True: diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py index 6f4abd5..e2c5334 100644 --- a/gallery_dl/extractor/bluesky.py +++ b/gallery_dl/extractor/bluesky.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2024 Mike Fährmann +# Copyright 2024-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,7 +8,7 @@ """Extractors for https://bsky.app/""" -from .common import Extractor, Message +from .common import Extractor, Message, Dispatch from .. 
import text, util, exception from ..cache import cache, memcache @@ -26,8 +26,7 @@ class BlueskyExtractor(Extractor): root = "https://bsky.app" def _init(self): - meta = self.config("metadata") or () - if meta: + if meta := self.config("metadata") or (): if isinstance(meta, str): meta = meta.replace(" ", "").split(",") elif not isinstance(meta, (list, tuple)): @@ -62,9 +61,8 @@ class BlueskyExtractor(Extractor): yield Message.Directory, post if files: did = post["author"]["did"] - base = ( - "{}/xrpc/com.atproto.sync.getBlob?did={}&cid=".format( - self.api.service_endpoint(did), did)) + base = (f"{self.api.service_endpoint(did)}/xrpc" + f"/com.atproto.sync.getBlob?did={did}&cid=") for post["num"], file in enumerate(files, 1): post.update(file) yield Message.Url, base + file["filename"], post @@ -96,7 +94,7 @@ class BlueskyExtractor(Extractor): uri = record["value"]["subject"]["uri"] if "/app.bsky.feed.post/" in uri: yield from self.api.get_post_thread_uri(uri, depth) - except exception.StopExtraction: + except exception.ControlException: pass # deleted post except Exception as exc: self.log.debug(record, exc_info=exc) @@ -210,16 +208,12 @@ class BlueskyExtractor(Extractor): },) -class BlueskyUserExtractor(BlueskyExtractor): - subcategory = "user" +class BlueskyUserExtractor(Dispatch, BlueskyExtractor): pattern = USER_PATTERN + r"$" example = "https://bsky.app/profile/HANDLE" - def initialize(self): - pass - def items(self): - base = "{}/profile/{}/".format(self.root, self.groups[0]) + base = f"{self.root}/profile/{self.groups[0]}/" default = ("posts" if self.config("quoted", False) or self.config("reposts", False) else "media") return self._dispatch_extractors(( @@ -415,11 +409,9 @@ class BlueskyAPI(): def get_feed(self, actor, feed): endpoint = "app.bsky.feed.getFeed" - params = { - "feed" : "at://{}/app.bsky.feed.generator/{}".format( - self._did_from_actor(actor), feed), - "limit": "100", - } + uri = (f"at://{self._did_from_actor(actor)}" + f"/app.bsky.feed.generator/{feed}") + params = {"feed": uri, "limit": "100"} return self._pagination(endpoint, params) def get_follows(self, actor): @@ -432,16 +424,13 @@ class BlueskyAPI(): def get_list_feed(self, actor, list): endpoint = "app.bsky.feed.getListFeed" - params = { - "list" : "at://{}/app.bsky.graph.list/{}".format( - self._did_from_actor(actor), list), - "limit": "100", - } + uri = f"at://{self._did_from_actor(actor)}/app.bsky.graph.list/{list}" + params = {"list" : uri, "limit": "100"} return self._pagination(endpoint, params) def get_post_thread(self, actor, post_id): - uri = "at://{}/app.bsky.feed.post/{}".format( - self._did_from_actor(actor), post_id) + uri = (f"at://{self._did_from_actor(actor)}" + f"/app.bsky.feed.post/{post_id}") depth = self.extractor.config("depth", "0") return self.get_post_thread_uri(uri, depth) @@ -498,7 +487,7 @@ class BlueskyAPI(): url = "https://plc.directory/" + did try: - data = self.extractor.request(url).json() + data = self.extractor.request_json(url) for service in data["service"]: if service["type"] == "AtprotoPersonalDataServer": return service["serviceEndpoint"] @@ -551,15 +540,15 @@ class BlueskyAPI(): "password" : self.password, } - url = "{}/xrpc/{}".format(self.root, endpoint) + url = f"{self.root}/xrpc/{endpoint}" response = self.extractor.request( url, method="POST", headers=headers, json=data, fatal=None) data = response.json() if response.status_code != 200: self.log.debug("Server response: %s", data) - raise exception.AuthenticationError('"{}: {}"'.format( - data.get("error"), 
data.get("message"))) + raise exception.AuthenticationError( + f"\"{data.get('error')}: {data.get('message')}\"") _refresh_token_cache.update(self.username, data["refreshJwt"]) return "Bearer " + data["accessJwt"] @@ -567,7 +556,7 @@ class BlueskyAPI(): def _call(self, endpoint, params, root=None): if root is None: root = self.root - url = "{}/xrpc/{}".format(root, endpoint) + url = f"{root}/xrpc/{endpoint}" while True: self.authenticate() @@ -581,16 +570,15 @@ class BlueskyAPI(): self.extractor.wait(until=until) continue + msg = "API request failed" try: data = response.json() - msg = "API request failed ('{}: {}')".format( - data["error"], data["message"]) + msg = f"{msg} ('{data['error']}: {data['message']}')" except Exception: - msg = "API request failed ({} {})".format( - response.status_code, response.reason) + msg = f"{msg} ({response.status_code} {response.reason})" self.extractor.log.debug("Server response: %s", response.text) - raise exception.StopExtraction(msg) + raise exception.AbortExtraction(msg) def _pagination(self, endpoint, params, key="feed", root=None, check_empty=False): diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index 7e26f38..3b97e9a 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -27,8 +27,7 @@ class BooruExtractor(BaseExtractor): notes = self.config("notes", False) fetch_html = tags or notes - url_key = self.config("url") - if url_key: + if url_key := self.config("url"): if isinstance(url_key, (list, tuple)): self._file_url = self._file_url_list self._file_url_keys = url_key diff --git a/gallery_dl/extractor/boosty.py b/gallery_dl/extractor/boosty.py index f3e441b..e0383bf 100644 --- a/gallery_dl/extractor/boosty.py +++ b/gallery_dl/extractor/boosty.py @@ -144,8 +144,7 @@ class BoostyExtractor(Extractor): url = block["url"] sep = "&" if "?" in url else "?" 
- signed_query = post.get("signedQuery") - if signed_query: + if signed_query := post.get("signedQuery"): url += sep + signed_query[1:] sep = "&" @@ -218,7 +217,7 @@ class BoostyFollowingExtractor(BoostyExtractor): def items(self): for user in self.api.user_subscriptions(): - url = "{}/{}".format(self.root, user["blog"]["blogUrl"]) + url = f"{self.root}/{user['blog']['blogUrl']}" user["_extractor"] = BoostyUserExtractor yield Message.Queue, url, user @@ -280,15 +279,14 @@ class BoostyAPI(): } if not access_token: - auth = self.extractor.cookies.get("auth", domain=".boosty.to") - if auth: + if auth := self.extractor.cookies.get("auth", domain=".boosty.to"): access_token = text.extr( auth, "%22accessToken%22%3A%22", "%22") if access_token: self.headers["Authorization"] = "Bearer " + access_token def blog_posts(self, username, params): - endpoint = "/v1/blog/{}/post/".format(username) + endpoint = f"/v1/blog/{username}/post/" params = self._merge_params(params, { "limit" : "5", "offset" : None, @@ -298,7 +296,7 @@ class BoostyAPI(): return self._pagination(endpoint, params) def blog_media_album(self, username, type="all", params=()): - endpoint = "/v1/blog/{}/media_album/".format(username) + endpoint = f"/v1/blog/{username}/media_album/" params = self._merge_params(params, { "type" : type.rstrip("s"), "limit" : "15", @@ -318,7 +316,7 @@ class BoostyAPI(): return posts def post(self, username, post_id): - endpoint = "/v1/blog/{}/post/{}".format(username, post_id) + endpoint = f"/v1/blog/{username}/post/{post_id}" return self._call(endpoint) def feed_posts(self, params=None): @@ -381,7 +379,7 @@ class BoostyAPI(): else: self.extractor.log.debug(response.text) - raise exception.StopExtraction("API request failed") + raise exception.AbortExtraction("API request failed") def _pagination(self, endpoint, params, transform=None, key=None): if "is_only_allowed" not in params and self.extractor.only_allowed: @@ -418,11 +416,11 @@ class BoostyAPI(): params["offset"] = offset def dialog(self, dialog_id): - endpoint = "/v1/dialog/{}".format(dialog_id) + endpoint = f"/v1/dialog/{dialog_id}" return self._call(endpoint) def dialog_messages(self, dialog_id, limit=300, offset=None): - endpoint = "/v1/dialog/{}/message/".format(dialog_id) + endpoint = f"/v1/dialog/{dialog_id}/message/" params = { "limit": limit, "reverse": "true", diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 481e962..eba1678 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2022-2023 Mike Fährmann +# Copyright 2022-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -61,6 +61,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): category = "bunkr" root = "https://bunkr.si" root_dl = "https://get.bunkrr.su" + root_api = "https://apidl.bunkr.ru" archive_fmt = "{album_id}_{id|id_url}" pattern = BASE_PATTERN + r"/a/([^/?#]+)" example = "https://bunkr.si/a/ID" @@ -76,9 +77,9 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): endpoint = self.config("endpoint") if not endpoint: - endpoint = self.root_dl + "/api/_001" + endpoint = self.root_api + "/api/_001_v2" elif endpoint[0] == "/": - endpoint = self.root_dl + endpoint + endpoint = self.root_api + endpoint self.endpoint = endpoint self.offset = 0 @@ -123,7 +124,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): pass else: if not DOMAINS: - raise 
exception.StopExtraction( + raise exception.AbortExtraction( "All Bunkr domains require solving a CF challenge") # select alternative domain @@ -168,7 +169,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): info[-1], "%H:%M:%S %d/%m/%Y") yield file - except exception.StopExtraction: + except exception.ControlException: raise except Exception as exc: self.log.error("%s: %s", exc.__class__.__name__, exc) @@ -180,11 +181,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): referer = self.root_dl + "/file/" + data_id headers = {"Referer": referer, "Origin": self.root_dl} - data = self.request(self.endpoint, method="POST", headers=headers, - json={"id": data_id}).json() + data = self.request_json(self.endpoint, method="POST", headers=headers, + json={"id": data_id}) if data.get("encrypted"): - key = "SECRET_KEY_{}".format(data["timestamp"] // 3600) + key = f"SECRET_KEY_{data['timestamp'] // 3600}" file_url = util.decrypt_xor(data["url"], key.encode()) else: file_url = data["url"] diff --git a/gallery_dl/extractor/catbox.py b/gallery_dl/extractor/catbox.py index 6c81f53..22f7a97 100644 --- a/gallery_dl/extractor/catbox.py +++ b/gallery_dl/extractor/catbox.py @@ -26,7 +26,7 @@ class CatboxAlbumExtractor(GalleryExtractor): def metadata(self, page): extr = text.extract_from(page) return { - "album_id" : self.gallery_url.rpartition("/")[2], + "album_id" : self.page_url.rpartition("/")[2], "album_name" : text.unescape(extr("
<h1>
", "<")), "date" : text.parse_datetime(extr( "
<p>
Created ", "<"), "%B %d %Y"), diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py index dc963c5..1da7e23 100644 --- a/gallery_dl/extractor/chevereto.py +++ b/gallery_dl/extractor/chevereto.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2023 Mike Fährmann +# Copyright 2023-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -68,7 +68,7 @@ class CheveretoImageExtractor(CheveretoExtractor): extr('url: "', '"')) if not url or url.endswith("/loading.svg"): pos = page.find(" download=") - url = text.rextract(page, 'href="', '"', pos)[0] + url = text.rextr(page, 'href="', '"', pos) if not url.startswith("https://"): url = util.decrypt_xor( url, b"seltilovessimpcity@simpcityhatesscrapers", diff --git a/gallery_dl/extractor/cien.py b/gallery_dl/extractor/cien.py index 27d50e7..7dfe6b6 100644 --- a/gallery_dl/extractor/cien.py +++ b/gallery_dl/extractor/cien.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2024 Mike Fährmann +# Copyright 2024-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -20,7 +20,7 @@ class CienExtractor(Extractor): request_interval = (1.0, 2.0) def __init__(self, match): - self.root = text.root_from_url(match.group(0)) + self.root = text.root_from_url(match[0]) Extractor.__init__(self, match) def _init(self): @@ -52,8 +52,7 @@ class CienArticleExtractor(CienExtractor): example = "https://ci-en.net/creator/123/article/12345" def items(self): - url = "{}/creator/{}/article/{}".format( - self.root, self.groups[0], self.groups[1]) + url = f"{self.root}/creator/{self.groups[0]}/article/{self.groups[1]}" page = self.request(url, notfound="article").text files = self._extract_files(page) @@ -121,7 +120,7 @@ class CienArticleExtractor(CienExtractor): auth = text.extr(video, ' auth-key="', '"') file = text.nameext_from_url(name) - file["url"] = "{}video-web.mp4?{}".format(path, auth) + file["url"] = f"{path}video-web.mp4?{auth}" file["type"] = "video" files.append(file) @@ -145,12 +144,12 @@ class CienArticleExtractor(CienExtractor): "gallery_id": text.extr(gallery, ' gallery-id="', '"'), "time" : text.extr(gallery, ' time="', '"'), } - data = self.request(url, params=params).json() + data = self.request_json(url, params=params) url = self.root + "/api/creator/gallery/imagePath" for params["page"], params["file_id"] in enumerate( data["imgList"]): - path = self.request(url, params=params).json()["path"] + path = self.request_json(url, params=params)["path"] file = params.copy() file["url"] = path @@ -163,7 +162,7 @@ class CienCreatorExtractor(CienExtractor): example = "https://ci-en.net/creator/123" def items(self): - url = "{}/creator/{}/article".format(self.root, self.groups[0]) + url = f"{self.root}/creator/{self.groups[0]}/article" params = text.parse_query(self.groups[1]) params["mode"] = "list" return self._pagination_articles(url, params) diff --git a/gallery_dl/extractor/civitai.py b/gallery_dl/extractor/civitai.py index 56fe851..dc5b777 100644 --- a/gallery_dl/extractor/civitai.py +++ b/gallery_dl/extractor/civitai.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2024 Mike Fährmann +# Copyright 2024-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,7 +8,7 @@ """Extractors for 
https://www.civitai.com/""" -from .common import Extractor, Message +from .common import Extractor, Message, Dispatch from .. import text, util, exception from ..cache import memcache import itertools @@ -22,9 +22,9 @@ class CivitaiExtractor(Extractor): """Base class for civitai extractors""" category = "civitai" root = "https://civitai.com" - directory_fmt = ("{category}", "{username|user[username]}", "images") - filename_fmt = "{file[id]|id|filename}.{extension}" - archive_fmt = "{file[uuid]|uuid}" + directory_fmt = ("{category}", "{user[username]}", "images") + filename_fmt = "{file[id]}.{extension}" + archive_fmt = "{file[uuid]}" request_interval = (0.5, 1.5) def _init(self): @@ -35,8 +35,7 @@ class CivitaiExtractor(Extractor): self.log.debug("Using tRPC API") self.api = CivitaiTrpcAPI(self) - quality = self.config("quality") - if quality: + if quality := self.config("quality"): if not isinstance(quality, str): quality = ",".join(quality) self._image_quality = quality @@ -45,8 +44,7 @@ class CivitaiExtractor(Extractor): self._image_quality = "original=true" self._image_ext = "png" - quality_video = self.config("quality-videos") - if quality_video: + if quality_video := self.config("quality-videos"): if not isinstance(quality_video, str): quality_video = ",".join(quality_video) if quality_video[0] == "+": @@ -59,28 +57,27 @@ class CivitaiExtractor(Extractor): self._video_quality = "quality=100" self._video_ext = "webm" - metadata = self.config("metadata") - if metadata: + if metadata := self.config("metadata"): if isinstance(metadata, str): metadata = metadata.split(",") elif not isinstance(metadata, (list, tuple)): - metadata = ("generation", "version") + metadata = ("generation", "version", "post") self._meta_generation = ("generation" in metadata) self._meta_version = ("version" in metadata) + self._meta_post = ("post" in metadata) else: - self._meta_generation = self._meta_version = False + self._meta_generation = self._meta_version = self._meta_post = \ + False def items(self): - models = self.models() - if models: + if models := self.models(): data = {"_extractor": CivitaiModelExtractor} for model in models: - url = "{}/models/{}".format(self.root, model["id"]) + url = f"{self.root}/models/{model['id']}" yield Message.Queue, url, data return - posts = self.posts() - if posts: + if posts := self.posts(): for post in posts: if "images" in post: @@ -105,27 +102,37 @@ class CivitaiExtractor(Extractor): yield Message.Url, file["url"], file return - images = self.images() - if images: - for image in images: + if images := self.images(): + for file in images: + + data = { + "file": file, + "user": file.pop("user"), + } if self._meta_generation: - image["generation"] = \ - self._extract_meta_generation(image) + data["generation"] = \ + self._extract_meta_generation(file) if self._meta_version: - image["model"], image["version"] = \ - self._extract_meta_version(image, False) - image["date"] = text.parse_datetime( - image["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ") - - url = self._url(image) - text.nameext_from_url(url, image) - if not image["extension"]: - image["extension"] = ( - self._video_ext if image.get("type") == "video" else + data["model"], data["version"] = \ + self._extract_meta_version(file, False) + if "post" in file: + data["post"] = file.pop("post") + if self._meta_post and "post" not in data: + data["post"] = post = self._extract_meta_post(file) + if post: + post.pop("user", None) + file["date"] = text.parse_datetime( + file["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ") + + data["url"] 
= url = self._url(file) + text.nameext_from_url(url, data) + if not data["extension"]: + data["extension"] = ( + self._video_ext if file.get("type") == "video" else self._image_ext) - yield Message.Directory, image - yield Message.Url, url, image + yield Message.Directory, data + yield Message.Url, url, data return def models(self): @@ -151,12 +158,13 @@ class CivitaiExtractor(Extractor): image["uuid"] = url name = image.get("name") if not name: - mime = image.get("mimeType") or self._image_ext - name = "{}.{}".format(image.get("id"), mime.rpartition("/")[2]) - return ( - "https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7WA/{}/{}/{}".format( - url, quality, name) - ) + if mime := image.get("mimeType"): + name = f"{image.get('id')}.{mime.rpartition('/')[2]}" + else: + ext = self._video_ext if video else self._image_ext + name = f"{image.get('id')}.{ext}" + return (f"https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7WA" + f"/{url}/{quality}/{name}") def _image_results(self, images): for num, file in enumerate(images, 1): @@ -171,10 +179,29 @@ class CivitaiExtractor(Extractor): self._image_ext) if "id" not in file and data["filename"].isdecimal(): file["id"] = text.parse_int(data["filename"]) + if "date" not in file: + file["date"] = text.parse_datetime( + file["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ") if self._meta_generation: file["generation"] = self._extract_meta_generation(file) yield data + def _image_reactions(self): + self._require_auth() + + params = self.params + params["authed"] = True + params["useIndex"] = False + if "reactions" not in params: + params["reactions"] = ("Like", "Dislike", "Heart", "Laugh", "Cry") + return self.api.images(params) + + def _require_auth(self): + if "Authorization" not in self.api.headers and \ + not self.cookies.get( + "__Secure-civitai-token", domain=".civitai.com"): + raise exception.AuthRequired(("'api-key'", "cookies")) + def _parse_query(self, value): return text.parse_query_list( value, {"tags", "reactions", "baseModels", "tools", "techniques", @@ -186,10 +213,18 @@ class CivitaiExtractor(Extractor): except Exception as exc: return self.log.debug("", exc_info=exc) + def _extract_meta_post(self, image): + try: + post = self.api.post(image["postId"]) + post["date"] = text.parse_datetime( + post["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ") + return post + except Exception as exc: + return self.log.debug("", exc_info=exc) + def _extract_meta_version(self, item, is_post=True): try: - version_id = self._extract_version_id(item, is_post) - if version_id: + if version_id := self._extract_version_id(item, is_post): version = self.api.model_version(version_id).copy() return version.pop("model", None), version except Exception as exc: @@ -197,12 +232,11 @@ class CivitaiExtractor(Extractor): return None, None def _extract_version_id(self, item, is_post=True): - version_id = item.get("modelVersionId") - if version_id: + if version_id := item.get("modelVersionId"): return version_id - - version_ids = item.get("modelVersionIds") - if version_ids: + if version_ids := item.get("modelVersionIds"): + return version_ids[0] + if version_ids := item.get("modelVersionIdsManual"): return version_ids[0] if is_post: @@ -285,16 +319,15 @@ class CivitaiModelExtractor(CivitaiExtractor): if not sep: name = ext ext = "bin" - file["uuid"] = "model-{}-{}-{}".format( - model["id"], version["id"], file["id"]) + file["uuid"] = f"model-{model['id']}-{version['id']}-{file['id']}" files.append({ "num" : num, "file" : file, "filename" : name, "extension": ext, - "url" : (file.get("downloadUrl") or 
- "{}/api/download/models/{}".format( - self.root, version["id"])), + "url" : ( + file.get("downloadUrl") or + f"{self.root}/api/download/models/{version['id']}"), "_http_headers" : { "Authorization": self.api.headers.get("Authorization")}, "_http_validate": self._validate_file_model, @@ -308,7 +341,7 @@ class CivitaiModelExtractor(CivitaiExtractor): else: params = { "modelVersionId": version["id"], - "prioritizedUserIds": [user["id"]], + "prioritizedUserIds": (user["id"],), "period": "AllTime", "sort": "Most Reactions", "limit": 20, @@ -327,8 +360,7 @@ class CivitaiModelExtractor(CivitaiExtractor): alert = text.extr( response.text, 'mantine-Alert-message">', "") if alert: - msg = "\"{}\" - 'api-key' required".format( - text.remove_html(alert)) + msg = f"\"{text.remove_html(alert)}\" - 'api-key' required" else: msg = "'api-key' required to download this file" self.log.warning(msg) @@ -366,14 +398,26 @@ class CivitaiTagExtractor(CivitaiExtractor): return self.api.models_tag(tag) -class CivitaiSearchExtractor(CivitaiExtractor): - subcategory = "search" +class CivitaiSearchModelsExtractor(CivitaiExtractor): + subcategory = "search-models" pattern = BASE_PATTERN + r"/search/models\?([^#]+)" example = "https://civitai.com/search/models?query=QUERY" def models(self): - params = text.parse_query(self.groups[0]) - return self.api.models(params) + params = self._parse_query(self.groups[0]) + return CivitaiSearchAPI(self).search_models( + params.get("query"), params.get("sortBy"), self.api.nsfw) + + +class CivitaiSearchImagesExtractor(CivitaiExtractor): + subcategory = "search-images" + pattern = BASE_PATTERN + r"/search/images\?([^#]+)" + example = "https://civitai.com/search/images?query=QUERY" + + def images(self): + params = self._parse_query(self.groups[0]) + return CivitaiSearchAPI(self).search_images( + params.get("query"), params.get("sortBy"), self.api.nsfw) class CivitaiModelsExtractor(CivitaiExtractor): @@ -382,7 +426,7 @@ class CivitaiModelsExtractor(CivitaiExtractor): example = "https://civitai.com/models" def models(self): - params = text.parse_query(self.groups[0]) + params = self._parse_query(self.groups[0]) return self.api.models(params) @@ -392,26 +436,32 @@ class CivitaiImagesExtractor(CivitaiExtractor): example = "https://civitai.com/images" def images(self): - params = text.parse_query(self.groups[0]) + params = self._parse_query(self.groups[0]) return self.api.images(params) -class CivitaiUserExtractor(CivitaiExtractor): - subcategory = "user" +class CivitaiPostsExtractor(CivitaiExtractor): + subcategory = "posts" + pattern = BASE_PATTERN + r"/posts(?:/?\?([^#]+))?(?:$|#)" + example = "https://civitai.com/posts" + + def posts(self): + params = self._parse_query(self.groups[0]) + return self.api.posts(params) + + +class CivitaiUserExtractor(Dispatch, CivitaiExtractor): pattern = USER_PATTERN + r"/?(?:$|\?|#)" example = "https://civitai.com/user/USER" - def initialize(self): - pass - def items(self): - base = "{}/user/{}/".format(self.root, self.groups[0]) + base = f"{self.root}/user/{self.groups[0]}/" return self._dispatch_extractors(( (CivitaiUserModelsExtractor, base + "models"), (CivitaiUserPostsExtractor , base + "posts"), (CivitaiUserImagesExtractor, base + "images"), (CivitaiUserVideosExtractor, base + "videos"), - ), ("user-models", "user-posts")) + ), ("user-images", "user-videos")) class CivitaiUserModelsExtractor(CivitaiExtractor): @@ -446,29 +496,17 @@ class CivitaiUserImagesExtractor(CivitaiExtractor): example = "https://civitai.com/user/USER/images" def 
__init__(self, match): - self.params = self._parse_query(match.group(2)) + user, query = match.groups() + self.params = self._parse_query(query) if self.params.get("section") == "reactions": - self.subcategory = "reactions" - self.images = self.images_reactions + self.subcategory = "reactions-images" + self.images = self._image_reactions + else: + self.params["username"] = text.unquote(user) CivitaiExtractor.__init__(self, match) def images(self): - params = self.params - params["username"] = text.unquote(self.groups[0]) - return self.api.images(params) - - def images_reactions(self): - if "Authorization" not in self.api.headers and \ - not self.cookies.get( - "__Secure-civitai-token", domain=".civitai.com"): - raise exception.AuthorizationError("api-key or cookies required") - - params = self.params - params["authed"] = True - params["useIndex"] = False - if "reactions" not in params: - params["reactions"] = ("Like", "Dislike", "Heart", "Laugh", "Cry") - return self.api.images(params) + return self.api.images(self.params) class CivitaiUserVideosExtractor(CivitaiExtractor): @@ -477,14 +515,40 @@ class CivitaiUserVideosExtractor(CivitaiExtractor): pattern = USER_PATTERN + r"/videos/?(?:\?([^#]+))?" example = "https://civitai.com/user/USER/videos" - def images(self): - self._image_ext = "mp4" + def __init__(self, match): + user, query = match.groups() + self.params = self._parse_query(query) + self.params["types"] = ("video",) + if self.params.get("section") == "reactions": + self.subcategory = "reactions-videos" + self.images = self._image_reactions + else: + self.params["username"] = text.unquote(user) + CivitaiExtractor.__init__(self, match) - user, query = self.groups - params = self._parse_query(query) - params["types"] = ["video"] - params["username"] = text.unquote(user) - return self.api.images(params) + images = CivitaiUserImagesExtractor.images + + +class CivitaiGeneratedExtractor(CivitaiExtractor): + """Extractor for your generated files feed""" + subcategory = "generated" + filename_fmt = "{filename}.{extension}" + directory_fmt = ("{category}", "generated") + pattern = f"{BASE_PATTERN}/generate" + example = "https://civitai.com/generate" + + def items(self): + self._require_auth() + + for gen in self.api.orchestrator_queryGeneratedImages(): + gen["date"] = text.parse_datetime( + gen["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ") + yield Message.Directory, gen + for step in gen.pop("steps", ()): + for image in step.pop("images", ()): + data = {"file": image, **step, **gen} + url = image["url"] + yield Message.Url, url, text.nameext_from_url(url, data) class CivitaiRestAPI(): @@ -498,8 +562,7 @@ class CivitaiRestAPI(): self.root = extractor.root + "/api" self.headers = {"Content-Type": "application/json"} - api_key = extractor.config("api-key") - if api_key: + if api_key := extractor.config("api-key"): extractor.log.debug("Using api_key authentication") self.headers["Authorization"] = "Bearer " + api_key @@ -528,12 +591,12 @@ class CivitaiRestAPI(): }) def model(self, model_id): - endpoint = "/v1/models/{}".format(model_id) + endpoint = f"/v1/models/{model_id}" return self._call(endpoint) @memcache(keyarg=1) def model_version(self, model_version_id): - endpoint = "/v1/model-versions/{}".format(model_version_id) + endpoint = f"/v1/model-versions/{model_version_id}" return self._call(endpoint) def models(self, params): @@ -572,13 +635,12 @@ class CivitaiTrpcAPI(): self.root = extractor.root + "/api/trpc/" self.headers = { "content-type" : "application/json", - "x-client-version": "5.0.701", 
+ "x-client-version": "5.0.920", "x-client-date" : "", "x-client" : "web", "x-fingerprint" : "undefined", } - api_key = extractor.config("api-key") - if api_key: + if api_key := extractor.config("api-key"): extractor.log.debug("Using api_key authentication") self.headers["Authorization"] = "Bearer " + api_key @@ -607,11 +669,11 @@ class CivitaiTrpcAPI(): "useIndex" : True, "period" : "AllTime", "sort" : "Newest", - "types" : ["image"], + "types" : ("image",), "withMeta" : False, # Metadata Only "fromPlatform" : False, # Made On-Site "browsingLevel": self.nsfw, - "include" : ["cosmetics"], + "include" : ("cosmetics",), }) params = self._type_params(params) @@ -690,9 +752,10 @@ class CivitaiTrpcAPI(): "followed" : False, "draftOnly" : False, "pending" : True, - "include" : ["cosmetics"], + "include" : ("cosmetics",), }) + params = self._type_params(params) return self._pagination(endpoint, params, meta) def user(self, username): @@ -700,6 +763,15 @@ class CivitaiTrpcAPI(): params = {"username": username} return (self._call(endpoint, params),) + def orchestrator_queryGeneratedImages(self): + endpoint = "orchestrator.queryGeneratedImages" + params = { + "ascending": False, + "tags" : ("gen",), + "authed" : True, + } + return self._pagination(endpoint, params) + def _call(self, endpoint, params, meta=None): url = self.root + endpoint headers = self.headers @@ -765,4 +837,107 @@ class CivitaiTrpcAPI(): def _bool(value): - return True if value == "true" else False + return value == "true" + + +class CivitaiSearchAPI(): + + def __init__(self, extractor): + self.extractor = extractor + self.root = "https://search.civitai.com" + self.headers = { + "Authorization": "Bearer ab8565e5ab8dc2d8f0d4256d204781cb63fe8b031" + "eb3779cbbed38a7b5308e5c", + "Content-Type": "application/json", + "X-Meilisearch-Client": "Meilisearch instant-meilisearch (v0.13.5)" + " ; Meilisearch JavaScript (v0.34.0)", + "Origin": extractor.root, + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "cors", + "Sec-Fetch-Site": "same-site", + "Priority": "u=4", + } + + def search(self, query, type, facets, nsfw=31): + endpoint = "/multi-search" + + query = { + "q" : query, + "indexUid": type, + "facets" : facets, + "attributesToHighlight": (), + "highlightPreTag" : "__ais-highlight__", + "highlightPostTag": "__/ais-highlight__", + "limit" : 51, + "offset": 0, + "filter": (self._generate_filter(nsfw),), + } + + return self._pagination(endpoint, query) + + def search_models(self, query, type=None, nsfw=31): + facets = ( + "category.name", + "checkpointType", + "fileFormats", + "lastVersionAtUnix", + "tags.name", + "type", + "user.username", + "version.baseModel", + ) + return self.search(query, type or "models_v9", facets, nsfw) + + def search_images(self, query, type=None, nsfw=31): + facets = ( + "aspectRatio", + "baseModel", + "createdAtUnix", + "tagNames", + "techniqueNames", + "toolNames", + "type", + "user.username", + ) + return self.search(query, type or "images_v6", facets, nsfw) + + def _call(self, endpoint, query): + url = self.root + endpoint + params = util.json_dumps({"queries": (query,)}) + + data = self.extractor.request_json( + url, method="POST", headers=self.headers, data=params) + + return data["results"][0] + + def _pagination(self, endpoint, query): + limit = query["limit"] - 1 + threshold = limit // 2 + + while True: + data = self._call(endpoint, query) + + items = data["hits"] + yield from items + + if len(items) < threshold: + return + query["offset"] += limit + + def _generate_filter(self, level): + fltr = [] + 
+ if level & 1: + fltr.append("1") + if level & 2: + fltr.append("2") + if level & 4: + fltr.append("4") + if level & 8: + fltr.append("8") + if level & 16: + fltr.append("16") + + if not fltr: + return "()" + return "(nsfwLevel=" + " OR nsfwLevel=".join(fltr) + ")" diff --git a/gallery_dl/extractor/comick.py b/gallery_dl/extractor/comick.py new file mode 100644 index 0000000..7ef4607 --- /dev/null +++ b/gallery_dl/extractor/comick.py @@ -0,0 +1,198 @@ +# -*- coding: utf-8 -*- + +# Copyright 2025 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://comick.io/""" + +from .common import ChapterExtractor, MangaExtractor, Message +from .. import text +from ..cache import memcache + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?comick\.io" + + +class ComickBase(): + """Base class for comick.io extractors""" + category = "comick" + root = "https://comick.io" + + @memcache(keyarg=1) + def _manga_info(self, slug): + url = f"{self.root}/comic/{slug}" + page = self.request(url).text + data = self._extract_nextdata(page) + props = data["props"]["pageProps"] + comic = props["comic"] + + genre = [] + theme = [] + format = "" + for item in comic["md_comic_md_genres"]: + item = item["md_genres"] + group = item["group"] + if group == "Genre": + genre.append(item["name"]) + elif group == "Theme": + theme.append(item["name"]) + else: + format = item["name"] + + if mu := comic["mu_comics"]: + tags = [c["mu_categories"]["title"] + for c in mu["mu_comic_categories"]] + publisher = [p["mu_publishers"]["title"] + for p in mu["mu_comic_publishers"]] + else: + tags = publisher = () + + return { + "manga": comic["title"], + "manga_id": comic["id"], + "manga_hid": comic["hid"], + "manga_slug": slug, + "manga_titles": [t["title"] for t in comic["md_titles"]], + "artist": [a["name"] for a in props["artists"]], + "author": [a["name"] for a in props["authors"]], + "genre" : genre, + "theme" : theme, + "format": format, + "tags" : tags, + "publisher": publisher, + "published": text.parse_int(comic["year"]), + "description": comic["desc"], + "demographic": props["demographic"], + "origin": comic["iso639_1"], + "mature": props["matureContent"], + "rating": comic["content_rating"], + "rank" : comic["follow_rank"], + "score" : text.parse_float(comic["bayesian_rating"]), + "status": "Complete" if comic["status"] == 2 else "Ongoing", + "links" : comic["links"], + "_build_id": data["buildId"], + } + + def _chapter_info(self, manga, chstr): + slug = manga['manga_slug'] + url = (f"{self.root}/_next/data/{manga['_build_id']}" + f"/comic/{slug}/{chstr}.json") + params = {"slug": slug, "chapter": chstr} + return self.request_json(url, params=params)["pageProps"] + + +class ComickChapterExtractor(ComickBase, ChapterExtractor): + """Extractor for comick.io manga chapters""" + archive_fmt = "{chapter_hid}_{page}" + pattern = BASE_PATTERN + r"/comic/([\w-]+)/(\w+-chapter-[^/?#]+)" + example = "https://comick.io/comic/MANGA/ID-chapter-123-en" + + def metadata(self, page): + slug, chstr = self.groups + manga = self._manga_info(slug) + props = self._chapter_info(manga, chstr) + + ch = props["chapter"] + self._images = ch["md_images"] + chapter, sep, minor = ch["chap"].partition(".") + + return { + **manga, + "title" : props["chapTitle"], + "volume" : text.parse_int(ch["vol"]), + "chapter" : text.parse_int(chapter), + "chapter_minor" : sep + minor, + "chapter_id" : 
ch["id"], + "chapter_hid" : ch["hid"], + "chapter_string": chstr, + "group" : ch["group_name"], + "date" : text.parse_datetime( + ch["created_at"][:19], "%Y-%m-%dT%H:%M:%S"), + "date_updated" : text.parse_datetime( + ch["updated_at"][:19], "%Y-%m-%dT%H:%M:%S"), + "lang" : ch["lang"], + } + + def images(self, page): + return [ + ("https://meo.comick.pictures/" + img["b2key"], { + "width" : img["w"], + "height" : img["h"], + "size" : img["s"], + "optimized": img["optimized"], + }) + for img in self._images + ] + + +class ComickMangaExtractor(ComickBase, MangaExtractor): + """Extractor for comick.io manga""" + pattern = BASE_PATTERN + r"/comic/([\w-]+)/?(?:\?([^#]+))?" + example = "https://comick.io/comic/MANGA" + + def items(self): + slug = self.groups[0] + manga = self._manga_info(slug) + + for ch in self.chapters(manga): + url = (f"{self.root}/comic/{slug}" + f"/{ch['hid']}-chapter-{ch['chap']}-{ch['lang']}") + + ch.update(manga) + chapter, sep, minor = ch["chap"].partition(".") + ch["chapter"] = text.parse_int(chapter) + ch["chapter_minor"] = sep + minor + ch["_extractor"] = ComickChapterExtractor + + yield Message.Queue, url, ch + + def chapters(self, manga): + info = True + slug, query = self.groups + + url = f"https://api.comick.io/comic/{manga['manga_hid']}/chapters" + headers = { + "Origin": "https://comick.io", + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "cors", + "Sec-Fetch-Site": "same-site", + } + + query = text.parse_query(query) + params = {"lang": query.get("lang") or None} + params["page"] = page = text.parse_int(query.get("page"), 1) + + if date_order := query.get("date-order"): + params["date-order"] = date_order + elif chap_order := query.get("chap-order"): + params["chap-order"] = chap_order + else: + params["chap-order"] = \ + "0" if self.config("chapter-reverse", False) else "1" + + group = query.get("group", None) + if group == "0": + group = None + + while True: + data = self.request_json(url, params=params, headers=headers) + limit = data["limit"] + + if info: + info = False + total = data["total"] - limit * page + if total > limit: + self.log.info("Collecting %s chapters", total) + + if group is None: + yield from data["chapters"] + else: + for ch in data["chapters"]: + if group in ch["group_name"]: + yield ch + + if data["total"] <= limit * page: + return + params["page"] = page = page + 1 diff --git a/gallery_dl/extractor/comicvine.py b/gallery_dl/extractor/comicvine.py index d076795..39397b9 100644 --- a/gallery_dl/extractor/comicvine.py +++ b/gallery_dl/extractor/comicvine.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021-2023 Mike Fährmann +# Copyright 2021-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -46,7 +46,7 @@ class ComicvineTagExtractor(BooruExtractor): } while True: - images = self.request(url, params=params).json()["images"] + images = self.request_json(url, params=params)["images"] yield from images if len(images) < self.per_page: @@ -59,8 +59,7 @@ class ComicvineTagExtractor(BooruExtractor): _file_url = operator.itemgetter("original") - @staticmethod - def _prepare(post): + def _prepare(self, post): post["date"] = text.parse_datetime( post["dateCreated"], "%a, %b %d %Y") post["tags"] = [tag["name"] for tag in post["tags"] if tag["name"]] diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index c430ec1..d46152b 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ 
-1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2023 Mike Fährmann +# Copyright 2014-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -17,9 +17,10 @@ import queue import random import getpass import logging -import datetime import requests import threading +from datetime import datetime +from xml.etree import ElementTree from requests.adapters import HTTPAdapter from .message import Message from .. import config, output, text, util, cache, exception @@ -35,6 +36,7 @@ class Extractor(): directory_fmt = ("{category}",) filename_fmt = "{filename}.{extension}" archive_fmt = "" + status = 0 root = "" cookies_domain = "" cookies_index = 0 @@ -53,6 +55,15 @@ class Extractor(): self.url = match.string self.match = match self.groups = match.groups() + self.kwdict = {} + + if self.category in CATEGORY_MAP: + catsub = f"{self.category}:{self.subcategory}" + if catsub in CATEGORY_MAP: + self.category, self.subcategory = CATEGORY_MAP[catsub] + else: + self.category = CATEGORY_MAP[self.category] + self._cfgpath = ("extractor", self.category, self.subcategory) self._parentdir = "" @@ -125,11 +136,10 @@ class Extractor(): if first: first = False values = config.accumulate(extr + path, key) - else: - conf = config.get(extr, path[0]) - if conf: - values[:0] = config.accumulate( - (self.subcategory,), key, conf=conf) + elif conf := config.get(extr, path[0]): + values[:0] = config.accumulate( + (self.subcategory,), key, conf=conf) + return values def request(self, url, method="GET", session=None, @@ -149,17 +159,15 @@ class Extractor(): kwargs["verify"] = self._verify if "json" in kwargs: - json = kwargs["json"] - if json is not None: + if (json := kwargs["json"]) is not None: kwargs["data"] = util.json_dumps(json).encode() del kwargs["json"] - headers = kwargs.get("headers") - if headers: + if headers := kwargs.get("headers"): headers["Content-Type"] = "application/json" else: kwargs["headers"] = {"Content-Type": "application/json"} - response = None + response = challenge = None tries = 1 if self._interval: @@ -172,21 +180,22 @@ class Extractor(): try: response = session.request(method, url, **kwargs) except requests.exceptions.ConnectionError as exc: - code = 0 try: reason = exc.args[0].reason cls = reason.__class__.__name__ pre, _, err = str(reason.args[-1]).partition(":") - msg = " {}: {}".format(cls, (err or pre).lstrip()) + msg = f" {cls}: {(err or pre).lstrip()}" except Exception: msg = exc + code = 0 except (requests.exceptions.Timeout, requests.exceptions.ChunkedEncodingError, requests.exceptions.ContentDecodingError) as exc: msg = exc code = 0 except (requests.exceptions.RequestException) as exc: - raise exception.HttpError(exc) + msg = exc + break else: code = response.status_code if self._write_pages: @@ -201,10 +210,10 @@ class Extractor(): response.encoding = encoding return response if notfound and code == 404: + self.status |= exception.NotFoundError.code raise exception.NotFoundError(notfound) - msg = "'{} {}' for '{}'".format( - code, response.reason, response.url) + msg = f"'{code} {response.reason}' for '{response.url}'" challenge = util.detect_challenge(response) if challenge is not None: @@ -238,13 +247,59 @@ class Extractor(): self.sleep(seconds, "retry") tries += 1 - raise exception.HttpError(msg, response) + if not fatal or fatal is ...: + self.log.warning(msg) + return util.NullResponse(url, msg) + + if challenge is None: + exc = exception.HttpError(msg, response) 
+ else: + exc = exception.ChallengeError(challenge, response) + self.status |= exc.code + raise exc def request_location(self, url, **kwargs): kwargs.setdefault("method", "HEAD") kwargs.setdefault("allow_redirects", False) return self.request(url, **kwargs).headers.get("location", "") + def request_json(self, url, **kwargs): + response = self.request(url, **kwargs) + + try: + return util.json_loads(response.text) + except Exception as exc: + fatal = kwargs.get("fatal", True) + if not fatal or fatal is ...: + if challenge := util.detect_challenge(response): + self.log.warning(challenge) + else: + self.log.warning("%s: %s", exc.__class__.__name__, exc) + return {} + raise + + def request_xml(self, url, xmlns=True, **kwargs): + response = self.request(url, **kwargs) + + if xmlns: + text = response.text + else: + text = response.text.replace(" xmlns=", " ns=") + + parser = ElementTree.XMLParser() + try: + parser.feed(text) + return parser.close() + except Exception as exc: + fatal = kwargs.get("fatal", True) + if not fatal or fatal is ...: + if challenge := util.detect_challenge(response): + self.log.warning(challenge) + else: + self.log.warning("%s: %s", exc.__class__.__name__, exc) + return ElementTree.Element("") + raise + _handle_429 = util.false def wait(self, seconds=None, until=None, adjust=1.0, @@ -255,7 +310,7 @@ class Extractor(): seconds = float(seconds) until = now + seconds elif until: - if isinstance(until, datetime.datetime): + if isinstance(until, datetime): # convert to UTC timestamp until = util.datetime_to_timestamp(until) else: @@ -269,8 +324,8 @@ class Extractor(): return if reason: - t = datetime.datetime.fromtimestamp(until).time() - isotime = "{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second) + t = datetime.fromtimestamp(until).time() + isotime = f"{t.hour:02}:{t.minute:02}:{t.second:02}" self.log.info("Waiting until %s (%s)", isotime, reason) time.sleep(seconds) @@ -295,8 +350,8 @@ class Extractor(): if input is None: input = output.TTY_STDIN if not input: - raise exception.StopExtraction( - "User input required (%s)", prompt.strip(" :")) + raise exception.AbortExtraction( + f"User input required ({prompt.strip(' :')})") def _get_auth_info(self): """Return authentication information as (username, password) tuple""" @@ -366,36 +421,31 @@ class Extractor(): elif platform == "linux": platform = "X11; Linux x86_64" elif platform == "macos": - platform = "Macintosh; Intel Mac OS X 11.5" + platform = "Macintosh; Intel Mac OS X 15.5" if browser == "chrome": if platform.startswith("Macintosh"): - platform = platform.replace(".", "_") + "_2" + platform = platform.replace(".", "_") else: browser = "firefox" - for key, value in HTTP_HEADERS[browser]: + for key, value in HEADERS[browser]: if value and "{}" in value: - headers[key] = value.format(platform) + headers[key] = value.replace("{}", platform) else: headers[key] = value ssl_options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 | ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1) - ssl_ciphers = SSL_CIPHERS[browser] + ssl_ciphers = CIPHERS[browser] else: - useragent = self.config("user-agent") - if useragent is None or useragent == "auto": - useragent = self.useragent - elif useragent == "browser": - useragent = _browser_useragent() - elif self.useragent is not Extractor.useragent and \ - useragent is config.get(("extractor",), "user-agent"): - useragent = self.useragent - headers["User-Agent"] = useragent + headers["User-Agent"] = self.useragent headers["Accept"] = "*/*" headers["Accept-Language"] = "en-US,en;q=0.5" + ssl_ciphers = 
self.ciphers + if ssl_ciphers is not None and ssl_ciphers in CIPHERS: + ssl_ciphers = CIPHERS[ssl_ciphers] if BROTLI: headers["Accept-Encoding"] = "gzip, deflate, br" @@ -404,26 +454,40 @@ class Extractor(): if ZSTD: headers["Accept-Encoding"] += ", zstd" - referer = self.config("referer", self.referer) - if referer: + if referer := self.config("referer", self.referer): if isinstance(referer, str): headers["Referer"] = referer elif self.root: headers["Referer"] = self.root + "/" - custom_headers = self.config("headers") - if custom_headers: + custom_ua = self.config("user-agent") + if custom_ua is None or custom_ua == "auto": + pass + elif custom_ua == "browser": + headers["User-Agent"] = _browser_useragent() + elif self.useragent is Extractor.useragent and not self.browser or \ + custom_ua is not config.get(("extractor",), "user-agent"): + headers["User-Agent"] = custom_ua + + if custom_headers := self.config("headers"): + if isinstance(custom_headers, str): + if custom_headers in HEADERS: + custom_headers = HEADERS[custom_headers] + else: + self.log.error("Invalid 'headers' value '%s'", + custom_headers) + custom_headers = () headers.update(custom_headers) - custom_ciphers = self.config("ciphers") - if custom_ciphers: + if custom_ciphers := self.config("ciphers"): if isinstance(custom_ciphers, list): ssl_ciphers = ":".join(custom_ciphers) + elif custom_ciphers in CIPHERS: + ssl_ciphers = CIPHERS[custom_ciphers] else: ssl_ciphers = custom_ciphers - source_address = self.config("source-address") - if source_address: + if source_address := self.config("source-address"): if isinstance(source_address, str): source_address = (source_address, 0) else: @@ -436,8 +500,17 @@ class Extractor(): ssl_options |= ssl.OP_NO_TLSv1_2 self.log.debug("TLS 1.2 disabled.") + if self.config("truststore"): + try: + from truststore import SSLContext as ssl_ctx + except ImportError as exc: + self.log.error("%s: %s", exc.__class__.__name__, exc) + ssl_ctx = None + else: + ssl_ctx = None + adapter = _build_requests_adapter( - ssl_options, ssl_ciphers, source_address) + ssl_options, ssl_ciphers, ssl_ctx, source_address) session.mount("https://", adapter) session.mount("http://", adapter) @@ -448,10 +521,8 @@ class Extractor(): if self.cookies_domain is None: return - cookies = self.config("cookies") - if cookies: - select = self.config("cookies-select") - if select: + if cookies := self.config("cookies"): + if select := self.config("cookies-select"): if select == "rotate": cookies = cookies[self.cookies_index % len(cookies)] Extractor.cookies_index += 1 @@ -469,9 +540,11 @@ class Extractor(): with open(path) as fp: cookies = util.cookiestxt_load(fp) except Exception as exc: - self.log.warning("cookies: %s", exc) + self.log.warning("cookies: Failed to load '%s' (%s: %s)", + cookies_source, exc.__class__.__name__, exc) else: - self.log.debug("Loading cookies from '%s'", cookies_source) + self.log.debug("cookies: Loading cookies from '%s'", + cookies_source) set_cookie = self.cookies.set_cookie for cookie in cookies: set_cookie(cookie) @@ -479,7 +552,7 @@ class Extractor(): elif isinstance(cookies_source, (list, tuple)): key = tuple(cookies_source) - cookies = _browser_cookies.get(key) + cookies = CACHE_COOKIES.get(key) if cookies is None: from ..cookies import load_cookies @@ -489,18 +562,18 @@ class Extractor(): self.log.warning("cookies: %s", exc) cookies = () else: - _browser_cookies[key] = cookies + CACHE_COOKIES[key] = cookies else: - self.log.debug("Using cached cookies from %s", key) + self.log.debug("cookies: 
Using cached cookies from %s", key) set_cookie = self.cookies.set_cookie for cookie in cookies: set_cookie(cookie) else: - self.log.warning( - "Expected 'dict', 'list', or 'str' value for 'cookies' " - "option, got '%s' (%s)", + self.log.error( + "cookies: Expected 'dict', 'list', or 'str' value for " + "'cookies' option, got '%s' instead (%r)", cookies_source.__class__.__name__, cookies_source) def cookies_store(self): @@ -522,7 +595,8 @@ class Extractor(): util.cookiestxt_store(fp, self.cookies) os.replace(path_tmp, path) except OSError as exc: - self.log.warning("cookies: %s", exc) + self.log.error("cookies: Failed to write to '%s' " + "(%s: %s)", path, exc.__class__.__name__, exc) def cookies_update(self, cookies, domain=""): """Update the session's cookiejar with 'cookies'""" @@ -568,14 +642,17 @@ class Extractor(): if diff <= 0: self.log.warning( - "Cookie '%s' has expired", cookie.name) + "cookies: %s/%s expired at %s", + cookie.domain.lstrip("."), cookie.name, + datetime.fromtimestamp(cookie.expires)) continue elif diff <= 86400: hours = diff // 3600 self.log.warning( - "Cookie '%s' will expire in less than %s hour%s", - cookie.name, hours + 1, "s" if hours else "") + "cookies: %s/%s will expire in less than %s hour%s", + cookie.domain.lstrip("."), cookie.name, + hours + 1, "s" if hours else "") names.discard(cookie.name) if not names: @@ -590,11 +667,6 @@ class Extractor(): return util.json_loads(text.extr( page, ' id="__NEXT_DATA__" type="application/json">', "")) - def _prepare_ddosguard_cookies(self): - if not self.cookies.get("__ddg2", domain=self.cookies_domain): - self.cookies.set( - "__ddg2", util.generate_token(), domain=self.cookies_domain) - def _cache(self, func, maxage, keyarg=None): # return cache.DatabaseCacheDecorator(func, maxage, keyarg) return cache.DatabaseCacheDecorator(func, keyarg, maxage) @@ -608,7 +680,7 @@ class Extractor(): ts = self.config(key, default) if isinstance(ts, str): try: - ts = int(datetime.datetime.strptime(ts, fmt).timestamp()) + ts = int(datetime.strptime(ts, fmt).timestamp()) except ValueError as exc: self.log.warning("Unable to parse '%s': %s", key, exc) ts = default @@ -616,35 +688,12 @@ class Extractor(): fmt = self.config("date-format", "%Y-%m-%dT%H:%M:%S") return get("date-min", dmin), get("date-max", dmax) - def _dispatch_extractors(self, extractor_data, default=()): - """ """ - extractors = { - data[0].subcategory: data - for data in extractor_data - } - - include = self.config("include", default) or () - if include == "all": - include = extractors - elif isinstance(include, str): - include = include.replace(" ", "").split(",") - - result = [(Message.Version, 1)] - for category in include: - try: - extr, url = extractors[category] - except KeyError: - self.log.warning("Invalid include '%s'", category) - else: - result.append((Message.Queue, url, {"_extractor": extr})) - return iter(result) - @classmethod def _dump(cls, obj): util.dump_json(obj, ensure_ascii=False, indent=2) def _dump_response(self, response, history=True): - """Write the response content to a .dump file in the current directory. + """Write the response content to a .txt file in the current directory. 
The file name is derived from the response url, replacing special characters with "_" @@ -657,12 +706,11 @@ class Extractor(): Extractor._dump_index += 1 else: Extractor._dump_index = 1 - Extractor._dump_sanitize = re.compile(r"[\\\\|/<>:\"?*&=#]+").sub + Extractor._dump_sanitize = util.re_compile( + r"[\\\\|/<>:\"?*&=#]+").sub - fname = "{:>02}_{}".format( - Extractor._dump_index, - Extractor._dump_sanitize('_', response.url), - ) + fname = (f"{Extractor._dump_index:>02}_" + f"{Extractor._dump_sanitize('_', response.url)}") if util.WINDOWS: path = os.path.abspath(fname)[:255] @@ -693,19 +741,24 @@ class GalleryExtractor(Extractor): def __init__(self, match, url=None): Extractor.__init__(self, match) - self.gallery_url = self.root + self.groups[0] if url is None else url + + if url is None and (path := self.groups[0]) and path[0] == "/": + self.page_url = f"{self.root}{path}" + else: + self.page_url = url def items(self): self.login() - if self.gallery_url: + if self.page_url: page = self.request( - self.gallery_url, notfound=self.subcategory).text + self.page_url, notfound=self.subcategory).text else: page = None data = self.metadata(page) imgs = self.images(page) + assets = self.assets(page) if "count" in data: if self.config("page-reverse"): @@ -727,7 +780,18 @@ class GalleryExtractor(Extractor): images = enum(imgs, 1) yield Message.Directory, data - for data[self.enum], (url, imgdata) in images: + enum_key = self.enum + + if assets: + for asset in assets: + url = asset["url"] + asset.update(data) + asset[enum_key] = 0 + if "extension" not in asset: + text.nameext_from_url(url, asset) + yield Message.Url, url, asset + + for data[enum_key], (url, imgdata) in images: if imgdata: data.update(imgdata) if "extension" not in imgdata: @@ -743,7 +807,13 @@ class GalleryExtractor(Extractor): """Return a dict with general metadata""" def images(self, page): - """Return a list of all (image-url, metadata)-tuples""" + """Return a list or iterable of all (image-url, metadata)-tuples""" + + def assets(self, page): + """Return an iterable of additional gallery assets + + Each asset must be a 'dict' containing at least 'url' and 'type' + """ class ChapterExtractor(GalleryExtractor): @@ -768,7 +838,11 @@ class MangaExtractor(Extractor): def __init__(self, match, url=None): Extractor.__init__(self, match) - self.manga_url = self.root + self.groups[0] if url is None else url + + if url is None and (path := self.groups[0]) and path[0] == "/": + self.page_url = f"{self.root}{path}" + else: + self.page_url = url if self.config("chapter-reverse", False): self.reverse = not self.reverse @@ -776,8 +850,8 @@ class MangaExtractor(Extractor): def items(self): self.login() - if self.manga_url: - page = self.request(self.manga_url, notfound=self.subcategory).text + if self.page_url: + page = self.request(self.page_url, notfound=self.subcategory).text else: page = None @@ -796,6 +870,45 @@ class MangaExtractor(Extractor): """Return a list of all (chapter-url, metadata)-tuples""" +class Dispatch(): + subcategory = "user" + cookies_domain = None + finalize = Extractor.finalize + skip = Extractor.skip + + def __iter__(self): + return self.items() + + def initialize(self): + pass + + def _dispatch_extractors(self, extractor_data, default=(), alt=None): + extractors = { + data[0].subcategory: data + for data in extractor_data + } + + if alt is not None: + for sub, sub_alt in alt: + extractors[sub_alt] = extractors[sub] + + include = self.config("include", default) or () + if include == "all": + include = extractors + elif 
isinstance(include, str): + include = include.replace(" ", "").split(",") + + results = [(Message.Version, 1)] + for category in include: + try: + extr, url = extractors[category] + except KeyError: + self.log.warning("Invalid include '%s'", category) + else: + results.append((Message.Queue, url, {"_extractor": extr})) + return iter(results) + + class AsynchronousMixin(): """Run info extraction in a separate thread""" @@ -846,7 +959,7 @@ class BaseExtractor(Extractor): if index: self.category, self.root, info = self.instances[index-1] if not self.root: - self.root = text.root_from_url(self.match.group(0)) + self.root = text.root_from_url(self.match[0]) self.config_instance = info.get else: self.root = group @@ -855,8 +968,7 @@ class BaseExtractor(Extractor): @classmethod def update(cls, instances): - extra_instances = config.get(("extractor",), cls.basecategory) - if extra_instances: + if extra_instances := config.get(("extractor",), cls.basecategory): for category, info in extra_instances.items(): if isinstance(info, dict) and "root" in info: instances[category] = info @@ -864,8 +976,7 @@ class BaseExtractor(Extractor): pattern_list = [] instance_list = cls.instances = [] for category, info in instances.items(): - root = info["root"] - if root: + if root := info["root"]: root = root.rstrip("/") instance_list.append((category, root, info)) @@ -898,24 +1009,35 @@ class RequestsAdapter(HTTPAdapter): return HTTPAdapter.proxy_manager_for(self, *args, **kwargs) -def _build_requests_adapter(ssl_options, ssl_ciphers, source_address): - key = (ssl_options, ssl_ciphers, source_address) +def _build_requests_adapter( + ssl_options, ssl_ciphers, ssl_ctx, source_address): + + key = (ssl_options, ssl_ciphers, ssl_ctx, source_address) try: - return _adapter_cache[key] + return CACHE_ADAPTERS[key] except KeyError: pass - if ssl_options or ssl_ciphers: - ssl_context = urllib3.connection.create_urllib3_context( - options=ssl_options or None, ciphers=ssl_ciphers) - if not requests.__version__ < "2.32": - # https://github.com/psf/requests/pull/6731 - ssl_context.load_verify_locations(requests.certs.where()) + if ssl_options or ssl_ciphers or ssl_ctx: + if ssl_ctx is None: + ssl_context = urllib3.connection.create_urllib3_context( + options=ssl_options or None, ciphers=ssl_ciphers) + if not requests.__version__ < "2.32": + # https://github.com/psf/requests/pull/6731 + ssl_context.load_verify_locations(requests.certs.where()) + else: + ssl_ctx_orig = urllib3.util.ssl_.SSLContext + try: + urllib3.util.ssl_.SSLContext = ssl_ctx + ssl_context = urllib3.connection.create_urllib3_context( + options=ssl_options or None, ciphers=ssl_ciphers) + finally: + urllib3.util.ssl_.SSLContext = ssl_ctx_orig ssl_context.check_hostname = False else: ssl_context = None - adapter = _adapter_cache[key] = RequestsAdapter( + adapter = CACHE_ADAPTERS[key] = RequestsAdapter( ssl_context, source_address) return adapter @@ -932,7 +1054,7 @@ def _browser_useragent(): server.listen(1) host, port = server.getsockname() - webbrowser.open("http://{}:{}/user-agent".format(host, port)) + webbrowser.open(f"http://{host}:{port}/user-agent") client = server.accept()[0] server.close() @@ -951,83 +1073,131 @@ def _browser_useragent(): return useragent.decode() -_adapter_cache = {} -_browser_cookies = {} - - -HTTP_HEADERS = { - "firefox": ( - ("User-Agent", "Mozilla/5.0 ({}; " - "rv:128.0) Gecko/20100101 Firefox/128.0"), - ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9," - "image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8"), 
- ("Accept-Language", "en-US,en;q=0.5"), - ("Accept-Encoding", None), - ("Referer", None), - ("Connection", "keep-alive"), - ("Upgrade-Insecure-Requests", "1"), - ("Cookie", None), - ("Sec-Fetch-Dest", "empty"), - ("Sec-Fetch-Mode", "no-cors"), - ("Sec-Fetch-Site", "same-origin"), - ("TE", "trailers"), - ), - "chrome": ( - ("Connection", "keep-alive"), - ("Upgrade-Insecure-Requests", "1"), - ("User-Agent", "Mozilla/5.0 ({}) AppleWebKit/537.36 (KHTML, " - "like Gecko) Chrome/111.0.0.0 Safari/537.36"), - ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9," - "image/avif,image/webp,image/apng,*/*;q=0.8," - "application/signed-exchange;v=b3;q=0.7"), - ("Referer", None), - ("Sec-Fetch-Site", "same-origin"), - ("Sec-Fetch-Mode", "no-cors"), - ("Sec-Fetch-Dest", "empty"), - ("Accept-Encoding", None), - ("Accept-Language", "en-US,en;q=0.9"), - ("cookie", None), - ("content-length", None), - ), +CACHE_ADAPTERS = {} +CACHE_COOKIES = {} +CATEGORY_MAP = () + + +HEADERS_FIREFOX_140 = ( + ("User-Agent", "Mozilla/5.0 ({}; rv:140.0) Gecko/20100101 Firefox/140.0"), + ("Accept", "text/html,application/xhtml+xml," + "application/xml;q=0.9,*/*;q=0.8"), + ("Accept-Language", "en-US,en;q=0.5"), + ("Accept-Encoding", None), + ("Connection", "keep-alive"), + ("Content-Type", None), + ("Content-Length", None), + ("Referer", None), + ("Origin", None), + ("Cookie", None), + ("Sec-Fetch-Dest", "empty"), + ("Sec-Fetch-Mode", "cors"), + ("Sec-Fetch-Site", "same-origin"), + ("TE", "trailers"), +) +HEADERS_FIREFOX_128 = ( + ("User-Agent", "Mozilla/5.0 ({}; rv:128.0) Gecko/20100101 Firefox/128.0"), + ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9," + "image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8"), + ("Accept-Language", "en-US,en;q=0.5"), + ("Accept-Encoding", None), + ("Referer", None), + ("Connection", "keep-alive"), + ("Upgrade-Insecure-Requests", "1"), + ("Cookie", None), + ("Sec-Fetch-Dest", "empty"), + ("Sec-Fetch-Mode", "no-cors"), + ("Sec-Fetch-Site", "same-origin"), + ("TE", "trailers"), +) +HEADERS_CHROMIUM_138 = ( + ("Connection", "keep-alive"), + ("sec-ch-ua", '"Not)A;Brand";v="8", "Chromium";v="138"'), + ("sec-ch-ua-mobile", "?0"), + ("sec-ch-ua-platform", '"Linux"'), + ("Upgrade-Insecure-Requests", "1"), + ("User-Agent", "Mozilla/5.0 ({}) AppleWebKit/537.36 (KHTML, " + "like Gecko) Chrome/138.0.0.0 Safari/537.36"), + ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9," + "image/avif,image/webp,image/apng,*/*;q=0.8," + "application/signed-exchange;v=b3;q=0.7"), + ("Referer", None), + ("Sec-Fetch-Site", "same-origin"), + ("Sec-Fetch-Mode", "no-cors"), + # ("Sec-Fetch-User", "?1"), + ("Sec-Fetch-Dest", "empty"), + ("Accept-Encoding", None), + ("Accept-Language", "en-US,en;q=0.9"), +) +HEADERS_CHROMIUM_111 = ( + ("Connection", "keep-alive"), + ("Upgrade-Insecure-Requests", "1"), + ("User-Agent", "Mozilla/5.0 ({}) AppleWebKit/537.36 (KHTML, " + "like Gecko) Chrome/111.0.0.0 Safari/537.36"), + ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9," + "image/avif,image/webp,image/apng,*/*;q=0.8," + "application/signed-exchange;v=b3;q=0.7"), + ("Referer", None), + ("Sec-Fetch-Site", "same-origin"), + ("Sec-Fetch-Mode", "no-cors"), + ("Sec-Fetch-Dest", "empty"), + ("Accept-Encoding", None), + ("Accept-Language", "en-US,en;q=0.9"), + ("cookie", None), + ("content-length", None), +) +HEADERS = { + "firefox" : HEADERS_FIREFOX_140, + "firefox/140": HEADERS_FIREFOX_140, + "firefox/128": HEADERS_FIREFOX_128, + "chrome" : HEADERS_CHROMIUM_138, + 
"chrome/138" : HEADERS_CHROMIUM_138, + "chrome/111" : HEADERS_CHROMIUM_111, } -SSL_CIPHERS = { - "firefox": ( - "TLS_AES_128_GCM_SHA256:" - "TLS_CHACHA20_POLY1305_SHA256:" - "TLS_AES_256_GCM_SHA384:" - "ECDHE-ECDSA-AES128-GCM-SHA256:" - "ECDHE-RSA-AES128-GCM-SHA256:" - "ECDHE-ECDSA-CHACHA20-POLY1305:" - "ECDHE-RSA-CHACHA20-POLY1305:" - "ECDHE-ECDSA-AES256-GCM-SHA384:" - "ECDHE-RSA-AES256-GCM-SHA384:" - "ECDHE-ECDSA-AES256-SHA:" - "ECDHE-ECDSA-AES128-SHA:" - "ECDHE-RSA-AES128-SHA:" - "ECDHE-RSA-AES256-SHA:" - "AES128-GCM-SHA256:" - "AES256-GCM-SHA384:" - "AES128-SHA:" - "AES256-SHA" - ), - "chrome": ( - "TLS_AES_128_GCM_SHA256:" - "TLS_AES_256_GCM_SHA384:" - "TLS_CHACHA20_POLY1305_SHA256:" - "ECDHE-ECDSA-AES128-GCM-SHA256:" - "ECDHE-RSA-AES128-GCM-SHA256:" - "ECDHE-ECDSA-AES256-GCM-SHA384:" - "ECDHE-RSA-AES256-GCM-SHA384:" - "ECDHE-ECDSA-CHACHA20-POLY1305:" - "ECDHE-RSA-CHACHA20-POLY1305:" - "ECDHE-RSA-AES128-SHA:" - "ECDHE-RSA-AES256-SHA:" - "AES128-GCM-SHA256:" - "AES256-GCM-SHA384:" - "AES128-SHA:" - "AES256-SHA" - ), +CIPHERS_FIREFOX = ( + "TLS_AES_128_GCM_SHA256:" + "TLS_CHACHA20_POLY1305_SHA256:" + "TLS_AES_256_GCM_SHA384:" + "ECDHE-ECDSA-AES128-GCM-SHA256:" + "ECDHE-RSA-AES128-GCM-SHA256:" + "ECDHE-ECDSA-CHACHA20-POLY1305:" + "ECDHE-RSA-CHACHA20-POLY1305:" + "ECDHE-ECDSA-AES256-GCM-SHA384:" + "ECDHE-RSA-AES256-GCM-SHA384:" + "ECDHE-ECDSA-AES256-SHA:" + "ECDHE-ECDSA-AES128-SHA:" + "ECDHE-RSA-AES128-SHA:" + "ECDHE-RSA-AES256-SHA:" + "AES128-GCM-SHA256:" + "AES256-GCM-SHA384:" + "AES128-SHA:" + "AES256-SHA" +) +CIPHERS_CHROMIUM = ( + "TLS_AES_128_GCM_SHA256:" + "TLS_AES_256_GCM_SHA384:" + "TLS_CHACHA20_POLY1305_SHA256:" + "ECDHE-ECDSA-AES128-GCM-SHA256:" + "ECDHE-RSA-AES128-GCM-SHA256:" + "ECDHE-ECDSA-AES256-GCM-SHA384:" + "ECDHE-RSA-AES256-GCM-SHA384:" + "ECDHE-ECDSA-CHACHA20-POLY1305:" + "ECDHE-RSA-CHACHA20-POLY1305:" + "ECDHE-RSA-AES128-SHA:" + "ECDHE-RSA-AES256-SHA:" + "AES128-GCM-SHA256:" + "AES256-GCM-SHA384:" + "AES128-SHA:" + "AES256-SHA" +) +CIPHERS = { + "firefox" : CIPHERS_FIREFOX, + "firefox/140": CIPHERS_FIREFOX, + "firefox/128": CIPHERS_FIREFOX, + "chrome" : CIPHERS_CHROMIUM, + "chrome/138" : CIPHERS_CHROMIUM, + "chrome/111" : CIPHERS_CHROMIUM, } diff --git a/gallery_dl/extractor/cyberdrop.py b/gallery_dl/extractor/cyberdrop.py index e150829..b3944f7 100644 --- a/gallery_dl/extractor/cyberdrop.py +++ b/gallery_dl/extractor/cyberdrop.py @@ -32,7 +32,7 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor): yield Message.Url, file["url"], file def fetch_album(self, album_id): - url = "{}/a/{}".format(self.root, album_id) + url = f"{self.root}/a/{album_id}" page = self.request(url).text extr = text.extract_from(page) @@ -60,9 +60,9 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor): def _extract_files(self, file_ids): for file_id in file_ids: try: - url = "{}/api/file/info/{}".format(self.root_api, file_id) - file = self.request(url).json() - auth = self.request(file["auth_url"]).json() + url = f"{self.root_api}/api/file/info/{file_id}" + file = self.request_json(url) + auth = self.request_json(file["auth_url"]) file["url"] = auth["url"] except Exception as exc: self.log.warning("%s (%s: %s)", diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index 06c31b9..ff071c5 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2023 Mike Fährmann +# Copyright 2014-2025 Mike Fährmann # # This program is free software; you can redistribute it 
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 06c31b9..ff071c5 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2014-2023 Mike Fährmann
+# Copyright 2014-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -49,8 +49,7 @@ class DanbooruExtractor(BaseExtractor):
     def items(self):
         # 'includes' initialization must be done here and not in '_init()'
         # or it'll cause an exception with e621 when 'metadata' is enabled
-        includes = self.config("metadata")
-        if includes:
+        if includes := self.config("metadata"):
             if isinstance(includes, (list, tuple)):
                 includes = ",".join(includes)
             elif not isinstance(includes, str):
@@ -112,8 +111,7 @@ class DanbooruExtractor(BaseExtractor):
     def items_artists(self):
         for artist in self.artists():
             artist["_extractor"] = DanbooruTagExtractor
-            url = "{}/posts?tags={}".format(
-                self.root, text.quote(artist["name"]))
+            url = f"{self.root}/posts?tags={text.quote(artist['name'])}"
             yield Message.Queue, url, artist
 
     def metadata(self):
@@ -129,7 +127,7 @@ class DanbooruExtractor(BaseExtractor):
         first = True
 
         while True:
-            posts = self.request(url, params=params).json()
+            posts = self.request_json(url, params=params)
             if isinstance(posts, dict):
                 posts = posts["posts"]
 
@@ -142,8 +140,7 @@ class DanbooruExtractor(BaseExtractor):
                 }
                 data = {
                     meta["id"]: meta
-                    for meta in self.request(
-                        url, params=params_meta).json()
+                    for meta in self.request_json(url, params=params_meta)
                 }
                 for post in posts:
                     post.update(data[post["id"]])
@@ -157,7 +154,7 @@ class DanbooruExtractor(BaseExtractor):
                 return
 
             if prefix:
-                params["page"] = "{}{}".format(prefix, posts[-1]["id"])
+                params["page"] = f"{prefix}{posts[-1]['id']}"
             elif params["page"]:
                 params["page"] += 1
             else:
@@ -165,11 +162,17 @@ class DanbooruExtractor(BaseExtractor):
             first = False
 
     def _ugoira_frames(self, post):
-        data = self.request("{}/posts/{}.json?only=media_metadata".format(
-            self.root, post["id"])
-        ).json()["media_metadata"]["metadata"]
+        data = self.request_json(
+            f"{self.root}/posts/{post['id']}.json?only=media_metadata"
+        )["media_metadata"]["metadata"]
+
+        if "Ugoira:FrameMimeType" in data:
+            ext = data["Ugoira:FrameMimeType"].rpartition("/")[2]
+            if ext == "jpeg":
+                ext = "jpg"
+        else:
+            ext = data["ZIP:ZipFileName"].rpartition(".")[2]
-        ext = data["ZIP:ZipFileName"].rpartition(".")[2]
 
         fmt = ("{:>06}."
+ ext).format delays = data["Ugoira:FrameDelays"] return [{"file": fmt(index), "delay": delay} @@ -180,15 +183,15 @@ class DanbooruExtractor(BaseExtractor): order = self.config("order-posts") if not order or order in {"asc", "pool", "pool_asc", "asc_pool"}: - params = {"tags": "ord{}:{}".format(ctype, cid)} + params = {"tags": f"ord{ctype}:{cid}"} elif order in {"id", "desc_id", "id_desc"}: - params = {"tags": "{}:{}".format(ctype, cid)} + params = {"tags": f"{ctype}:{cid}"} prefix = "b" elif order in {"desc", "desc_pool", "pool_desc"}: - params = {"tags": "ord{}:{}".format(ctype, cid)} + params = {"tags": f"ord{ctype}:{cid}"} reverse = True elif order in {"asc_id", "id_asc"}: - params = {"tags": "{}:{}".format(ctype, cid)} + params = {"tags": f"{ctype}:{cid}"} reverse = True posts = self._pagination("/posts.json", params, prefix) @@ -199,8 +202,8 @@ class DanbooruExtractor(BaseExtractor): return self._collection_enumerate(posts) def _collection_metadata(self, cid, ctype, cname=None): - url = "{}/{}s/{}.json".format(self.root, cname or ctype, cid) - collection = self.request(url).json() + url = f"{self.root}/{cname or ctype}s/{cid}.json" + collection = self.request_json(url) collection["name"] = collection["name"].replace("_", " ") self.post_ids = collection.pop("post_ids", ()) return {ctype: collection} @@ -315,11 +318,11 @@ class DanbooruPostExtractor(DanbooruExtractor): example = "https://danbooru.donmai.us/posts/12345" def posts(self): - url = "{}/posts/{}.json".format(self.root, self.groups[-1]) - post = self.request(url).json() + url = f"{self.root}/posts/{self.groups[-1]}.json" + post = self.request_json(url) if self.includes: params = {"only": self.includes} - post.update(self.request(url, params=params).json()) + post.update(self.request_json(url, params=params)) return (post,) @@ -357,8 +360,8 @@ class DanbooruArtistExtractor(DanbooruExtractor): items = DanbooruExtractor.items_artists def artists(self): - url = "{}/artists/{}.json".format(self.root, self.groups[-1]) - return (self.request(url).json(),) + url = f"{self.root}/artists/{self.groups[-1]}.json" + return (self.request_json(url),) class DanbooruArtistSearchExtractor(DanbooruExtractor): @@ -375,7 +378,7 @@ class DanbooruArtistSearchExtractor(DanbooruExtractor): params["page"] = text.parse_int(params.get("page"), 1) while True: - artists = self.request(url, params=params).json() + artists = self.request_json(url, params=params) yield from artists diff --git a/gallery_dl/extractor/dankefuerslesen.py b/gallery_dl/extractor/dankefuerslesen.py new file mode 100644 index 0000000..a2b0f42 --- /dev/null +++ b/gallery_dl/extractor/dankefuerslesen.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- + +# Copyright 2025 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://danke.moe/""" + +from .common import ChapterExtractor, MangaExtractor +from .. 
import text, util +from ..cache import memcache + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?danke\.moe" + + +class DankefuerslesenBase(): + """Base class for dankefuerslesen extractors""" + category = "dankefuerslesen" + root = "https://danke.moe" + + @memcache(keyarg=1) + def _manga_info(self, slug): + url = f"{self.root}/api/series/{slug}/" + return self.request_json(url) + + +class DankefuerslesenChapterExtractor(DankefuerslesenBase, ChapterExtractor): + """Extractor for Danke fürs Lesen manga chapters""" + pattern = BASE_PATTERN + r"/read/manga/([\w-]+)/([\w-]+)" + example = "https://danke.moe/read/manga/TITLE/123/1/" + + def _init(self): + self.zip = self.config("zip", False) + if self.zip: + self.filename_fmt = f"{self.directory_fmt[-1]}.{{extension}}" + self.directory_fmt = self.directory_fmt[:-1] + + def metadata(self, page): + slug, ch = self.groups + manga = self._manga_info(slug) + + if "-" in ch: + chapter, sep, minor = ch.rpartition("-") + ch = ch.replace("-", ".") + minor = "." + minor + else: + chapter = ch + minor = "" + + data = manga["chapters"][ch] + group_id, self._files = next(iter(data["groups"].items())) + + if not self.zip: + self.base = (f"{self.root}/media/manga/{slug}/chapters" + f"/{data['folder']}/{group_id}/") + + return { + "manga" : manga["title"], + "manga_slug": manga["slug"], + "title" : data["title"], + "volume" : text.parse_int(data["volume"]), + "chapter" : text.parse_int(chapter), + "chapter_minor": minor, + "group" : manga["groups"][group_id].split(" & "), + "group_id" : text.parse_int(group_id), + "date" : text.parse_timestamp(data["release_date"][group_id]), + "lang" : util.NONE, + "language" : util.NONE, + } + + def images(self, page): + if self.zip: + return () + + base = self.base + return [(base + file, None) for file in self._files] + + def assets(self, page): + if self.zip: + slug, ch = self.groups + url = f"{self.root}/api/download_chapter/{slug}/{ch}/" + return ({ + "type" : "archive", + "extension": "zip", + "url" : url, + },) + + +class DankefuerslesenMangaExtractor(DankefuerslesenBase, MangaExtractor): + """Extractor for Danke fürs Lesen manga""" + chapterclass = DankefuerslesenChapterExtractor + reverse = False + pattern = BASE_PATTERN + r"/read/manga/([^/?#]+)" + example = "https://danke.moe/read/manga/TITLE/" + + def chapters(self, page): + results = [] + + manga = self._manga_info(self.groups[0]).copy() + manga["lang"] = util.NONE + manga["language"] = util.NONE + + base = f"{self.root}/read/manga/{manga['slug']}/" + for ch, data in manga.pop("chapters").items(): + + if "." 
in ch: + chapter, sep, minor = ch.rpartition(".") + ch = ch.replace('.', '-') + data["chapter"] = text.parse_int(chapter) + data["chapter_minor"] = sep + minor + else: + data["chapter"] = text.parse_int(ch) + data["chapter_minor"] = "" + + manga.update(data) + results.append((f"{base}{ch}/1/", manga)) + + return results diff --git a/gallery_dl/extractor/desktopography.py b/gallery_dl/extractor/desktopography.py index 35bb299..364d88f 100644 --- a/gallery_dl/extractor/desktopography.py +++ b/gallery_dl/extractor/desktopography.py @@ -46,10 +46,10 @@ class DesktopographyExhibitionExtractor(DesktopographyExtractor): def __init__(self, match): DesktopographyExtractor.__init__(self, match) - self.year = match.group(1) + self.year = match[1] def items(self): - url = "{}/exhibition-{}/".format(self.root, self.year) + url = f"{self.root}/exhibition-{self.year}/" base_entry_url = "https://desktopography.net/portfolios/" page = self.request(url).text @@ -75,10 +75,10 @@ class DesktopographyEntryExtractor(DesktopographyExtractor): def __init__(self, match): DesktopographyExtractor.__init__(self, match) - self.entry = match.group(1) + self.entry = match[1] def items(self): - url = "{}/portfolios/{}".format(self.root, self.entry) + url = f"{self.root}/portfolios/{self.entry}" page = self.request(url).text entry_data = {"entry": self.entry} diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 37f57fe..66e2a1e 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2023 Mike Fährmann +# Copyright 2015-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,14 +8,13 @@ """Extractors for https://www.deviantart.com/""" -from .common import Extractor, Message +from .common import Extractor, Message, Dispatch from .. import text, util, exception from ..cache import cache, memcache import collections import mimetypes import binascii import time -import re BASE_PATTERN = ( r"(?:https?://)?(?:" @@ -37,7 +36,7 @@ class DeviantartExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.user = (match.group(1) or match.group(2) or "").lower() + self.user = (match[1] or match[2] or "").lower() self.offset = 0 def _init(self): @@ -56,8 +55,7 @@ class DeviantartExtractor(Extractor): self.group = False self._premium_cache = {} - unwatch = self.config("auto-unwatch") - if unwatch: + if self.config("auto-unwatch"): self.unwatch = [] self.finalize = self._unwatch_premium else: @@ -66,10 +64,13 @@ class DeviantartExtractor(Extractor): if self.quality: if self.quality == "png": self.quality = "-fullview.png?" 
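Note: the quality handling set up in this hunk amounts to a precompiled regex substitution over wixmp CDN URLs, swapping the embedded quality token for the configured one. Roughly (the URL below is made up for illustration):

    import re

    quality_sub = re.compile(r",q_\d+").sub
    url = ".../v1/fill/w_1280,h_720,q_75,strp/sample.jpg"
    print(quality_sub(",q_100", url, 1))
    # .../v1/fill/w_1280,h_720,q_100,strp/sample.jpg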
- self.quality_sub = re.compile(r"-fullview\.[a-z0-9]+\?").sub + self.quality_sub = util.re(r"-fullview\.[a-z0-9]+\?").sub else: - self.quality = ",q_{}".format(self.quality) - self.quality_sub = re.compile(r",q_\d+").sub + self.quality = f",q_{self.quality}" + self.quality_sub = util.re(r",q_\d+").sub + + if self.intermediary: + self.intermediary_subn = util.re(r"(/f/[^/]+/[^/]+)/v\d+/.*").subn if isinstance(self.original, str) and \ self.original.lower().startswith("image"): @@ -116,15 +117,13 @@ class DeviantartExtractor(Extractor): def items(self): if self.user: - group = self.config("group", True) - if group: - user = _user_details(self, self.user) - if user: + if group := self.config("group", True): + if user := _user_details(self, self.user): self.user = user["username"] self.group = False elif group == "skip": self.log.info("Skipping group '%s'", self.user) - raise exception.StopExtraction() + raise exception.AbortExtraction() else: self.subcategory = "group-" + self.subcategory self.group = True @@ -177,8 +176,7 @@ class DeviantartExtractor(Extractor): yield self.commit(deviation, deviation["flash"]) if self.commit_journal: - journal = self._extract_journal(deviation) - if journal: + if journal := self._extract_journal(deviation): if self.extra: deviation["_journal"] = journal["html"] deviation["is_original"] = True @@ -194,7 +192,7 @@ class DeviantartExtractor(Extractor): continue _user_details.update(name, user) - url = "{}/{}/avatar/".format(self.root, name) + url = f"{self.root}/{name}/avatar/" comment["_extractor"] = DeviantartAvatarExtractor yield Message.Queue, url, comment @@ -225,7 +223,7 @@ class DeviantartExtractor(Extractor): if txt is None: continue for match in DeviantartStashExtractor.pattern.finditer(txt): - url = text.ensure_http_scheme(match.group(0)) + url = text.ensure_http_scheme(match[0]) deviation["_extractor"] = DeviantartStashExtractor yield Message.Queue, url, deviation @@ -271,15 +269,14 @@ class DeviantartExtractor(Extractor): ) # filename metadata - sub = re.compile(r"\W").sub + sub = util.re(r"\W").sub deviation["filename"] = "".join(( sub("_", deviation["title"].lower()), "_by_", sub("_", deviation["author"]["username"].lower()), "-d", deviation["index_base36"], )) - @staticmethod - def commit(deviation, target): + def commit(self, deviation, target): url = target["src"] name = target.get("filename") or url target = target.copy() @@ -321,7 +318,7 @@ class DeviantartExtractor(Extractor): header = HEADER_TEMPLATE.format( title=title, url=url, - userurl="{}/{}/".format(self.root, urlname), + userurl=f"{self.root}/{urlname}/", username=username, date=deviation["date"], ) @@ -388,8 +385,7 @@ class DeviantartExtractor(Extractor): deviations = state["@@entities"]["deviation"] content = deviations.popitem()[1]["textContent"] - html = self._textcontent_to_html(deviation, content) - if html: + if html := self._textcontent_to_html(deviation, content): return {"html": html} return {"html": content["excerpt"].replace("\n", "
")} @@ -431,12 +427,11 @@ class DeviantartExtractor(Extractor): type = content["type"] if type == "paragraph": - children = content.get("content") - if children: + if children := content.get("content"): html.append('

\ if content["src"].startswith("https://images-wixmp-"): if self.intermediary and deviation["index"] <= 790677560: # https://github.com/r888888888/danbooru/issues/4069 - intermediary, count = re.subn( - r"(/f/[^/]+/[^/]+)/v\d+/.*", + intermediary, count = self.intermediary_subn( r"/intermediary\1", content["src"], 1) if count: deviation["is_original"] = False @@ -679,11 +671,10 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\ return content - @staticmethod - def _find_folder(folders, name, uuid): + def _find_folder(self, folders, name, uuid): if uuid.isdecimal(): - match = re.compile(name.replace( - "-", r"[^a-z0-9]+") + "$", re.IGNORECASE).match + match = util.re( + "(?i)" + name.replace("-", "[^a-z0-9]+") + "$").match for folder in folders: if match(folder["name"]): return folder @@ -702,10 +693,10 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\ raise exception.NotFoundError("folder") def _folder_urls(self, folders, category, extractor): - base = "{}/{}/{}/".format(self.root, self.user, category) + base = f"{self.root}/{self.user}/{category}/" for folder in folders: folder["_extractor"] = extractor - url = "{}{}/{}".format(base, folder["folderid"], folder["name"]) + url = f"{base}{folder['folderid']}/{folder['name']}" yield url, folder def _update_content_default(self, deviation, content): @@ -748,13 +739,10 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\ deviation["_fallback"] = (content["src"],) deviation["is_original"] = True + pl = binascii.b2a_base64(payload).rstrip(b'=\n').decode() content["src"] = ( - "{}?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJub25lIn0.{}.".format( - url, - # base64 of 'header' is precomputed as 'eyJ0eX...' - # binascii.b2a_base64(header).rstrip(b"=\n").decode(), - binascii.b2a_base64(payload).rstrip(b"=\n").decode()) - ) + # base64 of 'header' is precomputed as 'eyJ0eX...' 
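Note: the precomputed constant mentioned in the comment above checks out -- the token built here is an unsigned JWT ("alg": "none"), so the header segment is a fixed string, only the payload segment needs encoding at runtime, and the signature after the trailing dot stays empty:

    import binascii

    header = b'{"typ":"JWT","alg":"none"}'
    print(binascii.b2a_base64(header).rstrip(b"=\n").decode())
    # eyJ0eXAiOiJKV1QiLCJhbGciOiJub25lIn0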
+ f"{url}?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJub25lIn0.{pl}.") def _extract_comments(self, target_id, target_type="deviation"): results = None @@ -845,8 +833,7 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\ for fmt in media["types"] } - tokens = media.get("token") or () - if tokens: + if tokens := media.get("token") or (): if len(tokens) <= 1: fmt = formats[format] if "c" in fmt: @@ -873,19 +860,13 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\ .replace("\\\\", "\\") -class DeviantartUserExtractor(DeviantartExtractor): +class DeviantartUserExtractor(Dispatch, DeviantartExtractor): """Extractor for an artist's user profile""" - subcategory = "user" pattern = BASE_PATTERN + r"/?$" example = "https://www.deviantart.com/USER" - def initialize(self): - pass - - skip = Extractor.skip - def items(self): - base = "{}/{}/".format(self.root, self.user) + base = f"{self.root}/{self.user}/" return self._dispatch_extractors(( (DeviantartAvatarExtractor , base + "avatar"), (DeviantartBackgroundExtractor, base + "banner"), @@ -950,8 +931,8 @@ class DeviantartAvatarExtractor(DeviantartExtractor): fmt, _, ext = fmt.rpartition(".") if fmt: fmt = "-" + fmt - url = "https://a.deviantart.net/avatars{}/{}/{}/{}.{}?{}".format( - fmt, name[0], name[1], name, ext, index) + url = (f"https://a.deviantart.net/avatars{fmt}" + f"/{name[0]}/{name[1]}/{name}.{ext}?{index}") results.append(self._make_deviation(url, user, index, fmt)) return results @@ -995,8 +976,8 @@ class DeviantartFolderExtractor(DeviantartExtractor): def __init__(self, match): DeviantartExtractor.__init__(self, match) self.folder = None - self.folder_id = match.group(3) - self.folder_name = match.group(4) + self.folder_id = match[3] + self.folder_name = match[4] def deviations(self): folders = self.api.gallery_folders(self.user) @@ -1049,7 +1030,7 @@ class DeviantartStashExtractor(DeviantartExtractor): def __init__(self, match): DeviantartExtractor.__init__(self, match) - self.user = None + self.user = "" def deviations(self, stash_id=None, stash_data=None): if stash_id is None: @@ -1067,8 +1048,7 @@ class DeviantartStashExtractor(DeviantartExtractor): page = self._limited_request(url).text if stash_id[0] == "0": - uuid = text.extr(page, '//deviation/', '"') - if uuid: + if uuid := text.extr(page, '//deviation/', '"'): deviation = self.api.deviation(uuid) deviation["_page"] = page deviation["index"] = text.parse_int(text.extr( @@ -1091,8 +1071,7 @@ class DeviantartStashExtractor(DeviantartExtractor): yield deviation return - stash_data = text.extr(page, ',\\"stash\\":', ',\\"@@') - if stash_data: + if stash_data := text.extr(page, ',\\"stash\\":', ',\\"@@'): stash_data = util.json_loads(self._unescape_json(stash_data)) for sid in text.extract_iter( @@ -1130,8 +1109,8 @@ class DeviantartCollectionExtractor(DeviantartExtractor): def __init__(self, match): DeviantartExtractor.__init__(self, match) self.collection = None - self.collection_id = match.group(3) - self.collection_name = match.group(4) + self.collection_id = match[3] + self.collection_name = match[4] def deviations(self): folders = self.api.collections_folders(self.user) @@ -1173,15 +1152,15 @@ class DeviantartStatusExtractor(DeviantartExtractor): def deviations(self): for status in self.api.user_statuses(self.user, self.offset): - yield from self.status(status) + yield from self.process_status(status) - def status(self, status): + def process_status(self, status): for item in status.get("items") or (): # do not trust is_share # shared deviations/statuses if "deviation" in item: 
yield item["deviation"].copy() if "status" in item: - yield from self.status(item["status"].copy()) + yield from self.process_status(item["status"].copy()) # assume is_deleted == true means necessary fields are missing if status["is_deleted"]: self.log.warning( @@ -1233,7 +1212,8 @@ class DeviantartTagExtractor(DeviantartExtractor): def __init__(self, match): DeviantartExtractor.__init__(self, match) - self.tag = text.unquote(match.group(1)) + self.tag = text.unquote(match[1]) + self.user = "" def deviations(self): return self.api.browse_tags(self.tag, self.offset) @@ -1282,16 +1262,16 @@ class DeviantartDeviationExtractor(DeviantartExtractor): def __init__(self, match): DeviantartExtractor.__init__(self, match) - self.type = match.group(3) + self.type = match[3] self.deviation_id = \ - match.group(4) or match.group(5) or id_from_base36(match.group(6)) + match[4] or match[5] or id_from_base36(match[6]) def deviations(self): if self.user: - url = "{}/{}/{}/{}".format( - self.root, self.user, self.type or "art", self.deviation_id) + url = (f"{self.root}/{self.user}" + f"/{self.type or 'art'}/{self.deviation_id}") else: - url = "{}/view/{}/".format(self.root, self.deviation_id) + url = f"{self.root}/view/{self.deviation_id}/" page = self._limited_request(url, notfound="deviation").text uuid = text.extr(page, '"deviationUuid\\":\\"', '\\') @@ -1379,7 +1359,7 @@ class DeviantartSearchExtractor(DeviantartExtractor): response = self.request(url, params=params) if response.history and "/users/login" in response.url: - raise exception.StopExtraction("HTTP redirect to login page") + raise exception.AbortExtraction("HTTP redirect to login page") page = response.text for dev in DeviantartDeviationExtractor.pattern.findall( @@ -1405,7 +1385,7 @@ class DeviantartGallerySearchExtractor(DeviantartExtractor): def __init__(self, match): DeviantartExtractor.__init__(self, match) - self.query = match.group(3) + self.query = match[3] def deviations(self): self.login() @@ -1437,7 +1417,7 @@ class DeviantartFollowingExtractor(DeviantartExtractor): api = DeviantartOAuthAPI(self) for user in api.user_friends(self.user): - url = "{}/{}".format(self.root, user["user"]["username"]) + url = f"{self.root}/{user['user']['username']}" user["_extractor"] = DeviantartUserExtractor yield Message.Queue, url, user @@ -1470,8 +1450,7 @@ class DeviantartOAuthAPI(): self.folders = extractor.config("folders", False) self.public = extractor.config("public", True) - client_id = extractor.config("client-id") - if client_id: + if client_id := extractor.config("client-id"): self.client_id = str(client_id) self.client_secret = extractor.config("client-secret") else: @@ -1585,7 +1564,7 @@ class DeviantartOAuthAPI(): def comments(self, target_id, target_type="deviation", comment_id=None, offset=0): """Fetch comments posted on a target""" - endpoint = "/comments/{}/{}".format(target_type, target_id) + endpoint = f"/comments/{target_type}/{target_id}" params = { "commentid" : comment_id, "maxdepth" : "5", @@ -1639,7 +1618,7 @@ class DeviantartOAuthAPI(): def deviation_metadata(self, deviations): """ Fetch deviation metadata for a set of deviations""" endpoint = "/deviation/metadata?" 
+ "&".join( - "deviationids[{}]={}".format(num, deviation["deviationid"]) + f"deviationids[{num}]={deviation['deviationid']}" for num, deviation in enumerate(deviations) ) return self._call( @@ -1746,8 +1725,8 @@ class DeviantartOAuthAPI(): if response.status_code != 200: self.log.debug("Server response: %s", data) - raise exception.AuthenticationError('"{}" ({})'.format( - data.get("error_description"), data.get("error"))) + raise exception.AuthenticationError( + f"\"{data.get('error_description')}\" ({data.get('error')})") if refresh_token_key: _refresh_token_cache.update( refresh_token_key, data["refresh_token"]) @@ -1790,8 +1769,7 @@ class DeviantartOAuthAPI(): raise exception.AuthorizationError() self.log.debug(response.text) - msg = "API responded with {} {}".format( - status, response.reason) + msg = f"API responded with {status} {response.reason}" if status == 429: if self.delay < 30: self.delay += 1 @@ -1889,12 +1867,9 @@ class DeviantartOAuthAPI(): params["offset"] = int(params["offset"]) + len(results) def _pagination_list(self, endpoint, params, key="results"): - result = [] - result.extend(self._pagination(endpoint, params, False, key=key)) - return result + return list(self._pagination(endpoint, params, False, key=key)) - @staticmethod - def _shared_content(results): + def _shared_content(self, results): """Return an iterable of shared deviations in 'results'""" for result in results: for item in result.get("items") or (): @@ -2075,7 +2050,7 @@ class DeviantartEclipseAPI(): params["offset"] = int(params["offset"]) + len(results) def _ids_watching(self, user): - url = "{}/{}/about".format(self.extractor.root, user) + url = f"{self.extractor.root}/{user}/about" page = self.request(url).text gruser_id = text.extr(page, ' data-userid="', '"') @@ -2083,8 +2058,7 @@ class DeviantartEclipseAPI(): pos = page.find('\\"name\\":\\"watching\\"') if pos < 0: raise exception.NotFoundError("'watching' module ID") - module_id = text.rextract( - page, '\\"id\\":', ',', pos)[0].strip('" ') + module_id = text.rextr(page, '\\"id\\":', ',', pos).strip('" ') self._fetch_csrf_token(page) return gruser_id, module_id diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py index 4559aff..85358ba 100644 --- a/gallery_dl/extractor/directlink.py +++ b/gallery_dl/extractor/directlink.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2023 Mike Fährmann +# Copyright 2017-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -24,9 +24,9 @@ class DirectlinkExtractor(Extractor): example = "https://en.wikipedia.org/static/images/project-logos/enwiki.png" def __init__(self, match): - Extractor.__init__(self, match) self.data = data = match.groupdict() self.subcategory = ".".join(data["domain"].rsplit(".", 2)[-2:]) + Extractor.__init__(self, match) def items(self): data = self.data diff --git a/gallery_dl/extractor/discord.py b/gallery_dl/extractor/discord.py index ac21fec..216e486 100644 --- a/gallery_dl/extractor/discord.py +++ b/gallery_dl/extractor/discord.py @@ -22,8 +22,6 @@ class DiscordExtractor(Extractor): filename_fmt = "{message_id}_{num:>02}_{filename}.{extension}" archive_fmt = "{message_id}_{num}" - cdn_fmt = "https://cdn.discordapp.com/{}/{}/{}.png?size=4096" - server_metadata = {} server_channels_metadata = {} @@ -86,44 +84,50 @@ class DiscordExtractor(Extractor): ): if message["author"].get(icon_type): message_metadata["author_files"].append({ - 
"url": self.cdn_fmt.format( - icon_path, - message_metadata["author_id"], - message["author"][icon_type] - ), + "url": (f"https://cdn.discordapp.com/{icon_path}/" + f"{message_metadata['author_id']}/" + f"{message['author'][icon_type]}.png" + f"?size=4096"), "filename": icon_type, "extension": "png", }) - for attachment in message["attachments"]: - message_metadata["files"].append({ - "url": attachment["url"], - "type": "attachment", - }) + message_snapshots = [message] + message_snapshots.extend( + msg["message"] for msg in message.get("message_snapshots", []) + if msg["message"]["type"] in (0, 19, 21) + ) + + for snapshot in message_snapshots: + for attachment in snapshot["attachments"]: + message_metadata["files"].append({ + "url": attachment["url"], + "type": "attachment", + }) - for embed in message["embeds"]: - if embed["type"] in self.enabled_embeds: - for field in ("video", "image", "thumbnail"): - if field not in embed: - continue - url = embed[field].get("proxy_url") - if url is not None: - message_metadata["files"].append({ - "url": url, - "type": "embed", - }) - break - - for num, file in enumerate(message_metadata["files"], start=1): - text.nameext_from_url(file["url"], file) - file["num"] = num - - yield Message.Directory, message_metadata - - for file in message_metadata["files"]: - message_metadata_file = message_metadata.copy() - message_metadata_file.update(file) - yield Message.Url, file["url"], message_metadata_file + for embed in snapshot["embeds"]: + if embed["type"] in self.enabled_embeds: + for field in ("video", "image", "thumbnail"): + if field not in embed: + continue + url = embed[field].get("proxy_url") + if url is not None: + message_metadata["files"].append({ + "url": url, + "type": "embed", + }) + break + + for num, file in enumerate(message_metadata["files"], start=1): + text.nameext_from_url(file["url"], file) + file["num"] = num + + yield Message.Directory, message_metadata + + for file in message_metadata["files"]: + message_metadata_file = message_metadata.copy() + message_metadata_file.update(file) + yield Message.Url, file["url"], message_metadata_file def extract_channel_text(self, channel_id): for message in self.api.get_channel_messages(channel_id): @@ -158,7 +162,7 @@ class DiscordExtractor(Extractor): yield from self.extract_channel( channel["channel_id"], safe=True) elif not safe: - raise exception.StopExtraction( + raise exception.AbortExtraction( "This channel type is not supported." ) except exception.HttpError as exc: @@ -215,11 +219,9 @@ class DiscordExtractor(Extractor): ): if server.get(icon_type): self.server_metadata["server_files"].append({ - "url": self.cdn_fmt.format( - icon_path, - self.server_metadata["server_id"], - server[icon_type] - ), + "url": (f"https://cdn.discordapp.com/{icon_path}/" + f"{self.server_metadata['server_id']}/" + f"{server[icon_type]}.png?size=4096"), "filename": icon_type, "extension": "png", }) @@ -342,7 +344,7 @@ class DiscordAPI(): "sort_order": "desc", "limit": THREADS_BATCH, "offset": + offset, - })["threads"] + }).get("threads", []) return self._pagination(_method, THREADS_BATCH) @@ -391,8 +393,7 @@ class DiscordAPI(): return offset += len(data) - @staticmethod - def _raise_invalid_token(): + def _raise_invalid_token(self): raise exception.AuthenticationError("""Invalid or missing token. 
Please provide a valid token following these instructions:
diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py
index 583869f..3e0424d 100644
--- a/gallery_dl/extractor/dynastyscans.py
+++ b/gallery_dl/extractor/dynastyscans.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2015-2023 Mike Fährmann
+# Copyright 2015-2025 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -10,7 +10,6 @@
 from .common import ChapterExtractor, MangaExtractor, Extractor, Message
 from .. import text, util
-import re
 
 BASE_PATTERN = r"(?:https?://)?(?:www\.)?dynasty-scans\.com"
 
@@ -21,7 +20,7 @@ class DynastyscansBase():
     root = "https://dynasty-scans.com"
 
     def _parse_image_page(self, image_id):
-        url = "{}/images/{}".format(self.root, image_id)
+        url = f"{self.root}/images/{image_id}"
         extr = text.extract_from(self.request(url).text)
 
         date = extr("class='create_at'>", "</span>")
@@ -47,20 +46,19 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
     def metadata(self, page):
         extr = text.extract_from(page)
-        match = re.match(
-            (r"(?:<a [^>]*>)?([^<]+)(?:</a>)?"  # manga name
-             r"(?: ch(\d+)([^:<]*))?"           # chapter info
-             r"(?:: (.+))?"),                   # title
-            extr("
<h3 id='chapter-title'><b>
", ""), - ) + match = util.re( + r"(?:]*>)?([^<]+)(?:)?" # manga name + r"(?: ch(\d+)([^:<]*))?" # chapter info + r"(?:: (.+))?" # title + ).match(extr("
<h3 id='chapter-title'><b>
", "")) author = extr(" by ", "") group = extr('"icon-print"> ', '') return { - "manga" : text.unescape(match.group(1)), - "chapter" : text.parse_int(match.group(2)), - "chapter_minor": match.group(3) or "", - "title" : text.unescape(match.group(4) or ""), + "manga" : text.unescape(match[1]), + "chapter" : text.parse_int(match[2]), + "chapter_minor": match[3] or "", + "title" : text.unescape(match[4] or ""), "author" : text.remove_html(author), "group" : (text.remove_html(group) or text.extr(group, ' alt="', '"')), @@ -104,7 +102,7 @@ class DynastyscansSearchExtractor(DynastyscansBase, Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.query = match.group(1) or "" + self.query = match[1] or "" def items(self): yield Message.Directory, {} @@ -133,3 +131,43 @@ class DynastyscansImageExtractor(DynastyscansSearchExtractor): def images(self): return (self.query,) + + +class DynastyscansAnthologyExtractor(DynastyscansSearchExtractor): + """Extractor for dynasty-scans anthologies""" + subcategory = "anthology" + pattern = BASE_PATTERN + r"/anthologies/([^/?#]+)" + example = "https://dynasty-scans.com/anthologies/TITLE" + + def items(self): + url = f"{self.root}/anthologies/{self.groups[0]}.atom" + root = self.request_xml(url, xmlns=False) + + data = { + "_extractor": DynastyscansChapterExtractor, + "anthology" : root[3].text[28:], + } + + if self.config("metadata", False): + page = self.request(url[:-5]).text + alert = text.extr(page, "
", "
") + + for element in root: + if element.tag != "entry": + continue + content = element[6][0] + data["author"] = content[0].text[8:] + data["scanlator"] = content[1].text[11:] + data["tags"] = content[2].text[6:].lower().split(", ") + data["title"] = element[5].text + data["date"] = text.parse_datetime( + element[1].text, "%Y-%m-%dT%H:%M:%S%z") + data["date_updated"] = text.parse_datetime( + element[2].text, "%Y-%m-%dT%H:%M:%S%z") + yield Message.Queue, element[4].text, data diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py index 76ea792..71c3b30 100644 --- a/gallery_dl/extractor/e621.py +++ b/gallery_dl/extractor/e621.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2023 Mike Fährmann +# Copyright 2014-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -24,8 +24,7 @@ class E621Extractor(danbooru.DanbooruExtractor): request_interval_min = 1.0 def items(self): - includes = self.config("metadata") or () - if includes: + if includes := self.config("metadata") or (): if isinstance(includes, str): includes = includes.split(",") elif not isinstance(includes, (list, tuple)): @@ -40,8 +39,8 @@ class E621Extractor(danbooru.DanbooruExtractor): if not file["url"]: md5 = file["md5"] - file["url"] = "https://static1.{}/data/{}/{}/{}.{}".format( - self.root[8:], md5[0:2], md5[2:4], md5, file["ext"]) + file["url"] = (f"https://static1.{self.root[8:]}/data" + f"/{md5[0:2]}/{md5[2:4]}/{md5}.{file['ext']}") if notes and post.get("has_notes"): post["notes"] = self._get_notes(post["id"]) @@ -60,13 +59,13 @@ class E621Extractor(danbooru.DanbooruExtractor): yield Message.Url, file["url"], post def _get_notes(self, id): - return self.request( - "{}/notes.json?search[post_id]={}".format(self.root, id)).json() + return self.request_json( + f"{self.root}/notes.json?search[post_id]={id}") @memcache(keyarg=1) def _get_pools(self, ids): - pools = self.request( - "{}/pools.json?search[id]={}".format(self.root, ids)).json() + pools = self.request_json( + f"{self.root}/pools.json?search[id]={ids}") for pool in pools: pool["name"] = pool["name"].replace("_", " ") return pools @@ -75,7 +74,7 @@ class E621Extractor(danbooru.DanbooruExtractor): BASE_PATTERN = E621Extractor.update({ "e621": { "root": "https://e621.net", - "pattern": r"e621\.net", + "pattern": r"e621\.(?:net|cc)", }, "e926": { "root": "https://e926.net", @@ -109,12 +108,11 @@ class E621PoolExtractor(E621Extractor, danbooru.DanbooruPoolExtractor): } posts = [] - append = posts.append for num, pid in enumerate(self.post_ids, 1): if pid in id_to_post: post = id_to_post[pid] post["num"] = num - append(post) + posts.append(post) else: self.log.warning("Post %s is unavailable", pid) return posts @@ -126,8 +124,8 @@ class E621PostExtractor(E621Extractor, danbooru.DanbooruPostExtractor): example = "https://e621.net/posts/12345" def posts(self): - url = "{}/posts/{}.json".format(self.root, self.groups[-1]) - return (self.request(url).json()["post"],) + url = f"{self.root}/posts/{self.groups[-1]}.json" + return (self.request_json(url)["post"],) class E621PopularExtractor(E621Extractor, danbooru.DanbooruPopularExtractor): diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py index 7582528..7beeac5 100644 --- a/gallery_dl/extractor/erome.py +++ b/gallery_dl/extractor/erome.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021-2023 Mike Fährmann +# Copyright 2021-2025 Mike Fährmann # # This program is 
free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -22,62 +22,20 @@ class EromeExtractor(Extractor): filename_fmt = "{album_id} {title} {num:>02}.{extension}" archive_fmt = "{album_id}_{num}" root = "https://www.erome.com" + _cookies = True def items(self): - self.__cookies = True + base = f"{self.root}/a/" + data = {"_extractor": EromeAlbumExtractor} for album_id in self.albums(): - url = "{}/a/{}".format(self.root, album_id) - - try: - page = self.request(url).text - except exception.HttpError as exc: - self.log.warning( - "Unable to fetch album '%s' (%s)", album_id, exc) - continue - - title, pos = text.extract( - page, 'property="og:title" content="', '"') - pos = page.index('
', pos) - - urls = [] - date = None - groups = page.split('
1: - date = text.parse_timestamp(ts) - - data = { - "album_id": album_id, - "title" : text.unescape(title), - "user" : text.unquote(user), - "count" : len(urls), - "date" : date, - "tags" : ([t.replace("+", " ") - for t in text.extract_iter(tags, "?q=", '"')] - if tags else ()), - "_http_headers": {"Referer": url}, - } - - yield Message.Directory, data - for data["num"], url in enumerate(urls, 1): - yield Message.Url, url, text.nameext_from_url(url, data) + yield Message.Queue, f"{base}{album_id}", data def albums(self): return () def request(self, url, **kwargs): - if self.__cookies: - self.__cookies = False + if self._cookies: + self._cookies = False self.cookies.update(_cookie_cache()) for _ in range(5): @@ -106,8 +64,52 @@ class EromeAlbumExtractor(EromeExtractor): pattern = BASE_PATTERN + r"/a/(\w+)" example = "https://www.erome.com/a/ID" - def albums(self): - return (self.groups[0],) + def items(self): + album_id = self.groups[0] + url = f"{self.root}/a/{album_id}" + + try: + page = self.request(url).text + except exception.HttpError as exc: + raise exception.AbortExtraction( + f"{album_id}: Unable to fetch album page ({exc})") + + title, pos = text.extract( + page, 'property="og:title" content="', '"') + pos = page.index('
', pos) + + urls = [] + date = None + groups = page.split('
1: + date = text.parse_timestamp(ts) + + data = { + "album_id": album_id, + "title" : text.unescape(title), + "user" : text.unquote(user), + "count" : len(urls), + "date" : date, + "tags" : ([t.replace("+", " ") + for t in text.extract_iter(tags, "?q=", '"')] + if tags else ()), + "_http_headers": {"Referer": url}, + } + + yield Message.Directory, data + for data["num"], url in enumerate(urls, 1): + yield Message.Url, url, text.nameext_from_url(url, data) class EromeUserExtractor(EromeExtractor): @@ -116,7 +118,7 @@ class EromeUserExtractor(EromeExtractor): example = "https://www.erome.com/USER" def albums(self): - url = "{}/{}".format(self.root, self.groups[0]) + url = f"{self.root}/{self.groups[0]}" return self._pagination(url, {}) diff --git a/gallery_dl/extractor/everia.py b/gallery_dl/extractor/everia.py index 3bf0a74..787786e 100644 --- a/gallery_dl/extractor/everia.py +++ b/gallery_dl/extractor/everia.py @@ -7,8 +7,7 @@ """Extractors for https://everia.club""" from .common import Extractor, Message -from .. import text -import re +from .. import text, util BASE_PATTERN = r"(?:https?://)?everia\.club" @@ -26,13 +25,13 @@ class EveriaExtractor(Extractor): return self._pagination(self.groups[0]) def _pagination(self, path, params=None, pnum=1): - find_posts = re.compile(r'thumbnail">\s*\s*= 300: @@ -50,16 +49,16 @@ class EveriaPostExtractor(EveriaExtractor): example = "https://everia.club/0000/00/00/TITLE" def items(self): - url = self.root + self.groups[0] + url = self.root + self.groups[0] + "/" page = self.request(url).text content = text.extr(page, 'itemprop="text">', "', "', "")), - "post_url": url, + "post_url": text.unquote(url), "post_category": text.extr( page, "post-in-category-", " ").capitalize(), "count": len(urls), @@ -67,6 +66,7 @@ class EveriaPostExtractor(EveriaExtractor): yield Message.Directory, data for data["num"], url in enumerate(urls, 1): + url = text.unquote(url) yield Message.Url, url, text.nameext_from_url(url, data) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index e7ba78e..f147959 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2023 Mike Fährmann +# Copyright 2014-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -34,7 +34,7 @@ class ExhentaiExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.version = match.group(1) + self.version = match[1] def initialize(self): domain = self.config("domain", "auto") @@ -59,7 +59,7 @@ class ExhentaiExtractor(Extractor): def login(self): """Login and set necessary cookies""" if self.LIMIT: - raise exception.StopExtraction("Image limit reached!") + raise exception.AbortExtraction("Image limit reached!") if self.cookies_check(self.cookies_names): return @@ -122,10 +122,10 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): def __init__(self, match): ExhentaiExtractor.__init__(self, match) - self.gallery_id = text.parse_int(match.group(2) or match.group(5)) - self.gallery_token = match.group(3) - self.image_token = match.group(4) - self.image_num = text.parse_int(match.group(6), 1) + self.gallery_id = text.parse_int(match[2] or match[5]) + self.gallery_token = match[3] + self.image_token = match[4] + self.image_num = text.parse_int(match[6], 1) self.key_start = None self.key_show = None self.key_next = None @@ -136,11 +136,13 @@ class 
ExhentaiGalleryExtractor(ExhentaiExtractor): source = self.config("source") if source == "hitomi": self.items = self._items_hitomi + elif source == "metadata": + self.items = self._items_metadata limits = self.config("limits", False) if limits and limits.__class__ is int: self.limits = limits - self._remaining = 0 + self._limits_remaining = 0 else: self.limits = False @@ -176,7 +178,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self.image_token = text.extr(gpage, 'hentai.org/s/', '"') if not self.image_token: self.log.debug("Page content:\n%s", gpage) - raise exception.StopExtraction( + raise exception.AbortExtraction( "Failed to extract initial image token") ipage = self._image_page() else: @@ -184,7 +186,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): part = text.extr(ipage, 'hentai.org/g/', '"') if not part: self.log.debug("Page content:\n%s", ipage) - raise exception.StopExtraction( + raise exception.AbortExtraction( "Failed to extract gallery token") self.gallery_token = part.split("/")[1] gpage = self._gallery_page() @@ -198,11 +200,12 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): for url, image in images: data.update(image) if self.limits: - self._check_limits(data) + self._limits_check(data) if "/fullimg" in url: data["_http_validate"] = self._validate_response else: data["_http_validate"] = None + data["_http_signature"] = self._validate_signature yield Message.Url, url, data fav = self.config("fav") @@ -218,10 +221,13 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): data = {} from .hitomi import HitomiGalleryExtractor - url = "https://hitomi.la/galleries/{}.html".format(self.gallery_id) + url = f"https://hitomi.la/galleries/{self.gallery_id}.html" data["_extractor"] = HitomiGalleryExtractor yield Message.Queue, url, data + def _items_metadata(self): + yield Message.Directory, self.metadata_from_api() + def get_metadata(self, page): """Extract gallery metadata""" data = self.metadata_from_page(page) @@ -240,8 +246,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): def metadata_from_page(self, page): extr = text.extract_from(page) - api_url = extr('var api_url = "', '"') - if api_url: + if api_url := extr('var api_url = "', '"'): self.api_url = api_url data = { @@ -293,9 +298,9 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): "namespace": 1, } - data = self.request(self.api_url, method="POST", json=data).json() + data = self.request_json(self.api_url, method="POST", json=data) if "error" in data: - raise exception.StopExtraction(data["error"]) + raise exception.AbortExtraction(data["error"]) return data["gmetadata"][0] @@ -320,8 +325,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): data["_fallback"] = self._fallback_1280(nl, self.image_num) except IndexError: self.log.debug("Page content:\n%s", page) - raise exception.StopExtraction( - "Unable to parse image info for '%s'", url) + raise exception.AbortExtraction( + f"Unable to parse image info for '{url}'") data["num"] = self.image_num data["image_token"] = self.key_start = extr('var startkey="', '";') @@ -345,7 +350,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): } for request["page"] in range(self.image_num + 1, self.count + 1): - page = self.request(api_url, method="POST", json=request).json() + page = self.request_json(api_url, method="POST", json=request) i3 = page["i3"] i6 = page["i6"] @@ -371,8 +376,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): nl, request["page"], imgkey) except IndexError: self.log.debug("Page content:\n%s", page) - raise 
exception.StopExtraction( - "Unable to parse image info for '%s'", url) + raise exception.AbortExtraction( + f"Unable to parse image info for '{url}'") data["num"] = request["page"] data["image_token"] = imgkey @@ -385,66 +390,106 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): request["imgkey"] = nextkey def _validate_response(self, response): - if not response.history and response.headers.get( + if response.history or not response.headers.get( "content-type", "").startswith("text/html"): - page = response.text - self.log.warning("'%s'", page) - - if " requires GP" in page: - gp = self.config("gp") - if gp == "stop": - raise exception.StopExtraction("Not enough GP") - elif gp == "wait": - input("Press ENTER to continue.") - return response.url - - self.log.info("Falling back to non-original downloads") - self.original = False - return self.data["_url_1280"] - - if " temporarily banned " in page: - raise exception.AuthorizationError("Temporarily Banned") - - self._report_limits() - return True - - def _report_limits(self): - ExhentaiExtractor.LIMIT = True - raise exception.StopExtraction("Image limit reached!") - - def _check_limits(self, data): - if not self._remaining or data["num"] % 25 == 0: - self._update_limits() - self._remaining -= data["cost"] - if self._remaining <= 0: - self._report_limits() - - def _check_509(self, url): - # full 509.gif URLs - # - https://exhentai.org/img/509.gif - # - https://ehgt.org/g/509.gif - if url.endswith(("hentai.org/img/509.gif", - "ehgt.org/g/509.gif")): - self.log.debug(url) - self._report_limits() + return True - def _update_limits(self): + page = response.text + self.log.warning("'%s'", page) + + if " requires GP" in page: + gp = self.config("gp") + if gp == "stop": + raise exception.AbortExtraction("Not enough GP") + elif gp == "wait": + self.input("Press ENTER to continue.") + return response.url + + self.log.info("Falling back to non-original downloads") + self.original = False + return self.data["_url_1280"] + + if " temporarily banned " in page: + raise exception.AuthorizationError("Temporarily Banned") + + self._limits_exceeded() + return response.url + + def _validate_signature(self, signature): + """Return False if all file signature bytes are zero""" + if signature: + if byte := signature[0]: + # 60 == b"<" + if byte == 60 and b"", "").replace(",", "") self.log.debug("Image Limits: %s/%s", current, self.limits) - self._remaining = self.limits - text.parse_int(current) + self._limits_remaining = self.limits - text.parse_int(current) + + return page + + def _check_509(self, url): + # full 509.gif URLs + # - https://exhentai.org/img/509.gif + # - https://ehgt.org/g/509.gif + if url.endswith(("hentai.org/img/509.gif", + "ehgt.org/g/509.gif")): + self.log.debug(url) + self._limits_exceeded() + + def _limits_exceeded(self): + msg = "Image limit exceeded!" 
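Note: the rewritten limit handling below recognizes three values for the new `limits-action` option: "stop" (the default, abort as before), "wait" (prompt, then re-read the remaining quota) and "reset" (POST `reset_imagelimit=Reset+Quota` to the home page). Presumably configured in the usual per-extractor location, e.g.:

    {
        "extractor": {
            "exhentai": {
                "limits": 5000,
                "limits-action": "wait"
            }
        }
    }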
+ action = self.config("limits-action") + + if not action or action == "stop": + ExhentaiExtractor.LIMIT = True + raise exception.AbortExtraction(msg) + + self.log.warning(msg) + if action == "wait": + self.input("Press ENTER to continue.") + self._limits_update() + elif action == "reset": + self._limits_reset() + else: + self.log.error("Invalid 'limits-action' value '%s'", action) + + def _limits_check(self, data): + if not self._limits_remaining or data["num"] % 25 == 0: + self._limits_update() + self._limits_remaining -= data["cost"] + if self._limits_remaining <= 0: + self._limits_exceeded() + + def _limits_reset(self): + self.log.info("Resetting image limits") + self._request_home( + method="POST", + headers={"Content-Type": "application/x-www-form-urlencoded"}, + data=b"reset_imagelimit=Reset+Quota") + + _limits_update = _request_home def _gallery_page(self): - url = "{}/g/{}/{}/".format( - self.root, self.gallery_id, self.gallery_token) + url = f"{self.root}/g/{self.gallery_id}/{self.gallery_token}/" response = self.request(url, fatal=False) page = response.text @@ -457,8 +502,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): return page def _image_page(self): - url = "{}/s/{}/{}-{}".format( - self.root, self.image_token, self.gallery_id, self.image_num) + url = (f"{self.root}/s/{self.image_token}" + f"/{self.gallery_id}-{self.image_num}") page = self.request(url, fatal=False).text if page.startswith(("Invalid page", "Keep trying")): @@ -466,7 +511,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): return page def _fallback_original(self, nl, fullimg): - url = "{}?nl={}".format(fullimg, nl) + url = f"{fullimg}?nl={nl}" for _ in util.repeat(self.fallback_retries): yield url @@ -475,8 +520,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): token = self.key_start for _ in util.repeat(self.fallback_retries): - url = "{}/s/{}/{}-{}?nl={}".format( - self.root, token, self.gallery_id, num, nl) + url = f"{self.root}/s/{token}/{self.gallery_id}-{num}?nl={nl}" page = self.request(url, fatal=False).text if page.startswith(("Invalid page", "Keep trying")): @@ -486,8 +530,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): nl = data["_nl"] - @staticmethod - def _parse_image_info(url): + def _parse_image_info(self, url): for part in url.split("/")[4:]: try: _, size, width, height, _ = part.split("-") @@ -504,8 +547,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): "height": text.parse_int(height), } - @staticmethod - def _parse_original_info(info): + def _parse_original_info(self, info): parts = info.lstrip().split(" ") size = text.parse_bytes(parts[3] + parts[4][0]) @@ -527,11 +569,11 @@ class ExhentaiSearchExtractor(ExhentaiExtractor): def __init__(self, match): ExhentaiExtractor.__init__(self, match) - _, query, tag = match.groups() + _, query, tag = self.groups if tag: if "+" in tag: ns, _, tag = tag.rpartition(":") - tag = '{}:"{}$"'.format(ns, tag.replace("+", " ")) + tag = f"{ns}:\"{tag.replace('+', ' ')}$\"" else: tag += "$" self.params = {"f_search": tag, "page": 0} @@ -553,13 +595,13 @@ class ExhentaiSearchExtractor(ExhentaiExtractor): last = None page = self.request(search_url, params=params).text - for gallery in ExhentaiGalleryExtractor.pattern.finditer(page): - url = gallery.group(0) + for match in ExhentaiGalleryExtractor.pattern.finditer(page): + url = match[0] if url == last: continue last = url - data["gallery_id"] = text.parse_int(gallery.group(2)) - data["gallery_token"] = gallery.group(3) + data["gallery_id"] = text.parse_int(match[2]) + 
data["gallery_token"] = match[3] yield Message.Queue, url + "/", data next_url = text.extr(page, 'nexturl="', '"', None) diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py index b284ee8..069ed99 100644 --- a/gallery_dl/extractor/facebook.py +++ b/gallery_dl/extractor/facebook.py @@ -6,10 +6,14 @@ """Extractors for https://www.facebook.com/""" -from .common import Extractor, Message +from .common import Extractor, Message, Dispatch from .. import text, exception +from ..cache import memcache BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?facebook\.com" +USER_PATTERN = (BASE_PATTERN + + r"/(?!media/|photo/|photo.php|watch/)" + r"(?:profile\.php\?id=|people/[^/?#]+/)?([^/?&#]+)") class FacebookExtractor(Extractor): @@ -20,9 +24,6 @@ class FacebookExtractor(Extractor): filename_fmt = "{id}.{extension}" archive_fmt = "{id}.{extension}" - set_url_fmt = root + "/media/set/?set={set_id}" - photo_url_fmt = root + "/photo/?fbid={photo_id}&set={set_id}" - def _init(self): headers = self.session.headers headers["Accept"] = ( @@ -37,22 +38,20 @@ class FacebookExtractor(Extractor): self.videos = self.config("videos", True) self.author_followups = self.config("author-followups", False) - @staticmethod - def decode_all(txt): + def decode_all(self, txt): return text.unescape( txt.encode().decode("unicode_escape") .encode("utf_16", "surrogatepass").decode("utf_16") ).replace("\\/", "/") - @staticmethod - def parse_set_page(set_page): + def parse_set_page(self, set_page): directory = { "set_id": text.extr( set_page, '"mediaSetToken":"', '"' ) or text.extr( set_page, '"mediasetToken":"', '"' ), - "username": FacebookExtractor.decode_all( + "username": self.decode_all( text.extr( set_page, '"user":{"__isProfile":"User","name":"', '","' ) or text.extr( @@ -62,7 +61,7 @@ class FacebookExtractor(Extractor): "user_id": text.extr( set_page, '"owner":{"__typename":"User","id":"', '"' ), - "title": FacebookExtractor.decode_all(text.extr( + "title": self.decode_all(text.extr( set_page, '"title":{"text":"', '"' )), "first_photo_id": text.extr( @@ -77,8 +76,7 @@ class FacebookExtractor(Extractor): return directory - @staticmethod - def parse_photo_page(photo_page): + def parse_photo_page(self, photo_page): photo = { "id": text.extr( photo_page, '"__isNode":"Photo","id":"', '"' @@ -88,13 +86,13 @@ class FacebookExtractor(Extractor): '"url":"https:\\/\\/www.facebook.com\\/photo\\/?fbid=', '"' ).rsplit("&set=", 1)[-1], - "username": FacebookExtractor.decode_all(text.extr( + "username": self.decode_all(text.extr( photo_page, '"owner":{"__typename":"User","name":"', '"' )), "user_id": text.extr( photo_page, '"owner":{"__typename":"User","id":"', '"' ), - "caption": FacebookExtractor.decode_all(text.extr( + "caption": self.decode_all(text.extr( photo_page, '"message":{"delight_ranges"', '"},"message_preferred_body"' @@ -103,7 +101,7 @@ class FacebookExtractor(Extractor): text.extr(photo_page, '\\"publish_time\\":', ',') or text.extr(photo_page, '"created_time":', ',') ), - "url": FacebookExtractor.decode_all(text.extr( + "url": self.decode_all(text.extr( photo_page, ',"image":{"uri":"', '","' )), "next_photo_id": text.extr( @@ -133,8 +131,7 @@ class FacebookExtractor(Extractor): return photo - @staticmethod - def parse_post_page(post_page): + def parse_post_page(self, post_page): first_photo_url = text.extr( text.extr( post_page, '"__isMedia":"Photo"', '"target_group"' @@ -148,13 +145,12 @@ class FacebookExtractor(Extractor): return post - @staticmethod - def parse_video_page(video_page): + def 
parse_video_page(self, video_page): video = { "id": text.extr( video_page, '\\"video_id\\":\\"', '\\"' ), - "username": FacebookExtractor.decode_all(text.extr( + "username": self.decode_all(text.extr( video_page, '"actors":[{"__typename":"User","name":"', '","' )), "user_id": text.extr( @@ -167,7 +163,7 @@ class FacebookExtractor(Extractor): } if not video["username"]: - video["username"] = FacebookExtractor.decode_all(text.extr( + video["username"] = self.decode_all(text.extr( video_page, '"__typename":"User","id":"' + video["user_id"] + '","name":"', '","' @@ -179,7 +175,7 @@ class FacebookExtractor(Extractor): audio = { **video, - "url": FacebookExtractor.decode_all(text.extr( + "url": self.decode_all(text.extr( text.extr( first_video_raw, "AudioChannelConfiguration", @@ -196,7 +192,7 @@ class FacebookExtractor(Extractor): first_video_raw, 'FBQualityLabel=\\"', '\\u003C\\/BaseURL>' ): resolution = raw_url.split('\\"', 1)[0] - video["urls"][resolution] = FacebookExtractor.decode_all( + video["urls"][resolution] = self.decode_all( raw_url.split('BaseURL>', 1)[1] ) @@ -224,17 +220,16 @@ class FacebookExtractor(Extractor): res = self.request(url, **kwargs) if res.url.startswith(self.root + "/login"): - raise exception.AuthenticationError( - "You must be logged in to continue viewing images." + - LEFT_OFF_TXT + raise exception.AuthRequired( + message=(f"You must be logged in to continue viewing images." + f"{LEFT_OFF_TXT}") ) if b'{"__dr":"CometErrorRoot.react"}' in res.content: - raise exception.StopExtraction( - "You've been temporarily blocked from viewing images. " - "\nPlease try using a different account, " - "using a VPN or waiting before you retry." + - LEFT_OFF_TXT + raise exception.AbortExtraction( + f"You've been temporarily blocked from viewing images.\n" + f"Please try using a different account, " + f"using a VPN or waiting before you retry.{LEFT_OFF_TXT}" ) return res @@ -248,9 +243,7 @@ class FacebookExtractor(Extractor): while i < len(all_photo_ids): photo_id = all_photo_ids[i] - photo_url = self.photo_url_fmt.format( - photo_id=photo_id, set_id=set_id - ) + photo_url = f"{self.root}/photo/?fbid={photo_id}&set={set_id}" photo_page = self.photo_page_request_wrapper(photo_url).text photo = self.parse_photo_page(photo_page) @@ -302,6 +295,36 @@ class FacebookExtractor(Extractor): i += 1 + @memcache(keyarg=1) + def _extract_profile_photos_page(self, profile): + profile_photos_url = f"{self.root}/{profile}/photos_by" + + for _ in range(self.fallback_retries + 1): + profile_photos_page = self.request(profile_photos_url).text + if set_id := self._extract_profile_set_id(profile_photos_page): + break + self.log.debug("Got empty profile photos page, retrying...") + else: + raise exception.AbortExtraction("Failed to extract profile data") + + avatar_page_url = text.extr( + profile_photos_page, ',"profilePhoto":{"url":"', '"') + + return set_id, avatar_page_url.replace("\\/", "/") + + def _extract_profile_set_id(self, profile_photos_page): + set_ids_raw = text.extr( + profile_photos_page, '"pageItems"', '"page_info"' + ) + + set_id = text.extr( + set_ids_raw, 'set=', '"' + ).rsplit("&", 1)[0] or text.extr( + set_ids_raw, '\\/photos\\/', '\\/' + ) + + return set_id + class FacebookSetExtractor(FacebookExtractor): """Base class for Facebook Set extractors""" @@ -317,13 +340,12 @@ class FacebookSetExtractor(FacebookExtractor): def items(self): set_id = self.groups[0] or self.groups[3] - path = self.groups[1] - if path: + if path := self.groups[1]: post_url = self.root + "/" + path post_page 
= self.request(post_url).text
            set_id = self.parse_post_page(post_page)["set_id"]

-        set_url = self.set_url_fmt.format(set_id=set_id)
+        set_url = f"{self.root}/media/set/?set={set_id}"
         set_page = self.request(set_url).text
         set_data = self.parse_set_page(set_page)
         if self.groups[2]:
@@ -342,16 +364,15 @@ class FacebookPhotoExtractor(FacebookExtractor):

     def items(self):
         photo_id = self.groups[0]
-        photo_url = self.photo_url_fmt.format(photo_id=photo_id, set_id="")
+        photo_url = f"{self.root}/photo/?fbid={photo_id}&set="
         photo_page = self.photo_page_request_wrapper(photo_url).text

         i = 1
         photo = self.parse_photo_page(photo_page)
         photo["num"] = i

-        set_page = self.request(
-            self.set_url_fmt.format(set_id=photo["set_id"])
-        ).text
+        set_url = f"{self.root}/media/set/?set={photo['set_id']}"
+        set_page = self.request(set_url).text

         directory = self.parse_set_page(set_page)

@@ -362,9 +383,7 @@ class FacebookPhotoExtractor(FacebookExtractor):
         for comment_photo_id in photo["followups_ids"]:
             comment_photo = self.parse_photo_page(
                 self.photo_page_request_wrapper(
-                    self.photo_url_fmt.format(
-                        photo_id=comment_photo_id, set_id=""
-                    )
+                    f"{self.root}/photo/?fbid={comment_photo_id}&set="
                 ).text
             )
             i += 1
@@ -399,44 +418,50 @@ class FacebookVideoExtractor(FacebookExtractor):
             yield Message.Url, audio["url"], audio


-class FacebookProfileExtractor(FacebookExtractor):
-    """Base class for Facebook Profile Photos Set extractors"""
-    subcategory = "profile"
-    pattern = (
-        BASE_PATTERN +
-        r"/(?!media/|photo/|photo.php|watch/)"
-        r"(?:profile\.php\?id=|people/[^/?#]+/)?"
-        r"([^/?&#]+)(?:/photos(?:_by)?|/videos|/posts)?/?(?:$|\?|#)"
-    )
-    example = "https://www.facebook.com/USERNAME"
+class FacebookPhotosExtractor(FacebookExtractor):
+    """Extractor for Facebook Profile Photos"""
+    subcategory = "photos"
+    pattern = USER_PATTERN + r"/photos(?:_by)?"
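A minimal standalone sketch of the photo/set URL forms the hunk above switches to; the root and the IDs below are hypothetical values, not taken from the patch.

    root = "https://www.facebook.com"
    photo_id, set_id = "1234567890", "a.123"
    photo_url = f"{root}/photo/?fbid={photo_id}&set={set_id}"  # single photo page
    set_url = f"{root}/media/set/?set={set_id}"                # photo set page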
+ example = "https://www.facebook.com/USERNAME/photos" - @staticmethod - def get_profile_photos_set_id(profile_photos_page): - set_ids_raw = text.extr( - profile_photos_page, '"pageItems"', '"page_info"' - ) + def items(self): + set_id = self._extract_profile_photos_page(self.groups[0])[0] + set_url = f"{self.root}/media/set/?set={set_id}" + set_page = self.request(set_url).text + set_data = self.parse_set_page(set_page) + return self.extract_set(set_data) - set_id = text.extr( - set_ids_raw, 'set=', '"' - ).rsplit("&", 1)[0] or text.extr( - set_ids_raw, '\\/photos\\/', '\\/' - ) - return set_id +class FacebookAvatarExtractor(FacebookExtractor): + """Extractor for Facebook Profile Avatars""" + subcategory = "avatar" + pattern = USER_PATTERN + r"/avatar" + example = "https://www.facebook.com/USERNAME/avatar" def items(self): - profile_photos_url = ( - self.root + "/" + self.groups[0] + "/photos_by" - ) - profile_photos_page = self.request(profile_photos_url).text + avatar_page_url = self._extract_profile_photos_page(self.groups[0])[1] + avatar_page = self.photo_page_request_wrapper(avatar_page_url).text - set_id = self.get_profile_photos_set_id(profile_photos_page) + avatar = self.parse_photo_page(avatar_page) + avatar["count"] = avatar["num"] = 1 + avatar["type"] = "avatar" - if set_id: - set_url = self.set_url_fmt.format(set_id=set_id) - set_page = self.request(set_url).text - set_data = self.parse_set_page(set_page) - return self.extract_set(set_data) + set_url = f"{self.root}/media/set/?set={avatar['set_id']}" + set_page = self.request(set_url).text + directory = self.parse_set_page(set_page) - self.log.debug("Profile photos set ID not found.") - return iter(()) + yield Message.Directory, directory + yield Message.Url, avatar["url"], avatar + + +class FacebookUserExtractor(Dispatch, FacebookExtractor): + """Extractor for Facebook Profiles""" + pattern = USER_PATTERN + r"/?(?:$|\?|#)" + example = "https://www.facebook.com/USERNAME" + + def items(self): + base = f"{self.root}/{self.groups[0]}/" + return self._dispatch_extractors(( + (FacebookAvatarExtractor, base + "avatar"), + (FacebookPhotosExtractor, base + "photos"), + ), ("photos",)) diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py index 8981c29..70b06e7 100644 --- a/gallery_dl/extractor/fanbox.py +++ b/gallery_dl/extractor/fanbox.py @@ -7,9 +7,8 @@ """Extractors for https://www.fanbox.cc/""" from .common import Extractor, Message -from .. import text +from .. 
import text
+from .. import text, util
 from ..cache import memcache
-import re

 BASE_PATTERN = r"(?:https?://)?(?:www\.)?fanbox\.cc"
 USER_PATTERN = (
@@ -41,8 +40,7 @@ class FanboxExtractor(Extractor):
         }
         self.embeds = self.config("embeds", True)

-        includes = self.config("metadata")
-        if includes:
+        if includes := self.config("metadata"):
             if isinstance(includes, str):
                 includes = includes.split(",")
             elif not isinstance(includes, (list, tuple)):
@@ -62,7 +60,23 @@ class FanboxExtractor(Extractor):
             FanboxExtractor._warning = False

     def items(self):
-        for content_body, post in self.posts():
+        fee_max = self.config("fee-max")
+
+        for item in self.posts():
+            if fee_max is not None and fee_max < item["feeRequired"]:
+                self.log.warning("Skipping post %s (feeRequired of %s > %s)",
+                                 item["id"], item["feeRequired"], fee_max)
+                continue
+
+            try:
+                url = "https://api.fanbox.cc/post.info?postId=" + item["id"]
+                body = self.request_json(url, headers=self.headers)["body"]
+                content_body, post = self._extract_post(body)
+            except Exception as exc:
+                self.log.warning("Skipping post %s (%s: %s)",
+                                 item["id"], exc.__class__.__name__, exc)
+                continue
+
             yield Message.Directory, post
             yield from self._get_urls_from_post(content_body, post)

@@ -72,22 +86,17 @@ class FanboxExtractor(Extractor):
     def _pagination(self, url):
         while url:
             url = text.ensure_http_scheme(url)
-            body = self.request(url, headers=self.headers).json()["body"]
-            for item in body["items"]:
-                try:
-                    yield self._get_post_data(item["id"])
-                except Exception as exc:
-                    self.log.warning("Skipping post %s (%s: %s)",
-                                     item["id"], exc.__class__.__name__, exc)
+            body = self.request_json(url, headers=self.headers)["body"]
+
+            yield from body["items"]
+
             url = body["nextUrl"]

-    def _get_post_data(self, post_id):
+    def _extract_post(self, post):
         """Fetch and process post data"""
-        url = "https://api.fanbox.cc/post.info?postId="+post_id
-        post = self.request(url, headers=self.headers).json()["body"]
+        post["archives"] = ()

-        content_body = post.pop("body", None)
-        if content_body:
+        if content_body := post.pop("body", None):
             if "html" in content_body:
                 post["html"] = content_body["html"]
             if post["type"] == "article":
@@ -95,29 +104,30 @@
                 if "blocks" in content_body:
                     content = []  # text content
                     images = []  # image IDs in 'body' order
+                    files = []  # file IDs in 'body' order

-                    append = content.append
-                    append_img = images.append
                     for block in content_body["blocks"]:
                         if "text" in block:
-                            append(block["text"])
+                            content.append(block["text"])
                         if "links" in block:
                             for link in block["links"]:
-                                append(link["url"])
+                                content.append(link["url"])
                         if "imageId" in block:
-                            append_img(block["imageId"])
-
-                    if images and "imageMap" in content_body:
-                        # reorder 'imageMap' (#2718)
-                        image_map = content_body["imageMap"]
-                        content_body["imageMap"] = {
-                            image_id: image_map[image_id]
-                            for image_id in images
-                            if image_id in image_map
-                        }
+                            images.append(block["imageId"])
+                        if "fileId" in block:
+                            files.append(block["fileId"])

                     post["content"] = "\n".join(content)

+                    self._sort_map(content_body, "imageMap", images)
+                    if file_map := self._sort_map(content_body, "fileMap", files):
+                        exts = util.EXTS_ARCHIVE
+                        post["archives"] = [
+                            file
+                            for file in file_map.values()
+                            if file.get("extension", "").lower() in exts
+                        ]
+
         post["date"] = text.parse_datetime(post["publishedDatetime"])
         post["text"] = content_body.get("text") if content_body else None
         post["isCoverImage"] = False
@@ -130,8 +140,7 @@ class FanboxExtractor(Extractor):
             try:
                 post["plan"] = plans[fee]
             except KeyError:
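A minimal sketch of the new "fee-max" post filter added in items() above, isolated from the extractor; the option value and sample posts here are hypothetical.

    fee_max = 500  # e.g. the configured "fee-max" value
    posts = [{"id": "1", "feeRequired": 0}, {"id": "2", "feeRequired": 800}]
    kept = [p for p in posts if fee_max is None or p["feeRequired"] <= fee_max]
    # kept == [{"id": "1", "feeRequired": 0}]; post "2" is skipped with a warning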
- fees = [f for f in plans if f >= fee] - if fees: + if fees := [f for f in plans if f >= fee]: plan = plans[min(fees)] else: plan = plans[0].copy() @@ -139,17 +148,30 @@ class FanboxExtractor(Extractor): post["plan"] = plans[fee] = plan if self._meta_comments: if post["commentCount"]: - post["comments"] = list(self._get_comment_data(post_id)) + post["comments"] = list(self._get_comment_data(post["id"])) else: post["commentd"] = () return content_body, post + def _sort_map(self, body, key, ids): + orig = body.get(key) + if not orig: + return {} if orig is None else orig + + body[key] = new = { + id: orig[id] + for id in ids + if id in orig + } + + return new + @memcache(keyarg=1) def _get_user_data(self, creator_id): url = "https://api.fanbox.cc/creator.get" params = {"creatorId": creator_id} - data = self.request(url, params=params, headers=self.headers).json() + data = self.request_json(url, params=params, headers=self.headers) user = data["body"] user.update(user.pop("user")) @@ -160,7 +182,7 @@ class FanboxExtractor(Extractor): def _get_plan_data(self, creator_id): url = "https://api.fanbox.cc/plan.listCreator" params = {"creatorId": creator_id} - data = self.request(url, params=params, headers=self.headers).json() + data = self.request_json(url, params=params, headers=self.headers) plans = {0: { "id" : "", @@ -185,7 +207,7 @@ class FanboxExtractor(Extractor): comments = [] while url: url = text.ensure_http_scheme(url) - body = self.request(url, headers=self.headers).json()["body"] + body = self.request_json(url, headers=self.headers)["body"] data = body["commentList"] comments.extend(data["items"]) url = data["nextUrl"] @@ -193,9 +215,8 @@ class FanboxExtractor(Extractor): def _get_urls_from_post(self, content_body, post): num = 0 - cover_image = post.get("coverImageUrl") - if cover_image: - cover_image = re.sub("/c/[0-9a-z_]+", "", cover_image) + if cover_image := post.get("coverImageUrl"): + cover_image = util.re("/c/[0-9a-z_]+").sub("", cover_image) final_post = post.copy() final_post["isCoverImage"] = True final_post["fileUrl"] = cover_image @@ -313,10 +334,10 @@ class FanboxExtractor(Extractor): elif provider == "twitter": url = "https://twitter.com/_/status/"+content_id elif provider == "google_forms": - templ = "https://docs.google.com/forms/d/e/{}/viewform?usp=sf_link" - url = templ.format(content_id) + url = (f"https://docs.google.com/forms/d/e/" + f"{content_id}/viewform?usp=sf_link") else: - self.log.warning("service not recognized: {}".format(provider)) + self.log.warning(f"service not recognized: {provider}") if url: final_post["embed"] = embed @@ -334,25 +355,16 @@ class FanboxCreatorExtractor(FanboxExtractor): pattern = USER_PATTERN + r"(?:/posts)?/?$" example = "https://USER.fanbox.cc/" - def __init__(self, match): - FanboxExtractor.__init__(self, match) - self.creator_id = match.group(1) or match.group(2) - def posts(self): url = "https://api.fanbox.cc/post.paginateCreator?creatorId=" - return self._pagination_creator(url + self.creator_id) + creator_id = self.groups[0] or self.groups[1] + return self._pagination_creator(url + creator_id) def _pagination_creator(self, url): - urls = self.request(url, headers=self.headers).json()["body"] + urls = self.request_json(url, headers=self.headers)["body"] for url in urls: url = text.ensure_http_scheme(url) - body = self.request(url, headers=self.headers).json()["body"] - for item in body: - try: - yield self._get_post_data(item["id"]) - except Exception as exc: - self.log.warning("Skipping post %s (%s: %s)", - item["id"], 
exc.__class__.__name__, exc) + yield from self.request_json(url, headers=self.headers)["body"] class FanboxPostExtractor(FanboxExtractor): @@ -361,12 +373,8 @@ class FanboxPostExtractor(FanboxExtractor): pattern = USER_PATTERN + r"/posts/(\d+)" example = "https://USER.fanbox.cc/posts/12345" - def __init__(self, match): - FanboxExtractor.__init__(self, match) - self.post_id = match.group(3) - def posts(self): - return (self._get_post_data(self.post_id),) + return ({"id": self.groups[2], "feeRequired": 0},) class FanboxHomeExtractor(FanboxExtractor): diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py index 6218f19..e32a86b 100644 --- a/gallery_dl/extractor/fantia.py +++ b/gallery_dl/extractor/fantia.py @@ -93,7 +93,7 @@ class FantiaExtractor(Extractor): def _get_post_data(self, post_id): """Fetch and process post data""" url = self.root+"/api/v1/posts/"+post_id - resp = self.request(url, headers=self.headers).json()["post"] + resp = self.request_json(url, headers=self.headers)["post"] return { "post_id": resp["id"], "post_url": self.root + "/posts/" + str(resp["id"]), @@ -181,10 +181,10 @@ class FantiaCreatorExtractor(FantiaExtractor): def __init__(self, match): FantiaExtractor.__init__(self, match) - self.creator_id = match.group(1) + self.creator_id = match[1] def posts(self): - url = "{}/fanclubs/{}/posts".format(self.root, self.creator_id) + url = f"{self.root}/fanclubs/{self.creator_id}/posts" return self._pagination(url) @@ -196,7 +196,7 @@ class FantiaPostExtractor(FantiaExtractor): def __init__(self, match): FantiaExtractor.__init__(self, match) - self.post_id = match.group(1) + self.post_id = match[1] def posts(self): self._csrf_token() diff --git a/gallery_dl/extractor/fapachi.py b/gallery_dl/extractor/fapachi.py index 43627e2..7ff71b0 100644 --- a/gallery_dl/extractor/fapachi.py +++ b/gallery_dl/extractor/fapachi.py @@ -31,8 +31,7 @@ class FapachiPostExtractor(Extractor): "user": self.user, "id" : self.id, } - page = self.request("{}/{}/media/{}".format( - self.root, self.user, self.id)).text + page = self.request(f"{self.root}/{self.user}/media/{self.id}").text url = self.root + text.extract( page, 'data-src="', '"', page.index('class="media-img'))[0] yield Message.Directory, data @@ -50,17 +49,16 @@ class FapachiUserExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.user = match.group(1) - self.num = text.parse_int(match.group(2), 1) + self.user = match[1] + self.num = text.parse_int(match[2], 1) def items(self): data = {"_extractor": FapachiPostExtractor} while True: - page = self.request("{}/{}/page/{}".format( - self.root, self.user, self.num)).text + url = f"{self.root}/{self.user}/page/{self.num}" + page = self.request(url).text for post in text.extract_iter(page, 'model-media-prew">', ">"): - path = text.extr(post, 'Next page' not in page: diff --git a/gallery_dl/extractor/fapello.py b/gallery_dl/extractor/fapello.py index cf18edc..b961cbe 100644 --- a/gallery_dl/extractor/fapello.py +++ b/gallery_dl/extractor/fapello.py @@ -25,11 +25,11 @@ class FapelloPostExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.root = text.root_from_url(match.group(0)) + self.root = text.root_from_url(match[0]) self.model, self.id = match.groups() def items(self): - url = "{}/{}/{}/".format(self.root, self.model, self.id) + url = f"{self.root}/{self.model}/{self.id}/" page = text.extr( self.request(url, allow_redirects=False).text, 'class="uk-align-center"', "
", None) @@ -59,15 +59,14 @@ class FapelloModelExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.root = text.root_from_url(match.group(0)) - self.model = match.group(1) + self.root = text.root_from_url(match[0]) + self.model = match[1] def items(self): num = 1 data = {"_extractor": FapelloPostExtractor} while True: - url = "{}/ajax/model/{}/page-{}/".format( - self.root, self.model, num) + url = f"{self.root}/ajax/model/{self.model}/page-{num}/" page = self.request(url).text if not page: return @@ -93,8 +92,8 @@ class FapelloPathExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.root = text.root_from_url(match.group(0)) - self.path = match.group(1) + self.root = text.root_from_url(match[0]) + self.path = match[1] def items(self): num = 1 @@ -109,8 +108,8 @@ class FapelloPathExtractor(Extractor): data = {"_extractor": FapelloModelExtractor} while True: - page = self.request("{}/ajax/{}/page-{}/".format( - self.root, self.path, num)).text + url = f"{self.root}/ajax/{self.path}/page-{num}/" + page = self.request(url).text if not page: return diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index eb68c3e..35263a3 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2023 Mike Fährmann +# Copyright 2017-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -134,8 +134,8 @@ class FlickrAlbumExtractor(FlickrExtractor): for album in self.api.photosets_getList(self.user["nsid"]): self.api._clean_info(album).update(data) - url = "https://www.flickr.com/photos/{}/albums/{}".format( - self.user["path_alias"], album["id"]) + url = (f"https://www.flickr.com/photos/{self.user['path_alias']}" + f"/albums/{album['id']}") yield Message.Queue, url, album def metadata(self): @@ -451,14 +451,13 @@ class FlickrAPI(oauth.OAuth1API): raise exception.AuthenticationError(msg) elif data["code"] == 99: raise exception.AuthorizationError(msg) - raise exception.StopExtraction("API request failed: %s", msg) + raise exception.AbortExtraction(f"API request failed: {msg}") return data def _pagination(self, method, params, key="photos"): extras = ("description,date_upload,tags,views,media," "path_alias,owner_name,") - includes = self.extractor.config("metadata") - if includes: + if includes := self.extractor.config("metadata"): if isinstance(includes, (list, tuple)): includes = ",".join(includes) elif not isinstance(includes, str): @@ -585,8 +584,7 @@ class FlickrAPI(oauth.OAuth1API): if "license" in photo: photo["license_name"] = self.LICENSES.get(photo["license"]) - @staticmethod - def _clean_info(info): + def _clean_info(self, info): info["title"] = info["title"]["_content"] info["description"] = info["description"]["_content"] return info diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 5f90afc..dc23488 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2023 Mike Fährmann +# Copyright 2019-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -26,6 +26,9 @@ class FoolfuukaExtractor(BaseExtractor): self.remote = self._remote_direct elif self.category == "archivedmoe": self.referer = False + 
self.fixup_redirect = True
+        else:
+            self.fixup_redirect = False

     def items(self):
         yield Message.Directory, self.metadata()
@@ -57,13 +60,45 @@ class FoolfuukaExtractor(BaseExtractor):
         """Resolve a remote media link"""
         page = self.request(media["remote_media_link"]).text
         url = text.extr(page, 'http-equiv="Refresh" content="0; url=', '"')
-        if url.endswith(".webm") and \
-                url.startswith("https://thebarchive.com/"):
-            return url[:-1]
+
+        if url.startswith("https://thebarchive.com/"):
+            # '.webm' -> '.web' (#5116)
+            if url.endswith(".webm"):
+                url = url[:-1]
+
+        elif self.fixup_redirect:
+            # update redirect domain or filename (#7652)
+            path, _, filename = url.rpartition("/")
+
+            # these boards link directly to i.4cdn.org
+            # -> redirect to warosu or 4plebs instead
+            board_domains = {
+                "3" : "warosu.org",
+                "biz": "warosu.org",
+                "ck" : "warosu.org",
+                "diy": "warosu.org",
+                "fa" : "warosu.org",
+                "ic" : "warosu.org",
+                "jp" : "warosu.org",
+                "lit": "warosu.org",
+                "sci": "warosu.org",
+                "tg" : "archive.4plebs.org",
+            }
+            board = url.split("/", 4)[3]
+            if board in board_domains:
+                domain = board_domains[board]
+                url = f"https://{domain}/{board}/full_image/{filename}"
+
+            # if it's one of these archives, slice the name
+            elif any(archive in path for archive in (
+                    "b4k.", "desuarchive.", "palanq.")):
+                name, _, ext = filename.rpartition(".")
+                if len(name) > 13:
+                    url = f"{path}/{name[:13]}.{ext}"
+
         return url

-    @staticmethod
-    def _remote_direct(media):
+    def _remote_direct(self, media):
         return media["remote_media_link"]

@@ -124,13 +159,12 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):
     def metadata(self):
         url = self.root + "/_/api/chan/thread/"
         params = {"board": self.board, "num": self.thread}
-        self.data = self.request(url, params=params).json()[self.thread]
+        self.data = self.request_json(url, params=params)[self.thread]
         return self.data["op"]

     def posts(self):
         op = (self.data["op"],)
-        posts = self.data.get("posts")
-        if posts:
+        if posts := self.data.get("posts"):
             posts = list(posts.values())
             posts.sort(key=lambda p: p["timestamp"])
             return itertools.chain(op, posts)
@@ -149,13 +183,12 @@ class FoolfuukaBoardExtractor(FoolfuukaExtractor):
         self.page = self.groups[-1]

     def items(self):
-        index_base = "{}/_/api/chan/index/?board={}&page=".format(
-            self.root, self.board)
-        thread_base = "{}/{}/thread/".format(self.root, self.board)
+        index_base = f"{self.root}/_/api/chan/index/?board={self.board}&page="
+        thread_base = f"{self.root}/{self.board}/thread/"

         page = self.page
         for pnum in itertools.count(text.parse_int(page, 1)):
-            with self.request(index_base + format(pnum)) as response:
+            with self.request(index_base + str(pnum)) as response:
                 try:
                     threads = response.json()
                 except ValueError:
@@ -209,7 +242,7 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor):

         while True:
             try:
-                data = self.request(url, params=params).json()
+                data = self.request_json(url, params=params)
             except ValueError:
                 return
@@ -235,27 +268,17 @@ class FoolfuukaGalleryExtractor(FoolfuukaExtractor):
     pattern = BASE_PATTERN + r"/([^/?#]+)/gallery(?:/(\d+))?"
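A standalone sketch of the i.4cdn.org redirect fix-up introduced above; the sample URL is a hypothetical value.

    url = "https://i.4cdn.org/jp/1712345678901.jpg"
    path, _, filename = url.rpartition("/")
    board = url.split("/", 4)[3]        # "jp", mapped to "warosu.org" above
    url = f"https://warosu.org/{board}/full_image/{filename}"
    # -> "https://warosu.org/jp/full_image/1712345678901.jpg"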
example = "https://archived.moe/a/gallery" - def __init__(self, match): - FoolfuukaExtractor.__init__(self, match) - - board = match.group(match.lastindex) - if board.isdecimal(): - self.board = match.group(match.lastindex-1) - self.pages = (board,) - else: - self.board = board - self.pages = map(format, itertools.count(1)) - def metadata(self): - return {"board": self.board} + self.board = board = self.groups[-2] + return {"board": board} def posts(self): - base = "{}/_/api/chan/gallery/?board={}&page=".format( - self.root, self.board) + pnum = self.groups[-1] + pages = itertools.count(1) if pnum is None else (pnum,) + base = f"{self.root}/_/api/chan/gallery/?board={self.board}&page=" - for page in self.pages: - with self.request(base + page) as response: - posts = response.json() + for pnum in pages: + posts = self.request_json(f"{base}{pnum}") if not posts: return yield from posts diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index bb684c2..7c59f72 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2023 Mike Fährmann +# Copyright 2016-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -18,14 +18,13 @@ class FoolslideExtractor(BaseExtractor): def __init__(self, match): BaseExtractor.__init__(self, match) - self.gallery_url = self.root + match.group(match.lastindex) + self.page_url = self.root + self.groups[-1] def request(self, url): return BaseExtractor.request( self, url, encoding="utf-8", method="POST", data={"adult": "true"}) - @staticmethod - def parse_chapter_url(url, data): + def parse_chapter_url(self, url, data): info = url.partition("/read/")[2].rstrip("/").split("/") lang = info[1].partition("-")[0] data["lang"] = lang @@ -52,7 +51,7 @@ class FoolslideChapterExtractor(FoolslideExtractor): example = "https://read.powermanga.org/read/MANGA/en/0/123/" def items(self): - page = self.request(self.gallery_url).text + page = self.request(self.page_url).text data = self.metadata(page) imgs = self.images(page) @@ -79,7 +78,7 @@ class FoolslideChapterExtractor(FoolslideExtractor): def metadata(self, page): extr = text.extract_from(page) extr('

', '') - return self.parse_chapter_url(self.gallery_url, { + return self.parse_chapter_url(self.page_url, { "manga" : text.unescape(extr('title="', '"')).strip(), "chapter_string": text.unescape(extr('title="', '"')), }) @@ -96,7 +95,7 @@ class FoolslideMangaExtractor(FoolslideExtractor): example = "https://read.powermanga.org/series/MANGA/" def items(self): - page = self.request(self.gallery_url).text + page = self.request(self.page_url).text chapters = self.chapters(page) if not self.config("chapter-reverse", False): diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index 565fd71..0d24f83 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020-2023 Mike Fährmann +# Copyright 2020-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,7 +8,7 @@ """Extractors for https://www.furaffinity.net/""" -from .common import Extractor, Message +from .common import Extractor, Message, Dispatch from .. import text, util BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?(?:f[ux]|f?xfu)raffinity\.net" @@ -28,7 +28,7 @@ class FuraffinityExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.user = match.group(1) + self.user = match[1] self.offset = 0 def _init(self): @@ -51,8 +51,7 @@ class FuraffinityExtractor(Extractor): def items(self): metadata = self.metadata() for post_id in util.advance(self.posts(), self.offset): - post = self._parse_post(post_id) - if post: + if post := self._parse_post(post_id): if metadata: post.update(metadata) yield Message.Directory, post @@ -71,7 +70,7 @@ class FuraffinityExtractor(Extractor): return num def _parse_post(self, post_id): - url = "{}/view/{}/".format(self.root, post_id) + url = f"{self.root}/view/{post_id}/" extr = text.extract_from(self.request(url).text) if self._new_layout is None: @@ -117,8 +116,7 @@ class FuraffinityExtractor(Extractor): data["folders"] = folders = [] for folder in extr( "

Listed in Folders

", "").split(""): - folder = rh(folder) - if folder: + if folder := rh(folder): folders.append(folder) else: # old site layout @@ -147,22 +145,19 @@ class FuraffinityExtractor(Extractor): data["user"] = self.user or data["artist_url"] data["date"] = text.parse_timestamp(data["filename"].partition(".")[0]) data["description"] = self._process_description(data["_description"]) - data["thumbnail"] = "https://t.furaffinity.net/{}@600-{}.jpg".format( - post_id, path.rsplit("/", 2)[1]) - + data["thumbnail"] = (f"https://t.furaffinity.net/{post_id}@600-" + f"{path.rsplit('/', 2)[1]}.jpg") return data - @staticmethod - def _process_description(description): + def _process_description(self, description): return text.unescape(text.remove_html(description, "", "")) def _pagination(self, path, folder=None): num = 1 - folder = "" if folder is None else "/folder/{}/a".format(folder) + folder = "" if folder is None else f"/folder/{folder}/a" while True: - url = "{}/{}/{}{}/{}/".format( - self.root, path, self.user, folder, num) + url = f"{self.root}/{path}/{self.user}{folder}/{num}/" page = self.request(url).text post_id = None @@ -174,7 +169,7 @@ class FuraffinityExtractor(Extractor): num += 1 def _pagination_favorites(self): - path = "/favorites/{}/".format(self.user) + path = f"/favorites/{self.user}/" while path: page = self.request(self.root + path).text @@ -188,7 +183,7 @@ class FuraffinityExtractor(Extractor): pos = page.find('type="submit">Next') if pos >= 0: - path = text.rextract(page, '
Next 48")) < 0 and \ + (pos := page.find(">>>> Next 48 >>")) < 0: return + + path = text.rextr(page, 'href="', '"', pos) url = self.root + text.unescape(path) diff --git a/gallery_dl/extractor/furry34.py b/gallery_dl/extractor/furry34.py index e0c7fdb..a93ec75 100644 --- a/gallery_dl/extractor/furry34.py +++ b/gallery_dl/extractor/furry34.py @@ -46,8 +46,8 @@ class Furry34Extractor(BooruExtractor): post_id = post["id"] root = self.root_cdn if files[fmt][0] else self.root - post["file_url"] = url = "{}/posts/{}/{}/{}.{}".format( - root, post_id // 1000, post_id, post_id, extension) + post["file_url"] = url = \ + f"{root}/posts/{post_id // 1000}/{post_id}/{post_id}.{extension}" post["format_id"] = fmt post["format"] = extension.partition(".")[0] @@ -73,11 +73,11 @@ class Furry34Extractor(BooruExtractor): post["tags_" + types[type]] = values def _fetch_post(self, post_id): - url = "{}/api/v2/post/{}".format(self.root, post_id) - return self.request(url).json() + url = f"{self.root}/api/v2/post/{post_id}" + return self.request_json(url) def _pagination(self, endpoint, params=None): - url = "{}/api{}".format(self.root, endpoint) + url = f"{self.root}/api{endpoint}" if params is None: params = {} @@ -86,7 +86,7 @@ class Furry34Extractor(BooruExtractor): threshold = self.per_page while True: - data = self.request(url, method="POST", json=params).json() + data = self.request_json(url, method="POST", json=params) yield from data["items"] diff --git a/gallery_dl/extractor/fuskator.py b/gallery_dl/extractor/fuskator.py index beecbff..b7cf0c8 100644 --- a/gallery_dl/extractor/fuskator.py +++ b/gallery_dl/extractor/fuskator.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2023 Mike Fährmann +# Copyright 2019-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -21,13 +21,13 @@ class FuskatorGalleryExtractor(GalleryExtractor): example = "https://fuskator.com/thumbs/ID/" def __init__(self, match): - self.gallery_hash = match.group(1) - url = "{}/thumbs/{}/index.html".format(self.root, self.gallery_hash) + self.gallery_hash = match[1] + url = f"{self.root}/thumbs/{self.gallery_hash}/index.html" GalleryExtractor.__init__(self, match, url) def metadata(self, page): headers = { - "Referer" : self.gallery_url, + "Referer" : self.page_url, "X-Requested-With": "XMLHttpRequest", } auth = self.request( @@ -39,9 +39,8 @@ class FuskatorGalleryExtractor(GalleryExtractor): "hash" : self.gallery_hash, "_" : int(time.time()), } - self.data = data = self.request( - self.root + "/ajax/gal.aspx", params=params, headers=headers, - ).json() + self.data = data = self.request_json( + self.root + "/ajax/gal.aspx", params=params, headers=headers) title = text.extr(page, "", "").strip() title, _, gallery_id = title.rpartition("#") @@ -72,7 +71,7 @@ class FuskatorSearchExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.path = match.group(1) + self.path = match[1] def items(self): url = self.root + self.path @@ -87,4 +86,4 @@ class FuskatorSearchExtractor(Extractor): pages = text.extr(page, 'class="pages">', '>>><') if not pages: return - url = self.root + text.rextract(pages, 'href="', '"')[0] + url = self.root + text.rextr(pages, 'href="', '"') diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index f24b696..b152885 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# 
Copyright 2014-2023 Mike Fährmann +# Copyright 2014-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -26,11 +26,19 @@ class GelbooruBase(): def _api_request(self, params, key="post", log=False): if "s" not in params: params["s"] = "post" + params["api_key"] = self.api_key params["user_id"] = self.user_id url = self.root + "/index.php?page=dapi&q=index&json=1" - data = self.request(url, params=params).json() + try: + data = self.request_json(url, params=params) + except exception.HttpError as exc: + if exc.status == 401: + raise exception.AuthorizationError( + f"'api-key' and 'user-id' required " + f"({exc.status}: {exc.response.reason})") + raise if not key: return data @@ -73,7 +81,7 @@ class GelbooruBase(): if id: tag = "id:" + op tags = [t for t in tags if not t.startswith(tag)] - tags = "{} id:{}".format(" ".join(tags), op) + tags = f"{' '.join(tags)} id:{op}" while True: posts = self._api_request(params) @@ -113,7 +121,7 @@ class GelbooruBase(): post["_fallback"] = (url,) md5 = post["md5"] root = text.root_from_url(post["preview_url"]) - path = "/images/{}/{}/{}.webm".format(md5[0:2], md5[2:4], md5) + path = f"/images/{md5[0:2]}/{md5[2:4]}/{md5}.webm" url = root + path return url @@ -292,7 +300,7 @@ class GelbooruRedirectExtractor(GelbooruBase, Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.url_base64 = match.group(1) + self.url_base64 = match[1] def items(self): url = text.ensure_http_scheme(binascii.a2b_base64( diff --git a/gallery_dl/extractor/gelbooru_v01.py b/gallery_dl/extractor/gelbooru_v01.py index 0b96048..61d0545 100644 --- a/gallery_dl/extractor/gelbooru_v01.py +++ b/gallery_dl/extractor/gelbooru_v01.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021-2023 Mike Fährmann +# Copyright 2021-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -17,8 +17,7 @@ class GelbooruV01Extractor(booru.BooruExtractor): per_page = 20 def _parse_post(self, post_id): - url = "{}/index.php?page=post&s=view&id={}".format( - self.root, post_id) + url = f"{self.root}/index.php?page=post&s=view&id={post_id}" extr = text.extract_from(self.request(url).text) post = { @@ -92,16 +91,12 @@ class GelbooruV01TagExtractor(GelbooruV01Extractor): pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)" example = "https://allgirl.booru.org/index.php?page=post&s=list&tags=TAG" - def __init__(self, match): - GelbooruV01Extractor.__init__(self, match) - self.tags = match.group(match.lastindex) - def metadata(self): - return {"search_tags": text.unquote(self.tags.replace("+", " "))} + self.tags = tags = self.groups[-1] + return {"search_tags": text.unquote(tags.replace("+", " "))} def posts(self): - url = "{}/index.php?page=post&s=list&tags={}&pid=".format( - self.root, self.tags) + url = f"{self.root}/index.php?page=post&s=list&tags={self.tags}&pid=" return self._pagination(url, 'class="thumb">Pool: ", "

") @@ -239,12 +243,9 @@ class GelbooruV02FavoriteExtractor(GelbooruV02Extractor): pattern = BASE_PATTERN + r"/index\.php\?page=favorites&s=view&id=(\d+)" example = "https://safebooru.org/index.php?page=favorites&s=view&id=12345" - def __init__(self, match): - GelbooruV02Extractor.__init__(self, match) - self.favorite_id = match.group(match.lastindex) - def metadata(self): - return {"favorite_id": text.parse_int(self.favorite_id)} + self.favorite_id = fav_id = self.groups[-1] + return {"favorite_id": text.parse_int(fav_id)} def posts(self): return self._pagination_html({ @@ -260,9 +261,5 @@ class GelbooruV02PostExtractor(GelbooruV02Extractor): pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)" example = "https://safebooru.org/index.php?page=post&s=view&id=12345" - def __init__(self, match): - GelbooruV02Extractor.__init__(self, match) - self.post_id = match.group(match.lastindex) - def posts(self): - return self._pagination({"id": self.post_id}) + return self._pagination({"id": self.groups[-1]}) diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py index 4b04732..407e478 100644 --- a/gallery_dl/extractor/generic.py +++ b/gallery_dl/extractor/generic.py @@ -7,9 +7,8 @@ """Generic information extractor""" from .common import Extractor, Message -from .. import config, text +from .. import config, text, util import os.path -import re class GenericExtractor(Extractor): @@ -37,28 +36,28 @@ class GenericExtractor(Extractor): example = "generic:https://www.nongnu.org/lzip/" def __init__(self, match): - self.subcategory = match.group('domain') + self.subcategory = match['domain'] Extractor.__init__(self, match) # Strip the "g(eneric):" prefix # and inform about "forced" or "fallback" mode - if match.group('generic'): - self.url = match.group(0).partition(":")[2] + if match['generic']: + self.url = match[0].partition(":")[2] else: self.log.info("Falling back on generic information extractor.") - self.url = match.group(0) + self.url = match[0] # Make sure we have a scheme, or use https - if match.group('scheme'): - self.scheme = match.group('scheme') + if match['scheme']: + self.scheme = match['scheme'] else: self.scheme = 'https://' self.url = text.ensure_http_scheme(self.url, self.scheme) - self.path = match.group('path') + self.path = match['path'] # Used to resolve relative image urls - self.root = self.scheme + match.group('domain') + self.root = self.scheme + match['domain'] def items(self): """Get page, extract metadata & images, yield them in suitable messages @@ -172,8 +171,8 @@ class GenericExtractor(Extractor): r"(?:[^\"'<>\s]*)?" # optional query and fragment ) - imageurls_src = re.findall(imageurl_pattern_src, page) - imageurls_ext = re.findall(imageurl_pattern_ext, page) + imageurls_src = util.re(imageurl_pattern_src).findall(page) + imageurls_ext = util.re(imageurl_pattern_ext).findall(page) imageurls = imageurls_src + imageurls_ext # Resolve relative urls @@ -182,10 +181,10 @@ class GenericExtractor(Extractor): # by prepending a suitable base url. 
# # If the page contains a element, use it as base url - basematch = re.search( - r"(?i)(?:[^\"' >]+)", page) + basematch = util.re( + r"(?i)(?:[^\"' >]+)").search(page) if basematch: - self.baseurl = basematch.group('url').rstrip('/') + self.baseurl = basematch['url'].rstrip('/') # Otherwise, extract the base url from self.url else: if self.url.endswith("/"): diff --git a/gallery_dl/extractor/girlsreleased.py b/gallery_dl/extractor/girlsreleased.py new file mode 100644 index 0000000..4fc77c6 --- /dev/null +++ b/gallery_dl/extractor/girlsreleased.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://girlsreleased.com/""" + +from .common import Extractor, Message +from .. import text +import itertools + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?girlsreleased\.com" + + +class GirlsreleasedExtractor(Extractor): + """Base class for girlsreleased extractors""" + category = "girlsreleased" + root = "https://girlsreleased.com" + request_interval = 0.5 + request_interval_min = 0.2 + + def items(self): + data = {"_extractor": GirlsreleasedSetExtractor} + base = f"{self.root}/set/" + for set in self._pagination(): + yield Message.Queue, f"{base}{set[0]}", data + + def _pagination(self): + base = f"{self.root}/api/0.1/sets/{self._path}/{self.groups[0]}/page/" + for pnum in itertools.count(): + sets = self.request_json(f"{base}{pnum}")["sets"] + if not sets: + return + + yield from sets[1:] if pnum else sets + if len(sets) < 80: + return + + +class GirlsreleasedSetExtractor(GirlsreleasedExtractor): + """Extractor for girlsreleased galleries""" + subcategory = "set" + pattern = BASE_PATTERN + r"/set/(\d+)" + example = "https://girlsreleased.com/set/12345" + + def items(self): + url = f"{self.root}/api/0.1/set/{self.groups[0]}" + json = self.request_json(url)["set"] + data = { + "title": json["name"] or json["id"], + "id": json["id"], + "site": json["site"], + "model": [model for _, model in json["models"]], + "date": text.parse_timestamp(json["date"]), + "count": len(json["images"]), + "url": "https://girlsreleased.com/set/" + json["id"], + } + yield Message.Directory, data + for data["num"], image in enumerate(json["images"], 1): + text.nameext_from_url(image[5], data) + yield Message.Queue, image[3], data + + +class GirlsreleasedModelExtractor(GirlsreleasedExtractor): + """Extractor for girlsreleased models""" + subcategory = _path = "model" + pattern = BASE_PATTERN + r"/model/(\d+(?:/.+)?)" + example = "https://girlsreleased.com/model/12345/MODEL" + + +class GirlsreleasedSiteExtractor(GirlsreleasedExtractor): + """Extractor for girlsreleased sites""" + subcategory = _path = "site" + pattern = BASE_PATTERN + r"/site/([^/?#]+(?:/model/\d+/?.*)?)" + example = "https://girlsreleased.com/site/SITE" diff --git a/gallery_dl/extractor/girlswithmuscle.py b/gallery_dl/extractor/girlswithmuscle.py new file mode 100644 index 0000000..51b979f --- /dev/null +++ b/gallery_dl/extractor/girlswithmuscle.py @@ -0,0 +1,177 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from .common import Extractor, Message +from .. 
import text, util, exception +from ..cache import cache + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?girlswithmuscle\.com" + + +class GirlswithmuscleExtractor(Extractor): + """Base class for girlswithmuscle extractors""" + category = "girlswithmuscle" + root = "https://www.girlswithmuscle.com" + directory_fmt = ("{category}", "{model}") + filename_fmt = "{model}_{id}.{extension}" + archive_fmt = "{type}_{model}_{id}" + + def login(self): + username, password = self._get_auth_info() + if username: + self.cookies_update(self._login_impl(username, password)) + + @cache(maxage=14*86400, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + + url = self.root + "/login/" + page = self.request(url).text + csrf_token = text.extr(page, 'name="csrfmiddlewaretoken" value="', '"') + + headers = { + "Origin" : self.root, + "Referer": url, + } + data = { + "csrfmiddlewaretoken": csrf_token, + "username": username, + "password": password, + "next": "/", + } + response = self.request( + url, method="POST", headers=headers, data=data) + + if not response.history: + raise exception.AuthenticationError() + + page = response.text + if ">Wrong username or password" in page: + raise exception.AuthenticationError() + if ">Log in<" in page: + raise exception.AuthenticationError("Account data is missing") + + return {c.name: c.value for c in response.history[0].cookies} + + +class GirlswithmusclePostExtractor(GirlswithmuscleExtractor): + """Extractor for individual posts on girlswithmuscle.com""" + subcategory = "post" + pattern = BASE_PATTERN + r"/(\d+)" + example = "https://www.girlswithmuscle.com/12345/" + + def items(self): + self.login() + + url = f"{self.root}/{self.groups[0]}/" + page = self.request(url).text + if not page: + raise exception.NotFoundError("post") + + metadata = self.metadata(page) + + if url := text.extr(page, 'class="main-image" src="', '"'): + metadata["type"] = "picture" + else: + url = text.extr(page, '', "")) + image_info = text.extr( + page, '
', "
") + uploader = text.remove_html(text.extr( + image_info, '', "
")) + + tags = text.extr(page, 'id="tags-text">', "") + score = text.parse_int(text.remove_html(text.extr( + page, "Score: ", "", "") + return "unknown" if model.startswith("Picture #") else model + + def _parse_model_list(self, model): + if model == "unknown": + return [] + else: + return [name.strip() for name in model.split(",")] + + def _parse_is_favorite(self, page): + fav_button = text.extr( + page, 'id="favorite-button">', "") + unfav_button = text.extr( + page, 'class="actionbutton unfavorite-button">', "") + + is_favorite = None + if unfav_button == "Unfavorite": + is_favorite = True + if fav_button == "Favorite": + is_favorite = False + + return is_favorite + + def _extract_comments(self, page): + comments = text.extract_iter( + page, '
', "
") + return [comment.strip() for comment in comments] + + +class GirlswithmuscleSearchExtractor(GirlswithmuscleExtractor): + """Extractor for search results on girlswithmuscle.com""" + subcategory = "search" + pattern = BASE_PATTERN + r"/images/(.*)" + example = "https://www.girlswithmuscle.com/images/?name=MODEL" + + def pages(self): + query = self.groups[0] + url = f"{self.root}/images/{query}" + response = self.request(url) + if response.history: + msg = f'Request was redirected to "{response.url}", try logging in' + raise exception.AuthorizationError(msg) + page = response.text + + match = util.re(r"Page (\d+) of (\d+)").search(page) + current, total = match.groups() + current, total = text.parse_int(current), text.parse_int(total) + + yield page + for i in range(current + 1, total + 1): + url = f"{self.root}/images/{i}/{query}" + yield self.request(url).text + + def items(self): + self.login() + for page in self.pages(): + data = { + "_extractor" : GirlswithmusclePostExtractor, + "gallery_name": text.unescape(text.extr(page, "", "<")), + } + for imgid in text.extract_iter(page, 'id="imgid-', '"'): + url = f"{self.root}/{imgid}/" + yield Message.Queue, url, data diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py index ef9ea60..0a6c9b9 100644 --- a/gallery_dl/extractor/gofile.py +++ b/gallery_dl/extractor/gofile.py @@ -23,7 +23,7 @@ class GofileFolderExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.content_id = match.group(1) + self.content_id = match[1] def items(self): recursive = self.config("recursive") @@ -86,17 +86,16 @@ class GofileFolderExtractor(Extractor): return self._api_request("contents/" + content_id, params, headers) def _api_request(self, endpoint, params=None, headers=None, method="GET"): - response = self.request( + response = self.request_json( "https://api.gofile.io/" + endpoint, - method=method, params=params, headers=headers, - ).json() + method=method, params=params, headers=headers) if response["status"] != "ok": if response["status"] == "error-notFound": raise exception.NotFoundError("content") if response["status"] == "error-passwordRequired": raise exception.AuthorizationError("Password required") - raise exception.StopExtraction( - "%s failed (Status: %s)", endpoint, response["status"]) + raise exception.AbortExtraction( + f"{endpoint} failed (Status: {response['status']})") return response["data"] diff --git a/gallery_dl/extractor/hatenablog.py b/gallery_dl/extractor/hatenablog.py index 792f666..8e350d6 100644 --- a/gallery_dl/extractor/hatenablog.py +++ b/gallery_dl/extractor/hatenablog.py @@ -6,9 +6,8 @@ """Extractors for https://hatenablog.com""" -import re from .common import Extractor, Message -from .. import text +from .. 
import text, util BASE_PATTERN = ( @@ -28,10 +27,10 @@ class HatenablogExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.domain = match.group(1) or match.group(2) + self.domain = match[1] or match[2] def _init(self): - self._find_img = re.compile(r'<img +([^>]+)').finditer + self._find_img = util.re(r'<img +([^>]+)').finditer def _handle_article(self, article: str): extr = text.extract_from(article) @@ -43,8 +42,8 @@ class HatenablogExtractor(Extractor): '<div class="entry-content hatenablog-entry">', '</div>') images = [] - for i in self._find_img(content): - attributes = i.group(1) + for match in self._find_img(content): + attributes = match[1] if 'class="hatena-fotolife"' not in attributes: continue image = text.unescape(text.extr(attributes, 'src="', '"')) @@ -68,13 +67,13 @@ class HatenablogEntriesExtractor(HatenablogExtractor): def __init__(self, match): HatenablogExtractor.__init__(self, match) - self.path = match.group(3) + self.path = match[3] self.query = {key: value for key, value in text.parse_query( - match.group(4)).items() if self._acceptable_query(key)} + match[4]).items() if self._acceptable_query(key)} def _init(self): HatenablogExtractor._init(self) - self._find_pager_url = re.compile( + self._find_pager_url = util.re( r' class="pager-next">\s*<a href="([^"]+)').search def items(self): @@ -92,7 +91,7 @@ class HatenablogEntriesExtractor(HatenablogExtractor): yield from self._handle_full_articles(extr) match = self._find_pager_url(page) - url = text.unescape(match.group(1)) if match else None + url = text.unescape(match[1]) if match else None query = None def _handle_partial_articles(self, extr): @@ -129,7 +128,7 @@ class HatenablogEntryExtractor(HatenablogExtractor): def __init__(self, match): HatenablogExtractor.__init__(self, match) - self.path = match.group(3) + self.path = match[3] def items(self): url = "https://" + self.domain + "/entry/" + self.path diff --git a/gallery_dl/extractor/hentai2read.py b/gallery_dl/extractor/hentai2read.py index 1317ce9..ac4cd02 100644 --- a/gallery_dl/extractor/hentai2read.py +++ b/gallery_dl/extractor/hentai2read.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2023 Mike Fährmann +# Copyright 2016-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,7 +10,6 @@ from .common import ChapterExtractor, MangaExtractor from .. import text, util -import re class Hentai2readBase(): @@ -31,8 +30,9 @@ class Hentai2readChapterExtractor(Hentai2readBase, ChapterExtractor): chapter_id, pos = text.extract(page, 'data-cid="', '"', pos) chapter, sep, minor = self.groups[1].partition(".") - match = re.match(r"Reading (.+) \(([^)]+)\) Hentai(?: by (.*))? - " - r"([^:]+): (.+) . Page 1 ", title) + match = util.re( + r"Reading (.+) \(([^)]+)\) Hentai(?: by (.*))? - " + r"([^:]+): (.+) . 
Page 1 ").match(title) if match: manga, type, author, _, title = match.groups() else: diff --git a/gallery_dl/extractor/hentaicosplays.py b/gallery_dl/extractor/hentaicosplays.py index 4992b7b..5c2628f 100644 --- a/gallery_dl/extractor/hentaicosplays.py +++ b/gallery_dl/extractor/hentaicosplays.py @@ -44,10 +44,10 @@ class HentaicosplaysGalleryExtractor( def __init__(self, match): BaseExtractor.__init__(self, match) self.slug = self.groups[-1] - self.gallery_url = "{}/story/{}/".format(self.root, self.slug) + self.page_url = f"{self.root}/story/{self.slug}/" def _init(self): - self.session.headers["Referer"] = self.gallery_url + self.session.headers["Referer"] = self.page_url def metadata(self, page): title = text.extr(page, "<title>", "") diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py index 7e128a4..e529940 100644 --- a/gallery_dl/extractor/hentaifoundry.py +++ b/gallery_dl/extractor/hentaifoundry.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2023 Mike Fährmann +# Copyright 2015-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,7 +8,7 @@ """Extractors for https://www.hentai-foundry.com/""" -from .common import Extractor, Message +from .common import Extractor, Message, Dispatch from .. import text, util BASE_PATTERN = r"(https?://)?(?:www\.)?hentai-foundry\.com" @@ -25,8 +25,8 @@ class HentaifoundryExtractor(Extractor): per_page = 25 def __init__(self, match): - self.root = (match.group(1) or "https://") + "www.hentai-foundry.com" - self.user = match.group(2) + self.root = (match[1] or "https://") + "www.hentai-foundry.com" + self.user = match[2] Extractor.__init__(self, match) self.page_url = "" self.start_post = 0 @@ -58,7 +58,7 @@ class HentaifoundryExtractor(Extractor): num = self.start_page while True: - page = self.request("{}/page/{}".format(url, num)).text + page = self.request(f"{url}/page/{num}").text yield from text.extract_iter(page, begin, end) if 'class="pager"' not in page or 'class="last hidden"' in page: @@ -192,15 +192,11 @@ class HentaifoundryExtractor(Extractor): self.request(url, method="POST", data=data) -class HentaifoundryUserExtractor(HentaifoundryExtractor): +class HentaifoundryUserExtractor(Dispatch, HentaifoundryExtractor): """Extractor for a hentaifoundry user profile""" - subcategory = "user" pattern = BASE_PATTERN + r"/user/([^/?#]+)/profile" example = "https://www.hentai-foundry.com/user/USER/profile" - def initialize(self): - pass - def items(self): root = self.root user = "/user/" + self.user @@ -224,7 +220,7 @@ class HentaifoundryPicturesExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__(self, match) - self.page_url = "{}/pictures/user/{}".format(self.root, self.user) + self.page_url = f"{self.root}/pictures/user/{self.user}" class HentaifoundryScrapsExtractor(HentaifoundryExtractor): @@ -236,8 +232,7 @@ class HentaifoundryScrapsExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__(self, match) - self.page_url = "{}/pictures/user/{}/scraps".format( - self.root, self.user) + self.page_url = f"{self.root}/pictures/user/{self.user}/scraps" class HentaifoundryFavoriteExtractor(HentaifoundryExtractor): @@ -250,8 +245,7 @@ class HentaifoundryFavoriteExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__(self, match) - self.page_url = "{}/user/{}/faves/pictures".format( 
- self.root, self.user) + self.page_url = f"{self.root}/user/{self.user}/faves/pictures" class HentaifoundryTagExtractor(HentaifoundryExtractor): @@ -264,7 +258,7 @@ class HentaifoundryTagExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__(self, match) - self.page_url = "{}/pictures/tagged/{}".format(self.root, self.user) + self.page_url = f"{self.root}/pictures/tagged/{self.user}" def metadata(self): return {"search_tags": self.user} @@ -280,7 +274,7 @@ class HentaifoundryRecentExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__(self, match) - self.page_url = "{}/pictures/recent/{}".format(self.root, self.user) + self.page_url = f"{self.root}/pictures/recent/{self.user}" def metadata(self): return {"date": self.user} @@ -310,11 +304,11 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__(self, match) - self.index = match.group(3) + self.index = match[3] def items(self): - post_url = "{}/pictures/user/{}/{}/?enterAgree=1".format( - self.root, self.user, self.index) + post_url = (f"{self.root}/pictures/user/{self.user}" + f"/{self.index}/?enterAgree=1") image = self._parse_post(post_url) image["user"] = self.user yield Message.Directory, image @@ -336,7 +330,7 @@ class HentaifoundryStoriesExtractor(HentaifoundryExtractor): yield Message.Url, story["src"], story def stories(self): - url = "{}/stories/user/{}".format(self.root, self.user) + url = f"{self.root}/stories/user/{self.user}" return self._pagination(url, '
', '') @@ -351,11 +345,11 @@ class HentaifoundryStoryExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__(self, match) - self.index = match.group(3) + self.index = match[3] def items(self): - story_url = "{}/stories/user/{}/{}/x?enterAgree=1".format( - self.root, self.user, self.index) + story_url = (f"{self.root}/stories/user/{self.user}" + f"/{self.index}/x?enterAgree=1") story = self._parse_story(self.request(story_url).text) yield Message.Directory, story yield Message.Url, story["src"], story diff --git a/gallery_dl/extractor/hentaihand.py b/gallery_dl/extractor/hentaihand.py index f3f43c4..f4f9d86 100644 --- a/gallery_dl/extractor/hentaihand.py +++ b/gallery_dl/extractor/hentaihand.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020-2023 Mike Fährmann +# Copyright 2020-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -20,8 +20,8 @@ class HentaihandGalleryExtractor(GalleryExtractor): example = "https://hentaihand.com/en/comic/TITLE" def __init__(self, match): - self.slug = match.group(1) - url = "{}/api/comics/{}".format(self.root, self.slug) + self.slug = match[1] + url = f"{self.root}/api/comics/{self.slug}" GalleryExtractor.__init__(self, match, url) def metadata(self, page): @@ -44,7 +44,7 @@ class HentaihandGalleryExtractor(GalleryExtractor): return data def images(self, _): - info = self.request(self.gallery_url + "/images").json() + info = self.request_json(self.page_url + "/images") return [(img["source_url"], img) for img in info["images"]] @@ -68,8 +68,8 @@ class HentaihandTagExtractor(Extractor): else: tpl = self.type + "s" - url = "{}/api/{}/{}".format(self.root, tpl, self.key) - tid = self.request(url, notfound=self.type).json()["id"] + url = f"{self.root}/api/{tpl}/{self.key}" + tid = self.request_json(url, notfound=self.type)["id"] url = self.root + "/api/comics" params = { @@ -82,10 +82,10 @@ class HentaihandTagExtractor(Extractor): "duration": "day", } while True: - info = self.request(url, params=params).json() + info = self.request_json(url, params=params) for gallery in info["data"]: - gurl = "{}/en/comic/{}".format(self.root, gallery["slug"]) + gurl = f"{self.root}/en/comic/{gallery['slug']}" gallery["_extractor"] = HentaihandGalleryExtractor yield Message.Queue, gurl, gallery diff --git a/gallery_dl/extractor/hentaihere.py b/gallery_dl/extractor/hentaihere.py index ba9558c..b894d77 100644 --- a/gallery_dl/extractor/hentaihere.py +++ b/gallery_dl/extractor/hentaihere.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2023 Mike Fährmann +# Copyright 2016-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,7 +10,6 @@ from .common import ChapterExtractor, MangaExtractor from .. 
import text, util -import re class HentaihereBase(): @@ -27,30 +26,30 @@ class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor): def __init__(self, match): self.manga_id, self.chapter = match.groups() - url = "{}/m/S{}/{}/1".format(self.root, self.manga_id, self.chapter) + url = f"{self.root}/m/S{self.manga_id}/{self.chapter}/1" ChapterExtractor.__init__(self, match, url) def metadata(self, page): title = text.extr(page, "", "") chapter_id = text.extr(page, 'report/C', '"') chapter, sep, minor = self.chapter.partition(".") - pattern = r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by (.+) at " - match = re.match(pattern, title) + match = util.re( + r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by " + r"(.+) at ").match(title) return { - "manga": match.group(1), + "manga": match[1], "manga_id": text.parse_int(self.manga_id), "chapter": text.parse_int(chapter), "chapter_minor": sep + minor, "chapter_id": text.parse_int(chapter_id), - "type": match.group(2), - "title": match.group(3), - "author": match.group(4), + "type": match[2], + "title": match[3], + "author": match[4], "lang": "en", "language": "English", } - @staticmethod - def images(page): + def images(self, page): images = text.extr(page, "var rff_imageList = ", ";") return [ ("https://hentaicdn.com/hentai" + part, None) @@ -73,7 +72,7 @@ class HentaihereMangaExtractor(HentaihereBase, MangaExtractor): mtype, pos = text.extract( page, '[', ']', pos) manga_id = text.parse_int( - self.manga_url.rstrip("/").rpartition("/")[2][1:]) + self.page_url.rstrip("/").rpartition("/")[2][1:]) while True: marker, pos = text.extract( diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py index 286ee38..d3901ac 100644 --- a/gallery_dl/extractor/hentainexus.py +++ b/gallery_dl/extractor/hentainexus.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2024 Mike Fährmann +# Copyright 2019-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -22,8 +22,8 @@ class HentainexusGalleryExtractor(GalleryExtractor): example = "https://hentainexus.com/view/12345" def __init__(self, match): - self.gallery_id = match.group(1) - url = "{}/view/{}".format(self.root, self.gallery_id) + self.gallery_id = match[1] + url = f"{self.root}/view/{self.gallery_id}" GalleryExtractor.__init__(self, match, url) def metadata(self, page): @@ -59,7 +59,7 @@ class HentainexusGalleryExtractor(GalleryExtractor): return data def images(self, _): - url = "{}/read/{}".format(self.root, self.gallery_id) + url = f"{self.root}/read/{self.gallery_id}" page = self.request(url).text imgs = util.json_loads(self._decode(text.extr( page, 'initReader("', '"'))) @@ -78,8 +78,7 @@ class HentainexusGalleryExtractor(GalleryExtractor): pass return results - @staticmethod - def _decode(data): + def _decode(self, data): # https://hentainexus.com/static/js/reader.min.js?r=22 hostname = "hentainexus.com" primes = (2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53) @@ -118,8 +117,7 @@ class HentainexusGalleryExtractor(GalleryExtractor): return result - @staticmethod - def _join_title(data): + def _join_title(self, data): event = data['event'] artist = data['artist'] circle = data['circle'] @@ -137,18 +135,18 @@ class HentainexusGalleryExtractor(GalleryExtractor): jt = '' if event: - jt += '({}) '.format(event) + jt += f'({event}) ' if circle: - jt += '[{} ({})] '.format(circle, artist) + jt += f'[{circle} ({artist})] ' else: - jt += 
'[{}] '.format(artist) + jt += f'[{artist}] ' jt += title if parody.lower() != 'original work': - jt += ' ({})'.format(parody) + jt += f' ({parody})' if book: - jt += ' ({})'.format(book) + jt += f' ({book})' if magazine: - jt += ' ({})'.format(magazine) + jt += f' ({magazine})' return jt diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py index f15aab7..a75eee0 100644 --- a/gallery_dl/extractor/hiperdex.py +++ b/gallery_dl/extractor/hiperdex.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020-2023 Mike Fährmann +# Copyright 2020-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,9 +9,8 @@ """Extractors for https://hiperdex.com/""" from .common import ChapterExtractor, MangaExtractor -from .. import text +from .. import text, util from ..cache import memcache -import re BASE_PATTERN = (r"((?:https?://)?(?:www\.)?" r"(?:1st)?hiper(?:dex|toon)\d?\.(?:com|net|info|top))") @@ -25,7 +24,7 @@ class HiperdexBase(): @memcache(keyarg=1) def manga_data(self, manga, page=None): if not page: - url = "{}/manga/{}/".format(self.root, manga) + url = f"{self.root}/manga/{manga}/" page = self.request(url).text extr = text.extract_from(page) @@ -80,10 +79,10 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): return self.chapter_data(self.chapter) def images(self, page): + pattern = util.re(r'id="image-\d+"\s+(?:data-)?src="([^"]+)') return [ (url.strip(), None) - for url in re.findall( - r'id="image-\d+"\s+(?:data-)?src="([^"]+)', page) + for url in pattern.findall(page) ] @@ -100,14 +99,14 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor): def chapters(self, page): data = self.manga_data(self.manga, page) - self.manga_url = url = data["url"] + self.page_url = url = data["url"] - url = self.manga_url + "ajax/chapters/" + url = self.page_url + "ajax/chapters/" headers = { "Accept": "*/*", "X-Requested-With": "XMLHttpRequest", "Origin": self.root, - "Referer": "https://" + text.quote(self.manga_url[8:]), + "Referer": "https://" + text.quote(self.page_url[8:]), } html = self.request(url, method="POST", headers=headers).text @@ -130,8 +129,8 @@ class HiperdexArtistExtractor(HiperdexBase, MangaExtractor): example = "https://hiperdex.com/manga-artist/NAME/" def __init__(self, match): - self.root = text.ensure_http_scheme(match.group(1)) - MangaExtractor.__init__(self, match, self.root + match.group(2) + "/") + self.root = text.ensure_http_scheme(match[1]) + MangaExtractor.__init__(self, match, self.root + match[2] + "/") def chapters(self, page): results = [] diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index 086b77c..82bed80 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2023 Mike Fährmann +# Copyright 2015-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -13,7 +13,6 @@ from .nozomi import decode_nozomi from ..cache import memcache from .. 
import text, util import string -import re class HitomiExtractor(Extractor): @@ -22,6 +21,27 @@ class HitomiExtractor(Extractor): root = "https://hitomi.la" domain = "gold-usergeneratedcontent.net" + def load_nozomi(self, query, language="all", headers=None): + ns, _, tag = query.strip().partition(":") + + if ns == "female" or ns == "male": + ns = "tag/" + tag = query + elif ns == "language": + ns = "" + language = tag + tag = "index" + else: + ns = f"{ns}/" + + url = (f"https://ltn.{self.domain}/n/{ns}" + f"/{tag.replace('_', ' ')}-{language}.nozomi") + if headers is None: + headers = {} + headers["Origin"] = self.root + headers["Referer"] = f"{self.root}/" + return decode_nozomi(self.request(url, headers=headers).content) + class HitomiGalleryExtractor(HitomiExtractor, GalleryExtractor): """Extractor for hitomi.la galleries""" @@ -33,23 +53,19 @@ class HitomiGalleryExtractor(HitomiExtractor, GalleryExtractor): def __init__(self, match): GalleryExtractor.__init__(self, match, False) self.gid = gid = self.groups[0] - self.gallery_url = "https://ltn.{}/galleries/{}.js".format( - self.domain, gid) + self.page_url = f"https://ltn.{self.domain}/galleries/{gid}.js" def _init(self): - self.session.headers["Referer"] = "{}/reader/{}.html".format( - self.root, self.gid) + self.session.headers["Referer"] = f"{self.root}/reader/{self.gid}.html" def metadata(self, page): self.info = info = util.json_loads(page.partition("=")[2]) iget = info.get - language = iget("language") - if language: + if language := iget("language"): language = language.capitalize() - date = iget("date") - if date: + if date := iget("date"): date += ":00" tags = [] @@ -83,7 +99,7 @@ class HitomiGalleryExtractor(HitomiExtractor, GalleryExtractor): fmt = ext = self.config("format") or "webp" check = (fmt != "webp") - result = [] + results = [] for image in self.info["files"]: if check: ext = fmt if image.get("has" + fmt) else "webp" @@ -94,12 +110,10 @@ class HitomiGalleryExtractor(HitomiExtractor, GalleryExtractor): # https://ltn.gold-usergeneratedcontent.net/common.js inum = int(ihash[-1] + ihash[-3:-1], 16) - url = "https://{}{}.{}/{}/{}/{}.{}".format( - ext[0], gg_m.get(inum, gg_default) + 1, self.domain, - gg_b, inum, ihash, ext, - ) - result.append((url, idata)) - return result + url = (f"https://{ext[0]}{gg_m.get(inum, gg_default) + 1}." 
+ f"{self.domain}/{gg_b}/{inum}/{ihash}.{ext}") + results.append((url, idata)) + return results class HitomiTagExtractor(HitomiExtractor): @@ -123,8 +137,7 @@ class HitomiTagExtractor(HitomiExtractor): "_extractor": HitomiGalleryExtractor, "search_tags": text.unquote(self.tag.rpartition("-")[0]), } - nozomi_url = "https://ltn.{}/{}/{}.nozomi".format( - self.domain, self.type, self.tag) + nozomi_url = f"https://ltn.{self.domain}/{self.type}/{self.tag}.nozomi" headers = { "Origin": self.root, "Cache-Control": "max-age=0", @@ -133,14 +146,13 @@ class HitomiTagExtractor(HitomiExtractor): offset = 0 total = None while True: - headers["Referer"] = "{}/{}/{}.html?page={}".format( - self.root, self.type, self.tag, offset // 100 + 1) - headers["Range"] = "bytes={}-{}".format(offset, offset+99) + headers["Referer"] = (f"{self.root}/{self.type}/{self.tag}.html" + f"?page={offset // 100 + 1}") + headers["Range"] = f"bytes={offset}-{offset + 99}" response = self.request(nozomi_url, headers=headers) for gallery_id in decode_nozomi(response.content): - gallery_url = "{}/galleries/{}.html".format( - self.root, gallery_id) + gallery_url = f"{self.root}/galleries/{gallery_id}.html" yield Message.Queue, gallery_url, data offset += 100 @@ -163,8 +175,8 @@ class HitomiIndexExtractor(HitomiTagExtractor): def items(self): data = {"_extractor": HitomiGalleryExtractor} - nozomi_url = "https://ltn.{}/{}-{}.nozomi".format( - self.domain, self.tag, self.language) + nozomi_url = (f"https://ltn.{self.domain}" + f"/{self.tag}-{self.language}.nozomi") headers = { "Origin": self.root, "Cache-Control": "max-age=0", @@ -173,14 +185,13 @@ class HitomiIndexExtractor(HitomiTagExtractor): offset = 0 total = None while True: - headers["Referer"] = "{}/{}-{}.html?page={}".format( - self.root, self.tag, self.language, offset // 100 + 1) - headers["Range"] = "bytes={}-{}".format(offset, offset+99) + headers["Referer"] = (f"{self.root}/{self.tag}-{self.language}" + f".html?page={offset // 100 + 1}") + headers["Range"] = f"bytes={offset}-{offset + 99}" response = self.request(nozomi_url, headers=headers) for gallery_id in decode_nozomi(response.content): - gallery_url = "{}/galleries/{}.html".format( - self.root, gallery_id) + gallery_url = f"{self.root}/galleries/{gallery_id}.html" yield Message.Queue, gallery_url, data offset += 100 @@ -194,60 +205,46 @@ class HitomiIndexExtractor(HitomiTagExtractor): class HitomiSearchExtractor(HitomiExtractor): """Extractor for galleries from multiple tag searches on hitomi.la""" subcategory = "search" - pattern = r"(?:https?://)?hitomi\.la/search\.html\?([^/?#]+)" + pattern = r"(?:https?://)?hitomi\.la/search\.html\?([^#]+)" example = "https://hitomi.la/search.html?QUERY" - def __init__(self, match): - Extractor.__init__(self, match) - self.query = match.group(1) - self.tags = text.unquote(self.query) - def items(self): + tags = text.unquote(self.groups[0]) + data = { "_extractor": HitomiGalleryExtractor, - "search_tags": self.tags, + "search_tags": tags, } - results = [self.get_nozomi_items(tag) for tag in self.tags.split(" ")] - intersects = set.intersection(*results) - for gallery_id in sorted(intersects, reverse=True): - gallery_url = "{}/galleries/{}.html".format( - self.root, gallery_id) + for gallery_id in self.gallery_ids(tags): + gallery_url = f"{self.root}/galleries/{gallery_id}.html" yield Message.Queue, gallery_url, data - def get_nozomi_items(self, full_tag): - area, tag, language = self.get_nozomi_args(full_tag) + def gallery_ids(self, tags): + result = None + positive = [] + 
negative = [] - if area: - nozomi_url = "https://ltn.{}/n/{}/{}-{}.nozomi".format( - self.domain, area, tag, language) - else: - nozomi_url = "https://ltn.{}/n/{}-{}.nozomi".format( - self.domain, tag, language) + for tag in tags.split(): + if tag[0] == "-": + negative.append(tag[1:]) + else: + positive.append(tag) - headers = { - "Origin": self.root, - "Cache-Control": "max-age=0", - "Referer": "{}/search.html?{}".format(self.root, self.query), - } - - response = self.request(nozomi_url, headers=headers) - return set(decode_nozomi(response.content)) + for tag in positive: + ids = self.load_nozomi(tag) + if result is None: + result = set(ids) + else: + result.intersection_update(ids) - def get_nozomi_args(self, query): - ns, _, tag = query.strip().partition(":") - area = ns - language = "all" - - if ns == "female" or ns == "male": - area = "tag" - tag = query - elif ns == "language": - area = None - language = tag - tag = "index" + if result is None: + # result = set(self.load_nozomi("index")) + result = set(self.load_nozomi("language:all")) + for tag in negative: + result.difference_update(self.load_nozomi(tag)) - return area, tag.replace("_", " "), language + return sorted(result, reverse=True) if result else () @memcache(maxage=1800) @@ -257,8 +254,8 @@ def _parse_gg(extr): m = {} keys = [] - for match in re.finditer( - r"case\s+(\d+):(?:\s*o\s*=\s*(\d+))?", page): + for match in util.re_compile( + r"case\s+(\d+):(?:\s*o\s*=\s*(\d+))?").finditer(page): key, value = match.groups() keys.append(int(key)) @@ -268,11 +265,11 @@ def _parse_gg(extr): m[key] = value keys.clear() - for match in re.finditer( - r"if\s+\(g\s*===?\s*(\d+)\)[\s{]*o\s*=\s*(\d+)", page): - m[int(match.group(1))] = int(match.group(2)) + for match in util.re_compile( + r"if\s+\(g\s*===?\s*(\d+)\)[\s{]*o\s*=\s*(\d+)").finditer(page): + m[int(match[1])] = int(match[2]) - d = re.search(r"(?:var\s|default:)\s*o\s*=\s*(\d+)", page) - b = re.search(r"b:\s*[\"'](.+)[\"']", page) + d = util.re_compile(r"(?:var\s|default:)\s*o\s*=\s*(\d+)").search(page) + b = util.re_compile(r"b:\s*[\"'](.+)[\"']").search(page) - return m, b.group(1).strip("/"), int(d.group(1)) if d else 0 + return m, b[1].strip("/"), int(d[1]) if d else 0 diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py index ddfc54b..587d88c 100644 --- a/gallery_dl/extractor/hotleak.py +++ b/gallery_dl/extractor/hotleak.py @@ -70,8 +70,7 @@ class HotleakPostExtractor(HotleakExtractor): self.creator, self.type, self.id = match.groups() def posts(self): - url = "{}/{}/{}/{}".format( - self.root, self.creator, self.type, self.id) + url = f"{self.root}/{self.creator}/{self.type}/{self.id}" page = self.request(url).text page = text.extr( page, '
', '') @@ -103,10 +102,10 @@ class HotleakCreatorExtractor(HotleakExtractor): def __init__(self, match): HotleakExtractor.__init__(self, match) - self.creator = match.group(1) + self.creator = match[1] def posts(self): - url = "{}/{}".format(self.root, self.creator) + url = f"{self.root}/{self.creator}" return self._pagination(url) def _pagination(self, url): @@ -159,7 +158,7 @@ class HotleakCategoryExtractor(HotleakExtractor): self._category, self.params = match.groups() def items(self): - url = "{}/{}".format(self.root, self._category) + url = f"{self.root}/{self._category}" if self._category in ("hot", "creators"): data = {"_extractor": HotleakCreatorExtractor} @@ -178,7 +177,7 @@ class HotleakSearchExtractor(HotleakExtractor): def __init__(self, match): HotleakExtractor.__init__(self, match) - self.params = match.group(1) + self.params = match[1] def items(self): data = {"_extractor": HotleakCreatorExtractor} diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py index 8f4a10c..075e1f6 100644 --- a/gallery_dl/extractor/idolcomplex.py +++ b/gallery_dl/extractor/idolcomplex.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2023 Mike Fährmann +# Copyright 2018-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -108,8 +108,7 @@ class IdolcomplexExtractor(SankakuExtractor): pid = extr(">Post ID:", "<") created = extr(' title="', '"') - file_url = extr('>Original:', 'id=') - if file_url: + if file_url := extr('>Original:', 'id='): file_url = extr(' href="', '"') width = extr(">", "x") height = extr("", " ") @@ -159,7 +158,7 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor): def __init__(self, match): IdolcomplexExtractor.__init__(self, match) - query = text.parse_query(match.group(1)) + query = text.parse_query(match[1]) self.tags = text.unquote(query.get("tags", "").replace("+", " ")) self.start_page = text.parse_int(query.get("page"), 1) self.next = text.parse_int(query.get("next"), 0) @@ -184,7 +183,7 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor): tags = self.tags.split() if not self.logged_in and len(tags) > 4: - raise exception.StopExtraction( + raise exception.AbortExtraction( "Non-members can only search up to 4 tags at once") return {"search_tags": " ".join(tags)} diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py index 68360e9..171feea 100644 --- a/gallery_dl/extractor/imagebam.py +++ b/gallery_dl/extractor/imagebam.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2023 Mike Fährmann +# Copyright 2014-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,8 +9,7 @@ """Extractors for https://www.imagebam.com/""" from .common import Extractor, Message -from .. import text -import re +from .. 
import text, util class ImagebamExtractor(Extractor): @@ -20,7 +19,7 @@ class ImagebamExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.path = match.group(1) + self.path = match[1] def _init(self): self.cookies.set("nsfw_inter", "1", domain="www.imagebam.com") @@ -64,22 +63,19 @@ class ImagebamGalleryExtractor(ImagebamExtractor): image.update(data) yield Message.Url, image["url"], image - @staticmethod - def metadata(page): + def metadata(self, page): return {"title": text.unescape(text.extr( page, 'id="gallery-name">', '<').strip())} def images(self, page): - findall = re.compile(r'")[2].split()) - raise exception.StopExtraction("'%s'", msg) + raise exception.AbortExtraction(f"'{msg}'") self.log.warning("HTTP redirect to %s", response.url) return response @@ -45,11 +44,11 @@ class ImagefapGalleryExtractor(ImagefapExtractor): def __init__(self, match): ImagefapExtractor.__init__(self, match) - self.gid = match.group(1) + self.gid = match[1] self.image_id = "" def items(self): - url = "{}/gallery/{}".format(self.root, self.gid) + url = f"{self.root}/gallery/{self.gid}" page = self.request(url).text data = self.get_job_metadata(page) yield Message.Directory, data @@ -81,12 +80,12 @@ class ImagefapGalleryExtractor(ImagefapExtractor): def get_images(self): """Collect image-urls and -metadata""" - url = "{}/photo/{}/".format(self.root, self.image_id) + url = f"{self.root}/photo/{self.image_id}/" params = {"gid": self.gid, "idx": 0, "partial": "true"} headers = { "Content-Type": "application/x-www-form-urlencoded", "X-Requested-With": "XMLHttpRequest", - "Referer": "{}?pgid=&gid={}&page=0".format(url, self.image_id) + "Referer": f"{url}?pgid=&gid={self.image_id}&page=0" } num = 0 @@ -116,7 +115,7 @@ class ImagefapImageExtractor(ImagefapExtractor): def __init__(self, match): ImagefapExtractor.__init__(self, match) - self.image_id = match.group(1) + self.image_id = match[1] def items(self): url, data = self.get_image() @@ -124,7 +123,7 @@ class ImagefapImageExtractor(ImagefapExtractor): yield Message.Url, url, data def get_image(self): - url = "{}/photo/{}/".format(self.root, self.image_id) + url = f"{self.root}/photo/{self.image_id}/" page = self.request(url).text url, pos = text.extract( @@ -161,7 +160,7 @@ class ImagefapFolderExtractor(ImagefapExtractor): def items(self): for gallery_id, name, folder in self.galleries(self.folder_id): - url = "{}/gallery/{}".format(self.root, gallery_id) + url = f"{self.root}/gallery/{gallery_id}" data = { "gallery_id": gallery_id, "title" : text.unescape(name), @@ -175,14 +174,13 @@ class ImagefapFolderExtractor(ImagefapExtractor): if folder_id == "-1": folder_name = "Uncategorized" if self._id: - url = "{}/usergallery.php?userid={}&folderid=-1".format( - self.root, self.user) + url = (f"{self.root}/usergallery.php" + f"?userid={self.user}&folderid=-1") else: - url = "{}/profile/{}/galleries?folderid=-1".format( - self.root, self.user) + url = f"{self.root}/profile/{self.user}/galleries?folderid=-1" else: folder_name = None - url = "{}/organizer/{}/".format(self.root, folder_id) + url = f"{self.root}/organizer/{folder_id}/" params = {"page": 0} extr = text.extract_from(self.request(url, params=params).text) @@ -222,19 +220,17 @@ class ImagefapUserExtractor(ImagefapExtractor): for folder_id in self.folders(): if folder_id == "-1": - url = "{}/profile/{}/galleries?folderid=-1".format( - self.root, self.user) + url = f"{self.root}/profile/{self.user}/galleries?folderid=-1" else: - url = "{}/organizer/{}/".format(self.root, 
folder_id) + url = f"{self.root}/organizer/{folder_id}/" yield Message.Queue, url, data def folders(self): """Return a list of folder IDs of a user""" if self.user: - url = "{}/profile/{}/galleries".format(self.root, self.user) + url = f"{self.root}/profile/{self.user}/galleries" else: - url = "{}/usergallery.php?userid={}".format( - self.root, self.user_id) + url = f"{self.root}/usergallery.php?userid={self.user_id}" response = self.request(url) self.user = response.url.split("/")[-2] diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index d6b36cb..0e5ce7e 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2023 Mike Fährmann +# Copyright 2016-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -23,12 +23,12 @@ class ImagehostImageExtractor(Extractor): _params = None _cookies = None _encoding = None + _validate = None def __init__(self, match): Extractor.__init__(self, match) - self.page_url = "http{}://{}".format( - "s" if self._https else "", match.group(1)) - self.token = match.group(2) + self.page_url = f"http{'s' if self._https else ''}://{match[1]}" + self.token = match[2] if self._params == "simple": self._params = { @@ -57,6 +57,8 @@ class ImagehostImageExtractor(Extractor): data.update(self.metadata(page)) if self._https and url.startswith("http:"): url = "https:" + url[5:] + if self._validate is not None: + data["_http_validate"] = self._validate yield Message.Directory, data yield Message.Url, url, data @@ -164,6 +166,14 @@ class ImagevenueImageExtractor(ImagehostImageExtractor): filename, pos = text.extract(page, 'alt="', '"', pos) return url, text.unescape(filename) + def _validate(self, response): + hget = response.headers.get + return not ( + hget("content-length") == "14396" and + hget("content-type") == "image/jpeg" and + hget("last-modified") == "Mon, 04 May 2020 07:19:52 GMT" + ) + class ImagetwistImageExtractor(ImagehostImageExtractor): """Extractor for single images from imagetwist.com""" @@ -200,6 +210,26 @@ class ImagetwistGalleryExtractor(ImagehostImageExtractor): yield Message.Queue, root + path, data +class ImgadultImageExtractor(ImagehostImageExtractor): + """Extractor for single images from imgadult.com""" + category = "imgadult" + _cookies = {"img_i_d": "1"} + pattern = r"(?:https?://)?((?:www\.)?imgadult\.com/img-([0-9a-f]+)\.html)" + example = "https://imgadult.com/img-0123456789abc.html" + + def get_info(self, page): + url , pos = text.extract(page, "' src='", "'") + name, pos = text.extract(page, "alt='", "'", pos) + + if name: + name, _, rhs = name.rpartition(" image hosted at ImgAdult.com") + if not name: + name = rhs + name = text.unescape(name) + + return url, name + + class ImgspiceImageExtractor(ImagehostImageExtractor): """Extractor for single images from imgspice.com""" category = "imgspice" diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py index b926cb2..e6abdeb 100644 --- a/gallery_dl/extractor/imgbb.py +++ b/gallery_dl/extractor/imgbb.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2023 Mike Fährmann +# Copyright 2019-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -112,7 +112,7 @@ class ImgbbExtractor(Extractor): params["page"] += 1 elif not seek or 
'class="pagination-next"' not in page: return - data = self.request(endpoint, method="POST", data=params).json() + data = self.request_json(endpoint, method="POST", data=params) page = data["html"] @@ -126,8 +126,8 @@ class ImgbbAlbumExtractor(ImgbbExtractor): def __init__(self, match): ImgbbExtractor.__init__(self, match) self.album_name = None - self.album_id = match.group(1) - self.sort = text.parse_query(match.group(2)).get("sort", "date_desc") + self.album_id = match[1] + self.sort = text.parse_query(match[2]).get("sort", "date_desc") self.page_url = "https://ibb.co/album/" + self.album_id def metadata(self, page): @@ -162,9 +162,9 @@ class ImgbbUserExtractor(ImgbbExtractor): def __init__(self, match): ImgbbExtractor.__init__(self, match) - self.user = match.group(1) - self.sort = text.parse_query(match.group(2)).get("sort", "date_desc") - self.page_url = "https://{}.imgbb.com/".format(self.user) + self.user = match[1] + self.sort = text.parse_query(match[2]).get("sort", "date_desc") + self.page_url = f"https://{self.user}.imgbb.com/" def metadata(self, page): user = self._extract_user(page) @@ -191,7 +191,7 @@ class ImgbbImageExtractor(ImgbbExtractor): def __init__(self, match): ImgbbExtractor.__init__(self, match) - self.image_id = match.group(1) + self.image_id = match[1] def items(self): url = "https://ibb.co/" + self.image_id diff --git a/gallery_dl/extractor/imgbox.py b/gallery_dl/extractor/imgbox.py index 7069717..5def88d 100644 --- a/gallery_dl/extractor/imgbox.py +++ b/gallery_dl/extractor/imgbox.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2023 Mike Fährmann +# Copyright 2014-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,8 +9,7 @@ """Extractors for https://imgbox.com/""" from .common import Extractor, Message, AsynchronousMixin -from .. import text, exception -import re +from .. import text, util, exception class ImgboxExtractor(Extractor): @@ -31,18 +30,15 @@ class ImgboxExtractor(Extractor): text.nameext_from_url(imgdata["filename"], imgdata) yield Message.Url, self.get_image_url(imgpage), imgdata - @staticmethod - def get_job_metadata(): + def get_job_metadata(self): """Collect metadata for extractor-job""" return {} - @staticmethod - def get_image_keys(): + def get_image_keys(self): """Return an iterable containing all image-keys""" return [] - @staticmethod - def get_image_metadata(page): + def get_image_metadata(self, page): """Collect metadata for a downloadable file""" return text.extract_all(page, ( ("num" , '   ', ' of '), @@ -50,8 +46,7 @@ class ImgboxExtractor(Extractor): ("filename" , ' title="', '"'), ))[0] - @staticmethod - def get_image_url(page): + def get_image_url(self, page): """Extract download-url""" return text.extr(page, 'property="og:image" content="', '"') @@ -67,14 +62,15 @@ class ImgboxGalleryExtractor(AsynchronousMixin, ImgboxExtractor): def __init__(self, match): ImgboxExtractor.__init__(self, match) - self.gallery_key = match.group(1) + self.gallery_key = match[1] self.image_keys = [] def get_job_metadata(self): page = self.request(self.root + "/g/" + self.gallery_key).text if "The specified gallery could not be found." in page: raise exception.NotFoundError("gallery") - self.image_keys = re.findall(r'', page)
+        self.image_keys = util.re(
+            r'<a href=').findall(page)
 
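# [editor's note] Related cleanup visible in this hunk and elsewhere in
# the patch: former @staticmethod helpers (metadata, get_image_metadata,
# _decode, _join_title, ...) become plain methods taking `self`. A
# hedged guess at the motivation: every overridable hook can then be
# called uniformly as a bound method. In miniature:
class _Before:
    @staticmethod
    def get_image_url(page):
        return page.strip()

class _After:
    def get_image_url(self, page):  # uniform self-taking signature
        return page.strip()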
         title = text.extr(page, ", "

") title, _, count = title.rpartition(" - ") @@ -97,14 +93,13 @@ class ImgboxImageExtractor(ImgboxExtractor): def __init__(self, match): ImgboxExtractor.__init__(self, match) - self.image_key = match.group(1) + self.image_key = match[1] def get_image_keys(self): return (self.image_key,) - @staticmethod - def get_image_metadata(page): - data = ImgboxExtractor.get_image_metadata(page) + def get_image_metadata(self, page): + data = ImgboxExtractor.get_image_metadata(self, page) if not data["filename"]: raise exception.NotFoundError("image") return data diff --git a/gallery_dl/extractor/imgth.py b/gallery_dl/extractor/imgth.py index 3aa7922..7e5e6cf 100644 --- a/gallery_dl/extractor/imgth.py +++ b/gallery_dl/extractor/imgth.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2023 Mike Fährmann +# Copyright 2015-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -20,8 +20,8 @@ class ImgthGalleryExtractor(GalleryExtractor): example = "https://imgth.com/gallery/123/TITLE" def __init__(self, match): - self.gallery_id = gid = match.group(1) - url = "{}/gallery/{}/g/".format(self.root, gid) + self.gallery_id = gid = match[1] + url = f"{self.root}/gallery/{gid}/g/" GalleryExtractor.__init__(self, match, url) def metadata(self, page): @@ -45,12 +45,11 @@ class ImgthGalleryExtractor(GalleryExtractor): thumbs = text.extr(page, '
<ul class="thumbnails">', '</ul>
') for url in text.extract_iter(thumbs, '' not in page: return pnum += 1 - url = "{}/gallery/{}/g/page/{}".format( - self.root, self.gallery_id, pnum) + url = f"{self.root}/gallery/{self.gallery_id}/g/page/{pnum}" page = self.request(url).text diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 20f8ea4..1ac76e0 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2023 Mike Fährmann +# Copyright 2015-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -21,7 +21,7 @@ class ImgurExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.key = match.group(1) + self.key = match[1] def _init(self): self.api = ImgurAPI(self) @@ -36,8 +36,8 @@ class ImgurExtractor(Extractor): elif image["is_animated"] and self.mp4 and image["ext"] == "gif": image["ext"] = "mp4" - image["url"] = url = "https://i.imgur.com/{}.{}".format( - image["id"], image["ext"]) + image["url"] = url = \ + f"https://i.imgur.com/{image['id']}.{image['ext']}" image["date"] = text.parse_datetime(image["created_at"]) image["_http_validate"] = self._validate text.nameext_from_url(url, image) @@ -131,10 +131,10 @@ class ImgurGalleryExtractor(ImgurExtractor): def items(self): if self.api.gallery(self.key)["is_album"]: - url = "{}/a/{}".format(self.root, self.key) + url = f"{self.root}/a/{self.key}" extr = ImgurAlbumExtractor else: - url = "{}/{}".format(self.root, self.key) + url = f"{self.root}/{self.key}" extr = ImgurImageExtractor yield Message.Queue, url, {"_extractor": extr} @@ -168,7 +168,7 @@ class ImgurFavoriteFolderExtractor(ImgurExtractor): def __init__(self, match): ImgurExtractor.__init__(self, match) - self.folder_id = match.group(2) + self.folder_id = match[2] def items(self): return self._items_queue(self.api.account_favorites_folder( @@ -234,16 +234,15 @@ class ImgurAPI(): self.headers = {"Authorization": "Client-ID " + self.client_id} def account_submissions(self, account): - endpoint = "/3/account/{}/submissions".format(account) + endpoint = f"/3/account/{account}/submissions" return self._pagination(endpoint) def account_favorites(self, account): - endpoint = "/3/account/{}/gallery_favorites".format(account) + endpoint = f"/3/account/{account}/gallery_favorites" return self._pagination(endpoint) def account_favorites_folder(self, account, folder_id): - endpoint = "/3/account/{}/folders/{}/favorites".format( - account, folder_id) + endpoint = f"/3/account/{account}/folders/{folder_id}/favorites" return self._pagination_v2(endpoint) def accounts_me_allposts(self): @@ -270,11 +269,11 @@ class ImgurAPI(): return self._pagination(endpoint, params) def gallery_subreddit(self, subreddit): - endpoint = "/3/gallery/r/{}".format(subreddit) + endpoint = f"/3/gallery/r/{subreddit}" return self._pagination(endpoint) def gallery_tag(self, tag): - endpoint = "/3/gallery/t/{}".format(tag) + endpoint = f"/3/gallery/t/{tag}" return self._pagination(endpoint, key="items") def image(self, image_hash): @@ -294,10 +293,9 @@ class ImgurAPI(): def _call(self, endpoint, params=None, headers=None): while True: try: - return self.extractor.request( + return self.extractor.request_json( "https://api.imgur.com" + endpoint, - params=params, headers=(headers or self.headers), - ).json() + params=params, headers=(headers or self.headers)) except exception.HttpError as exc: if exc.status not in (403, 
429) or \ b"capacity" not in exc.response.content: @@ -308,7 +306,7 @@ class ImgurAPI(): num = 0 while True: - data = self._call("{}/{}".format(endpoint, num), params)["data"] + data = self._call(f"{endpoint}/{num}", params)["data"] if key: data = data[key] if not data: diff --git a/gallery_dl/extractor/imhentai.py b/gallery_dl/extractor/imhentai.py index 1b0fba3..5ad1c30 100644 --- a/gallery_dl/extractor/imhentai.py +++ b/gallery_dl/extractor/imhentai.py @@ -38,7 +38,7 @@ class ImhentaiExtractor(BaseExtractor): yield Message.Queue, base + gallery_id, data prev = gallery_id - href = text.rextract(page, "class='page-link' href='", "'")[0] + href = text.rextr(page, "class='page-link' href='", "'") if not href or href == "#": return if href[0] == "/": @@ -85,7 +85,7 @@ class ImhentaiGalleryExtractor(ImhentaiExtractor, GalleryExtractor): def __init__(self, match): ImhentaiExtractor.__init__(self, match) self.gallery_id = self.groups[-1] - self.gallery_url = "{}/gallery/{}/".format(self.root, self.gallery_id) + self.page_url = f"{self.root}/gallery/{self.gallery_id}/" def metadata(self, page): extr = text.extract_from(page) diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py index 47e071a..45ae52e 100644 --- a/gallery_dl/extractor/inkbunny.py +++ b/gallery_dl/extractor/inkbunny.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020-2023 Mike Fährmann +# Copyright 2020-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -109,12 +109,11 @@ class InkbunnyPoolExtractor(InkbunnyExtractor): def __init__(self, match): InkbunnyExtractor.__init__(self, match) - pid = match.group(1) - if pid: + if pid := match[1]: self.pool_id = pid self.orderby = "pool_order" else: - params = text.parse_query(match.group(2)) + params = text.parse_query(match[2]) self.pool_id = params.get("pool_id") self.orderby = params.get("orderby", "pool_order") @@ -142,19 +141,18 @@ class InkbunnyFavoriteExtractor(InkbunnyExtractor): def __init__(self, match): InkbunnyExtractor.__init__(self, match) - uid = match.group(1) - if uid: + if uid := match[1]: self.user_id = uid self.orderby = self.config("orderby", "fav_datetime") else: - params = text.parse_query(match.group(2)) + params = text.parse_query(match[2]) self.user_id = params.get("user_id") self.orderby = params.get("orderby", "fav_datetime") def metadata(self): # Lookup fav user ID as username - url = "{}/userfavorites_process.php?favs_user_id={}".format( - self.root, self.user_id) + url = (f"{self.root}/userfavorites_process.php" + f"?favs_user_id={self.user_id}") page = self.request(url).text user_link = text.extr(page, '

05}" + p2[4:] - return [(ufmt.format(num), None) for num in range(1, count + 1)] + p2 = p2[4:] + return [(f"{p1}/image{i:>05}{p2}", None) for i in range(1, count + 1)] def images_v2(self, page): + base = f"{self.root}/showimage/" results = [] while True: for path in text.extract_iter( page, ' class="picbox">