| author    | 2025-01-28 19:12:16 -0500 |
|-----------|---------------------------|
| committer | 2025-01-28 19:12:16 -0500 |
| commit    | 5a7d8217a6edc66e3cf25ca0eee6614a10fa866c (patch) |
| tree      | a941825bf5fcf706f23b49e536edb9a2b26d5b6c /gallery_dl/extractor |
| parent    | e8f1b0d968a07cba884462e10718628394d1bae5 (diff) |
| parent    | a26df18796ff4e506b16bf32fcec9336233b9e2e (diff) |
Update upstream source from tag 'upstream/1.28.5'
Update to upstream version '1.28.5'
with Debian dir a2e4b8ba663c03c37256ad2b059b382999e473bc
Diffstat (limited to 'gallery_dl/extractor')
40 files changed, 606 insertions, 536 deletions
diff --git a/gallery_dl/extractor/4archive.py b/gallery_dl/extractor/4archive.py
index 948a605..d198369 100644
--- a/gallery_dl/extractor/4archive.py
+++ b/gallery_dl/extractor/4archive.py
@@ -64,7 +64,7 @@ class _4archiveThreadExtractor(Extractor):
         data = {
             "name": extr('class="name">', "</span>"),
             "date": text.parse_datetime(
-                extr('class="dateTime postNum">', "<").strip(),
+                extr('class="dateTime postNum" >', "<").strip(),
                 "%Y-%m-%d %H:%M:%S"),
             "no"  : text.parse_int(extr('href="#p', '"')),
         }
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index b582c99..fc8d7b2 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -39,7 +39,6 @@ modules = [
     "chevereto",
     "cien",
     "civitai",
-    "cohost",
     "comicvine",
     "cyberdrop",
     "danbooru",
@@ -52,7 +51,6 @@ modules = [
     "exhentai",
     "facebook",
     "fanbox",
-    "fanleaks",
     "fantia",
     "fapello",
     "fapachi",
@@ -116,6 +114,7 @@ modules = [
     "myportfolio",
     "naver",
     "naverwebtoon",
+    "nekohouse",
     "newgrounds",
     "nhentai",
     "nijie",
@@ -196,6 +195,7 @@ modules = [
     "wikiart",
     "wikifeet",
     "wikimedia",
+    "xfolio",
     "xhamster",
     "xvideos",
     "yiffverse",
diff --git a/gallery_dl/extractor/adultempire.py b/gallery_dl/extractor/adultempire.py
index 1617414..c891b17 100644
--- a/gallery_dl/extractor/adultempire.py
+++ b/gallery_dl/extractor/adultempire.py
@@ -24,6 +24,9 @@ class AdultempireGalleryExtractor(GalleryExtractor):
         GalleryExtractor.__init__(self, match)
         self.gallery_id = match.group(2)
 
+    def _init(self):
+        self.cookies.set("ageConfirmed", "true", domain="www.adultempire.com")
+
     def metadata(self, page):
         extr = text.extract_from(page, page.index('<div id="content">'))
         return {
diff --git a/gallery_dl/extractor/architizer.py b/gallery_dl/extractor/architizer.py
index 8064e78..0268224 100644
--- a/gallery_dl/extractor/architizer.py
+++ b/gallery_dl/extractor/architizer.py
@@ -32,10 +32,10 @@ class ArchitizerProjectExtractor(GalleryExtractor):
         extr('id="Pages"', "")
 
         return {
-            "title"    : extr('data-name="', '"'),
-            "slug"     : extr('data-slug="', '"'),
-            "gid"      : extr('data-gid="', '"').rpartition(".")[2],
-            "firm"     : extr('data-firm-leaders-str="', '"'),
+            "title"    : extr("data-name='", "'"),
+            "slug"     : extr("data-slug='", "'"),
+            "gid"      : extr("data-gid='", "'").rpartition(".")[2],
+            "firm"     : extr("data-firm-leaders-str='", "'"),
             "location" : extr("<h2>", "<").strip(),
             "type"     : text.unescape(text.remove_html(extr(
                 '<div class="title">Type</div>', '<br'))),
@@ -54,7 +54,7 @@ class ArchitizerProjectExtractor(GalleryExtractor):
         return [
             (url, None)
             for url in text.extract_iter(
-                page, 'property="og:image:secure_url" content="', "?")
+                page, "property='og:image:secure_url' content='", "?")
         ]
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index ce1a78d..f448710 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -11,8 +11,6 @@
 from .common import Extractor, Message
 from .. import text, util, exception
 import itertools
-import random
-import string
 
 
 class ArtstationExtractor(Extractor):
@@ -29,6 +27,9 @@ class ArtstationExtractor(Extractor):
         Extractor.__init__(self, match)
         self.user = match.group(1) or match.group(2)
 
+    def _init(self):
+        self.session.headers["Cache-Control"] = "max-age=0"
+
     def items(self):
         videos = self.config("videos", True)
         previews = self.config("previews", False)
@@ -172,7 +173,7 @@ class ArtstationExtractor(Extractor):
         ).json()["public_csrf_token"]
 
     @staticmethod
-    def _no_cache(url, alphabet=(string.digits + string.ascii_letters)):
+    def _no_cache(url):
         """Cause a cache miss to prevent Cloudflare 'optimizations'
 
         Cloudflare's 'Polish' optimization strips image metadata and may even
@@ -184,10 +185,9 @@ class ArtstationExtractor(Extractor):
         https://github.com/r888888888/danbooru/issues/3528
         https://danbooru.donmai.us/forum_topics/14952
         """
-        param = "gallerydl_no_cache=" + util.bencode(
-            random.getrandbits(64), alphabet)
         sep = "&" if "?" in url else "?"
-        return url + sep + param
+        token = util.generate_token(8)
+        return url + sep + token[:4] + "=" + token[4:]
 
 
 class ArtstationUserExtractor(ArtstationExtractor):
diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py
index 77c40ef..4d192a4 100644
--- a/gallery_dl/extractor/batoto.py
+++ b/gallery_dl/extractor/batoto.py
@@ -67,6 +67,7 @@ class BatotoBase():
 
 class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
     """Extractor for batoto manga chapters"""
+    archive_fmt = "{chapter_id}_{page}"
     pattern = BASE_PATTERN + r"/(?:title/[^/?#]+|chapter)/(\d+)"
     example = "https://xbato.org/title/12345-MANGA/54321"
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index e1ee50d..25e9fd5 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -70,6 +70,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
         self.root = "https://" + domain
 
     def request(self, url, **kwargs):
+        kwargs["encoding"] = "utf-8"
         kwargs["allow_redirects"] = False
 
         while True:
@@ -114,8 +115,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
 
     def fetch_album(self, album_id):
         # album metadata
-        page = self.request(
-            self.root + "/a/" + album_id, encoding="utf-8").text
+        page = self.request(self.root + "/a/" + album_id).text
         title = text.unescape(text.unescape(text.extr(
             page, 'property="og:title" content="', '"')))
@@ -140,7 +140,8 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
             file = self._extract_file(url)
 
             info = text.split_html(item)
-            file["name"] = info[-3]
+            if not file["name"]:
+                file["name"] = info[-3]
             file["size"] = info[-2]
             file["date"] = text.parse_datetime(
                 info[-1], "%H:%M:%S %d/%m/%Y")
@@ -157,6 +158,8 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
         page = response.text
         file_url = (text.extr(page, '<source src="', '"') or
                     text.extr(page, '<img src="', '"'))
+        file_name = (text.extr(page, 'property="og:title" content="', '"') or
+                     text.extr(page, "<title>", " | Bunkr<"))
 
         if not file_url:
             webpage_url = text.unescape(text.rextract(
@@ -166,6 +169,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
 
         return {
             "file"          : text.unescape(file_url),
+            "name"          : text.unescape(file_name),
            "_http_headers" : {"Referer": response.url},
             "_http_validate": self._validate,
         }
diff --git a/gallery_dl/extractor/cohost.py b/gallery_dl/extractor/cohost.py
deleted file mode 100644
index 6a43224..0000000
--- a/gallery_dl/extractor/cohost.py
+++ /dev/null
@@ -1,250 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2024 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://cohost.org/"""
-
-from .common import Extractor, Message
-from .. import text, util
-
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?cohost\.org"
-
-
-class CohostExtractor(Extractor):
-    """Base class for cohost extractors"""
-    category = "cohost"
-    root = "https://cohost.org"
-    directory_fmt = ("{category}", "{postingProject[handle]}")
-    filename_fmt = ("{postId}{headline:?_//[b:200]}{num:?_//}.{extension}")
-    archive_fmt = "{postId}_{num}"
-
-    def _init(self):
-        self.replies = self.config("replies", True)
-        self.pinned = self.config("pinned", False)
-        self.shares = self.config("shares", False)
-        self.asks = self.config("asks", True)
-
-        self.avatar = self.config("avatar", False)
-        if self.avatar:
-            self._urls_avatar = {None, ""}
-
-        self.background = self.config("background", False)
-        if self.background:
-            self._urls_background = {None, ""}
-
-    def items(self):
-        for post in self.posts():
-            reason = post.get("limitedVisibilityReason")
-            if reason and reason != "none":
-                if reason == "log-in-first":
-                    reason = ("This page's posts are visible only to users "
-                              "who are logged in.")
-                self.log.warning('%s: "%s"', post["postId"], reason)
-
-            files = self._extract_files(post)
-            post["count"] = len(files)
-            post["date"] = text.parse_datetime(
-                post["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
-
-            yield Message.Directory, post
-
-            project = post["postingProject"]
-            if self.avatar:
-                url = project.get("avatarURL")
-                if url not in self._urls_avatar:
-                    self._urls_avatar.add(url)
-                    p = post.copy()
-                    p["postId"] = p["kind"] = "avatar"
-                    p["headline"] = p["num"] = ""
-                    yield Message.Url, url, text.nameext_from_url(url, p)
-
-            if self.background:
-                url = project.get("headerURL")
-                if url not in self._urls_background:
-                    self._urls_background.add(url)
-                    p = post.copy()
-                    p["postId"] = p["kind"] = "background"
-                    p["headline"] = p["num"] = ""
-                    yield Message.Url, url, text.nameext_from_url(url, p)
-
-            for post["num"], file in enumerate(files, 1):
-                url = file["fileURL"]
-                post.update(file)
-                text.nameext_from_url(url, post)
-                yield Message.Url, url, post
-
-    def posts(self):
-        return ()
-
-    def _request_api(self, endpoint, input):
-        url = "{}/api/v1/trpc/{}".format(self.root, endpoint)
-        params = {"batch": "1", "input": util.json_dumps({"0": input})}
-        headers = {"content-type": "application/json"}
-
-        data = self.request(url, params=params, headers=headers).json()
-        return data[0]["result"]["data"]
-
-    def _extract_files(self, post):
-        files = []
-
-        self._extract_blocks(post, files)
-        if self.shares and post.get("shareTree"):
-            for share in post["shareTree"]:
-                self._extract_blocks(share, files, share)
-        del post["shareTree"]
-
-        return files
-
-    def _extract_blocks(self, post, files, shared=None):
-        post["content"] = content = []
-
-        for block in post.pop("blocks") or ():
-            try:
-                type = block["type"]
-                if type == "attachment":
-                    file = block["attachment"].copy()
-                    file["shared"] = shared
-                    files.append(file)
-                elif type == "attachment-row":
-                    for att in block["attachments"]:
-                        file = att["attachment"].copy()
-                        file["shared"] = shared
-                        files.append(file)
-                elif type == "markdown":
-                    content.append(block["markdown"]["content"])
-                elif type == "ask":
-                    post["ask"] = block["ask"]
-                else:
-                    self.log.debug("%s: Unsupported block type '%s'",
-                                   post["postId"], type)
-            except Exception as exc:
-                self.log.debug("%s: %s", exc.__class__.__name__, exc)
-
-
-class CohostUserExtractor(CohostExtractor):
-    """Extractor for media from a cohost user"""
-    subcategory = "user"
-    pattern = BASE_PATTERN + r"/([^/?#]+)/?(?:$|\?|#)"
-    example = "https://cohost.org/USER"
-
-    def posts(self):
-        empty = 0
-        params = {
-            "projectHandle": self.groups[0],
-            "page": 0,
-            "options": {
-                "pinnedPostsAtTop"    : True if self.pinned else False,
-                "hideReplies"         : not self.replies,
-                "hideShares"          : not self.shares,
-                "hideAsks"            : not self.asks,
-                "viewingOnProjectPage": True,
-            },
-        }
-
-        while True:
-            data = self._request_api("posts.profilePosts", params)
-
-            posts = data["posts"]
-            if posts:
-                empty = 0
-                yield from posts
-            else:
-                empty += 1
-
-            pagination = data["pagination"]
-            if not pagination.get("morePagesForward"):
-                return
-            if empty >= 3:
-                return self.log.debug("Empty API results")
-            params["page"] = pagination["nextPage"]
-
-
-class CohostPostExtractor(CohostExtractor):
-    """Extractor for media from a single cohost post"""
-    subcategory = "post"
-    pattern = BASE_PATTERN + r"/([^/?#]+)/post/(\d+)"
-    example = "https://cohost.org/USER/post/12345"
-
-    def posts(self):
-        endpoint = "posts.singlePost"
-        params = {
-            "handle": self.groups[0],
-            "postId": int(self.groups[1]),
-        }
-
-        data = self._request_api(endpoint, params)
-        post = data["post"]
-
-        try:
-            post["comments"] = data["comments"][self.groups[1]]
-        except LookupError:
-            post["comments"] = ()
-
-        return (post,)
-
-
-class CohostTagExtractor(CohostExtractor):
-    """Extractor for tagged posts"""
-    subcategory = "tag"
-    pattern = BASE_PATTERN + r"/([^/?#]+)/tagged/([^/?#]+)(?:\?([^#]+))?"
-    example = "https://cohost.org/USER/tagged/TAG"
-
-    def posts(self):
-        user, tag, query = self.groups
-        url = "{}/{}/tagged/{}".format(self.root, user, tag)
-        params = text.parse_query(query)
-        post_feed_key = ("tagged-post-feed" if user == "rc" else
-                         "project-tagged-post-feed")
-
-        while True:
-            page = self.request(url, params=params).text
-            data = util.json_loads(text.extr(
-                page, 'id="__COHOST_LOADER_STATE__">', '</script>'))
-
-            try:
-                feed = data[post_feed_key]
-            except KeyError:
-                feed = data.popitem()[1]
-
-            yield from feed["posts"]
-
-            pagination = feed["paginationMode"]
-            if not pagination.get("morePagesForward"):
-                return
-            params["refTimestamp"] = pagination["refTimestamp"]
-            params["skipPosts"] = \
-                pagination["currentSkip"] + pagination["idealPageStride"]
-
-
-class CohostLikesExtractor(CohostExtractor):
-    """Extractor for liked posts"""
-    subcategory = "likes"
-    pattern = BASE_PATTERN + r"/rc/liked-posts"
-    example = "https://cohost.org/rc/liked-posts"
-
-    def posts(self):
-        url = "{}/rc/liked-posts".format(self.root)
-        params = {}
-
-        while True:
-            page = self.request(url, params=params).text
-            data = util.json_loads(text.extr(
-                page, 'id="__COHOST_LOADER_STATE__">', '</script>'))
-
-            try:
-                feed = data["liked-posts-feed"]
-            except KeyError:
-                feed = data.popitem()[1]
-
-            yield from feed["posts"]
-
-            pagination = feed["paginationMode"]
-            if not pagination.get("morePagesForward"):
-                return
-            params["refTimestamp"] = pagination["refTimestamp"]
-            params["skipPosts"] = \
-                pagination["currentSkip"] + pagination["idealPageStride"]
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 37b6747..d0a9397 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -32,7 +32,7 @@ class DanbooruExtractor(BaseExtractor):
         if isinstance(threshold, int):
             self.threshold = 1 if threshold < 1 else threshold
         else:
-            self.threshold = self.per_page
+            self.threshold = self.per_page - 20
 
         username, api_key = self._get_auth_info()
         if username:
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 8172f62..59b2d6d 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -822,7 +822,7 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\
                 username, folder["gallery_id"], public=False):
             cache[dev["deviationid"]] = dev if has_access else None
 
-        return cache[deviation["deviationid"]]
+        return cache.get(deviation["deviationid"])
 
     def _unwatch_premium(self):
         for username in self.unwatch:
diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py
index 33e6ba8..eddcb12 100644
--- a/gallery_dl/extractor/e621.py
+++ b/gallery_dl/extractor/e621.py
@@ -8,7 +8,7 @@
 
 """Extractors for https://e621.net/ and other e621 instances"""
 
-from .common import Message
+from .common import Extractor, Message
 from . import danbooru
 from ..cache import memcache
 from .. import text, util
@@ -156,3 +156,20 @@ class E621FavoriteExtractor(E621Extractor):
 
     def posts(self):
         return self._pagination("/favorites.json", self.query)
+
+
+class E621FrontendExtractor(Extractor):
+    """Extractor for alternative e621 frontends"""
+    basecategory = "E621"
+    category = "e621"
+    subcategory = "frontend"
+    pattern = r"(?:https?://)?e621\.(?:cc/\?tags|anthro\.fr/\?q)=([^&#]*)"
+    example = "https://e621.cc/?tags=TAG"
+
+    def initialize(self):
+        pass
+
+    def items(self):
+        url = "https://e621.net/posts?tags=" + self.groups[0]
+        data = {"_extractor": E621TagExtractor}
+        yield Message.Queue, url, data
diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py
index 2f3fdbf..1ec6adc 100644
--- a/gallery_dl/extractor/facebook.py
+++ b/gallery_dl/extractor/facebook.py
@@ -238,8 +238,9 @@ class FacebookExtractor(Extractor):
 
         return res
 
-    def extract_set(self, first_photo_id, set_id):
-        all_photo_ids = [first_photo_id]
+    def extract_set(self, set_data):
+        set_id = set_data["set_id"]
+        all_photo_ids = [set_data["first_photo_id"]]
 
         retries = 0
         i = 0
@@ -252,7 +253,6 @@ class FacebookExtractor(Extractor):
                 photo_page = self.photo_page_request_wrapper(photo_url).text
 
                 photo = self.parse_photo_page(photo_page)
-                photo["set_id"] = set_id
                 photo["num"] = i + 1
 
                 if self.author_followups:
@@ -281,9 +281,11 @@ class FacebookExtractor(Extractor):
                         retries = 0
             else:
                 retries = 0
+                photo.update(set_data)
+
+                yield Message.Directory, photo
                 yield Message.Url, photo["url"], photo
 
-            if photo["next_photo_id"] == "":
+            if not photo["next_photo_id"]:
                 self.log.debug(
                     "Can't find next image in the set. "
                     "Extraction is over."
@@ -322,15 +324,11 @@ class FacebookSetExtractor(FacebookExtractor):
         set_url = self.set_url_fmt.format(set_id=set_id)
         set_page = self.request(set_url).text
+        set_data = self.parse_set_page(set_page)
+        if self.groups[2]:
+            set_data["first_photo_id"] = self.groups[2]
 
-        directory = self.parse_set_page(set_page)
-
-        yield Message.Directory, directory
-
-        yield from self.extract_set(
-            self.groups[2] or directory["first_photo_id"],
-            directory["set_id"]
-        )
+        return self.extract_set(set_data)
 
 
 class FacebookPhotoExtractor(FacebookExtractor):
@@ -436,13 +434,8 @@ class FacebookProfileExtractor(FacebookExtractor):
         if set_id:
             set_url = self.set_url_fmt.format(set_id=set_id)
             set_page = self.request(set_url).text
+            set_data = self.parse_set_page(set_page)
+            return self.extract_set(set_data)
 
-            directory = self.parse_set_page(set_page)
-
-            yield Message.Directory, directory
-
-            yield from self.extract_set(
-                directory["first_photo_id"], directory["set_id"]
-            )
-        else:
-            self.log.debug("Profile photos set ID not found.")
+        self.log.debug("Profile photos set ID not found.")
+        return iter(())
diff --git a/gallery_dl/extractor/fanleaks.py b/gallery_dl/extractor/fanleaks.py
deleted file mode 100644
index 886e893..0000000
--- a/gallery_dl/extractor/fanleaks.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://fanleaks.club/"""
-
-from .common import Extractor, Message
-from .. import text
-
-
-class FanleaksExtractor(Extractor):
-    """Base class for Fanleaks extractors"""
-    category = "fanleaks"
-    directory_fmt = ("{category}", "{model}")
-    filename_fmt = "{model_id}_{id}.{extension}"
-    archive_fmt = "{model_id}_{id}"
-    root = "https://fanleaks.club"
-
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.model_id = match.group(1)
-
-    def extract_post(self, url):
-        extr = text.extract_from(self.request(url, notfound="post").text)
-        data = {
-            "model_id": self.model_id,
-            "model"   : text.unescape(extr('text-lg">', "</a>")),
-            "id"      : text.parse_int(self.id),
-            "type"    : extr('type="', '"')[:5] or "photo",
-        }
-        url = extr('src="', '"')
-        yield Message.Directory, data
-        yield Message.Url, url, text.nameext_from_url(url, data)
-
-
-class FanleaksPostExtractor(FanleaksExtractor):
-    """Extractor for individual posts on fanleaks.club"""
-    subcategory = "post"
-    pattern = r"(?:https?://)?(?:www\.)?fanleaks\.club/([^/?#]+)/(\d+)"
-    example = "https://fanleaks.club/MODEL/12345"
-
-    def __init__(self, match):
-        FanleaksExtractor.__init__(self, match)
-        self.id = match.group(2)
-
-    def items(self):
-        url = "{}/{}/{}".format(self.root, self.model_id, self.id)
-        return self.extract_post(url)
-
-
-class FanleaksModelExtractor(FanleaksExtractor):
-    """Extractor for all posts from a fanleaks model"""
-    subcategory = "model"
-    pattern = (r"(?:https?://)?(?:www\.)?fanleaks\.club"
-               r"/(?!latest/?$)([^/?#]+)/?$")
-    example = "https://fanleaks.club/MODEL"
-
-    def items(self):
-        page_num = 1
-        page = self.request(
-            self.root + "/" + self.model_id, notfound="model").text
-        data = {
-            "model_id": self.model_id,
-            "model"   : text.unescape(text.extr(page, 'mt-4">', "</h1>")),
-            "type"    : "photo",
-        }
-        page_url = text.extr(page, "url: '", "'")
-        while True:
-            page = self.request("{}{}".format(page_url, page_num)).text
-            if not page:
-                return
-
-            for item in text.extract_iter(page, '<a href="/', "</a>"):
-                self.id = id = text.extr(item, "/", '"')
-                if "/icon-play.svg" in item:
-                    url = "{}/{}/{}".format(self.root, self.model_id, id)
-                    yield from self.extract_post(url)
-                    continue
-
-                data["id"] = text.parse_int(id)
-                url = text.extr(item, 'src="', '"').replace(
-                    "/thumbs/", "/", 1)
-                yield Message.Directory, data
-                yield Message.Url, url, text.nameext_from_url(url, data)
-            page_num += 1
diff --git a/gallery_dl/extractor/fapachi.py b/gallery_dl/extractor/fapachi.py
index 80478ca..43627e2 100644
--- a/gallery_dl/extractor/fapachi.py
+++ b/gallery_dl/extractor/fapachi.py
@@ -33,7 +33,8 @@ class FapachiPostExtractor(Extractor):
         }
         page = self.request("{}/{}/media/{}".format(
             self.root, self.user, self.id)).text
-        url = self.root + text.extr(page, 'd-block" src="', '"')
+        url = self.root + text.extract(
+            page, 'data-src="', '"', page.index('class="media-img'))[0]
         yield Message.Directory, data
         yield Message.Url, url, text.nameext_from_url(url, data)
diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py
index c939a3c..f15aab7 100644
--- a/gallery_dl/extractor/hiperdex.py
+++ b/gallery_dl/extractor/hiperdex.py
@@ -6,7 +6,7 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extractors for https://hipertoon.com/"""
+"""Extractors for https://hiperdex.com/"""
 
 from .common import ChapterExtractor, MangaExtractor
 from .. import text
@@ -20,7 +20,7 @@ BASE_PATTERN = (r"((?:https?://)?(?:www\.)?"
 class HiperdexBase():
     """Base class for hiperdex extractors"""
     category = "hiperdex"
-    root = "https://hipertoon.com"
+    root = "https://hiperdex.com"
 
     @memcache(keyarg=1)
     def manga_data(self, manga, page=None):
@@ -49,7 +49,7 @@ class HiperdexBase():
             "status"     : extr(
                 'class="summary-content">', '<').strip(),
             "description": text.remove_html(text.unescape(extr(
-                "Summary </h5>", "</div>"))),
+                '<div class="description-summary">', "</div>"))),
             "language": "English",
             "lang"    : "en",
         }
@@ -69,7 +69,7 @@ class HiperdexBase():
 class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
     """Extractor for hiperdex manga chapters"""
     pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+)/([^/?#]+))"
-    example = "https://hipertoon.com/manga/MANGA/CHAPTER/"
+    example = "https://hiperdex.com/manga/MANGA/CHAPTER/"
 
     def __init__(self, match):
         root, path, self.manga, self.chapter = match.groups()
@@ -91,7 +91,7 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):
     """Extractor for hiperdex manga"""
     chapterclass = HiperdexChapterExtractor
     pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+))/?$"
-    example = "https://hipertoon.com/manga/MANGA/"
+    example = "https://hiperdex.com/manga/MANGA/"
 
     def __init__(self, match):
         root, path, self.manga = match.groups()
@@ -127,7 +127,7 @@ class HiperdexArtistExtractor(HiperdexBase, MangaExtractor):
     chapterclass = HiperdexMangaExtractor
     reverse = False
     pattern = BASE_PATTERN + r"(/manga-a(?:rtist|uthor)/(?:[^/?#]+))"
-    example = "https://hipertoon.com/manga-artist/NAME/"
+    example = "https://hiperdex.com/manga-artist/NAME/"
 
     def __init__(self, match):
         self.root = text.ensure_http_scheme(match.group(1))
diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py
index 5f1e0f4..d6b36cb 100644
--- a/gallery_dl/extractor/imagehosts.py
+++ b/gallery_dl/extractor/imagehosts.py
@@ -286,6 +286,34 @@ class TurboimagehostImageExtractor(ImagehostImageExtractor):
         return url, url
 
 
+class TurboimagehostGalleryExtractor(ImagehostImageExtractor):
+    """Extractor for image galleries from turboimagehost.com"""
+    category = "turboimagehost"
+    subcategory = "gallery"
+    pattern = (r"(?:https?://)?((?:www\.)?turboimagehost\.com"
+               r"/album/(\d+)/([^/?#]*))")
+    example = "https://www.turboimagehost.com/album/12345/GALLERY_NAME"
+
+    def items(self):
+        data = {"_extractor": TurboimagehostImageExtractor}
+        params = {"p": 1}
+
+        while True:
+            page = self.request(self.page_url, params=params).text
+
+            if params["p"] == 1 and \
+                    "Requested gallery don`t exist on our website." in page:
+                raise exception.NotFoundError("gallery")
+
+            thumb_url = None
+            for thumb_url in text.extract_iter(page, '"><a href="', '"'):
+                yield Message.Queue, thumb_url, data
+            if thumb_url is None:
+                return
+
+            params["p"] += 1
+
+
 class ViprImageExtractor(ImagehostImageExtractor):
     """Extractor for single images from vipr.im"""
     category = "vipr"
diff --git a/gallery_dl/extractor/issuu.py b/gallery_dl/extractor/issuu.py
index 54c6539..b900113 100644
--- a/gallery_dl/extractor/issuu.py
+++ b/gallery_dl/extractor/issuu.py
@@ -54,26 +54,30 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
 class IssuuUserExtractor(IssuuBase, Extractor):
     """Extractor for all publications of a user/publisher"""
     subcategory = "user"
-    pattern = r"(?:https?://)?issuu\.com/([^/?#]+)/?$"
+    pattern = r"(?:https?://)?issuu\.com/([^/?#]+)(?:/(\d*))?$"
     example = "https://issuu.com/USER"
 
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.user = match.group(1)
-
     def items(self):
-        url = "{}/call/profile/v1/documents/{}".format(self.root, self.user)
-        params = {"offset": 0, "limit": "25"}
+        user, pnum = self.groups
+        base = self.root + "/" + user
+        pnum = text.parse_int(pnum, 1)
 
         while True:
-            data = self.request(url, params=params).json()
+            url = base + "/" + str(pnum) if pnum > 1 else base
+            try:
+                html = self.request(url).text
+                data = util.json_loads(text.unescape(text.extr(
+                    html, '</main></div><script data-json="', '" id="')))
+                docs = data["docs"]
+            except Exception as exc:
+                self.log.debug("", exc_info=exc)
+                return
 
-            for publication in data["items"]:
-                publication["url"] = "{}/{}/docs/{}".format(
-                    self.root, self.user, publication["uri"])
+            for publication in docs:
+                url = self.root + "/" + publication["uri"]
                 publication["_extractor"] = IssuuPublicationExtractor
-                yield Message.Queue, publication["url"], publication
+                yield Message.Queue, url, publication
 
-            if not data["hasMore"]:
+            if len(docs) < 48:
                 return
-            params["offset"] += data["limit"]
+            pnum += 1
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 66bbab5..788b5d9 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -54,26 +54,19 @@ class KemonopartyExtractor(Extractor):
             sort_keys=True, separators=(",", ":")).encode
 
     def items(self):
-        service = self.groups[2]
-        creator_id = self.groups[3]
-
         find_hash = re.compile(HASH_PATTERN).match
         generators = self._build_file_generators(self.config("files"))
         announcements = True if self.config("announcements") else None
         comments = True if self.config("comments") else False
         duplicates = True if self.config("duplicates") else False
         dms = True if self.config("dms") else None
-        profile = username = None
+        max_posts = self.config("max-posts")
+        creator_info = {} if self.config("metadata") else None
 
         # prevent files from being sent with gzip compression
         headers = {"Accept-Encoding": "identity"}
 
-        if self.config("metadata"):
-            profile = self.api.creator_profile(service, creator_id)
-            username = profile["name"]
-
         posts = self.posts()
-        max_posts = self.config("max-posts")
         if max_posts:
             posts = itertools.islice(posts, max_posts)
         if self.revisions:
@@ -85,10 +78,20 @@ class KemonopartyExtractor(Extractor):
             post["_http_headers"] = headers
             post["date"] = self._parse_datetime(
                 post.get("published") or post.get("added") or "")
+            service = post["service"]
+            creator_id = post["user"]
+
+            if creator_info is not None:
+                key = "{}_{}".format(service, creator_id)
+                if key not in creator_info:
+                    creator = creator_info[key] = self.api.creator_profile(
+                        service, creator_id)
+                else:
+                    creator = creator_info[key]
+
+                post["user_profile"] = creator
+                post["username"] = creator["name"]
 
-            if profile is not None:
-                post["username"] = username
-                post["user_profile"] = profile
             if comments:
                 try:
                     post["comments"] = self.api.creator_post_comments(
@@ -171,7 +174,7 @@ class KemonopartyExtractor(Extractor):
             try:
                 msg = '"' + response.json()["error"] + '"'
             except Exception:
-                msg = '"0/1 Username or password is incorrect"'
+                msg = '"Username or password is incorrect"'
             raise exception.AuthenticationError(msg)
 
         return {c.name: c.value for c in response.cookies}
@@ -296,8 +299,12 @@ class KemonopartyUserExtractor(KemonopartyExtractor):
     def posts(self):
         _, _, service, creator_id, query = self.groups
         params = text.parse_query(query)
-        return self.api.creator_posts(
-            service, creator_id, params.get("o"), params.get("q"))
+        if params.get("tag"):
+            return self.api.creator_tagged_posts(
+                service, creator_id, params.get("tag"), params.get("o"))
+        else:
+            return self.api.creator_posts(
+                service, creator_id, params.get("o"), params.get("q"))
 
 
 class KemonopartyPostsExtractor(KemonopartyExtractor):
@@ -493,7 +500,7 @@ class KemonoAPI():
     def posts(self, offset=0, query=None, tags=None):
         endpoint = "/posts"
-        params = {"q": query, "o": offset, "tags": tags}
+        params = {"q": query, "o": offset, "tag": tags}
         return self._pagination(endpoint, params, 50, "posts")
 
     def creator_posts(self, service, creator_id, offset=0, query=None):
@@ -501,6 +508,11 @@ class KemonoAPI():
         params = {"q": query, "o": offset}
         return self._pagination(endpoint, params, 50)
 
+    def creator_tagged_posts(self, service, creator_id, tags, offset=0):
+        endpoint = "/{}/user/{}/posts-legacy".format(service, creator_id)
+        params = {"o": offset, "tag": tags}
+        return self._pagination(endpoint, params, 50, "results")
+
     def creator_announcements(self, service, creator_id):
         endpoint = "/{}/user/{}/announcements".format(service, creator_id)
         return self._call(endpoint)
@@ -565,9 +577,10 @@ class KemonoAPI():
             data = self._call(endpoint, params)
 
             if key:
-                yield from data[key]
-            else:
-                yield from data
+                data = data.get(key)
+                if not data:
+                    return
+            yield from data
 
             if len(data) < batch:
                 return
diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py
index d0c9c30..e779e97 100644
--- a/gallery_dl/extractor/khinsider.py
+++ b/gallery_dl/extractor/khinsider.py
@@ -36,22 +36,36 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
         data = self.metadata(page)
 
         yield Message.Directory, data
-        for track in self.tracks(page):
+
+        if self.config("covers", False):
+            for num, url in enumerate(self._extract_covers(page), 1):
+                cover = text.nameext_from_url(
+                    url, {"url": url, "num": num, "type": "cover"})
+                cover.update(data)
+                yield Message.Url, url, cover
+
+        for track in self._extract_tracks(page):
             track.update(data)
+            track["type"] = "track"
             yield Message.Url, track["url"], track
 
     def metadata(self, page):
         extr = text.extract_from(page)
         return {"album": {
             "name" : text.unescape(extr("<h2>", "<")),
-            "platform": extr("Platforms: <a", "<").rpartition(">")[2],
+            "platform": text.split_html(extr("Platforms: ", "<br>"))[::2],
+            "year": extr("Year: <b>", "<"),
+            "catalog": extr("Catalog Number: <b>", "<"),
+            "developer": text.remove_html(extr(" Developed by: ", "</")),
+            "publisher": text.remove_html(extr(" Published by: ", "</")),
             "count": text.parse_int(extr("Number of Files: <b>", "<")),
             "size" : text.parse_bytes(extr("Total Filesize: <b>", "<")[:-1]),
             "date" : extr("Date Added: <b>", "<"),
             "type" : text.remove_html(extr("Album type: <b>", "</b>")),
+            "uploader": text.remove_html(extr("Uploaded by: ", "</")),
         }}
 
-    def tracks(self, page):
+    def _extract_tracks(self, page):
         fmt = self.config("format", ("mp3",))
         if fmt and isinstance(fmt, str):
             if fmt == "all":
@@ -75,3 +89,9 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
                 yield track
         if first:
             yield first
+
+    def _extract_covers(self, page):
+        return [
+            text.unescape(text.extr(cover, ' href="', '"'))
+            for cover in text.extract_iter(page, ' class="albumImage', '</')
+        ]
diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py
index e39e272..89a1b5e 100644
--- a/gallery_dl/extractor/komikcast.py
+++ b/gallery_dl/extractor/komikcast.py
@@ -6,19 +6,20 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extractors for https://komikcast.cz/"""
+"""Extractors for https://komikcast.la/"""
 
 from .common import ChapterExtractor, MangaExtractor
 from .. import text
 import re
 
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:cz|lol|site|mo?e|com)"
+BASE_PATTERN = (r"(?:https?://)?(?:www\.)?"
+                r"komikcast\.(?:la|cz|lol|site|mo?e|com)")
 
 
 class KomikcastBase():
     """Base class for komikcast extractors"""
     category = "komikcast"
-    root = "https://komikcast.cz"
+    root = "https://komikcast.la"
 
     @staticmethod
     def parse_chapter_string(chapter_string, data=None):
@@ -48,7 +49,7 @@ class KomikcastBase():
 class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
     """Extractor for komikcast manga chapters"""
     pattern = BASE_PATTERN + r"(/chapter/[^/?#]+/)"
-    example = "https://komikcast.cz/chapter/TITLE/"
+    example = "https://komikcast.la/chapter/TITLE/"
 
     def metadata(self, page):
         info = text.extr(page, "<title>", " - Komikcast<")
@@ -68,7 +69,7 @@ class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
     """Extractor for komikcast manga"""
     chapterclass = KomikcastChapterExtractor
     pattern = BASE_PATTERN + r"(/(?:komik/)?[^/?#]+)/?$"
-    example = "https://komikcast.cz/komik/TITLE"
+    example = "https://komikcast.la/komik/TITLE"
 
     def chapters(self, page):
         results = []
diff --git a/gallery_dl/extractor/lofter.py b/gallery_dl/extractor/lofter.py
index 412b6b9..b92a6ff 100644
--- a/gallery_dl/extractor/lofter.py
+++ b/gallery_dl/extractor/lofter.py
@@ -23,6 +23,8 @@ class LofterExtractor(Extractor):
 
     def items(self):
         for post in self.posts():
+            if post is None:
+                continue
             if "post" in post:
                 post = post["post"]
 
@@ -129,6 +131,9 @@ class LofterAPI():
             url, method="POST", params=params, data=data)
         info = response.json()
 
+        if info["meta"]["status"] == 4200:
+            raise exception.NotFoundError("blog")
+
         if info["meta"]["status"] != 200:
             self.extractor.log.debug("Server response: %s", info)
             raise exception.StopExtraction("API request failed")
@@ -142,6 +147,9 @@ class LofterAPI():
 
             yield from posts
 
+            if data["offset"] < 0:
+                break
+
             if params["offset"] + len(posts) < data["offset"]:
                 break
             params["offset"] = data["offset"]
diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py
index 295b9c4..6a9f633 100644
--- a/gallery_dl/extractor/lolisafe.py
+++ b/gallery_dl/extractor/lolisafe.py
@@ -53,7 +53,14 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
             if "name" in file:
                 name = file["name"]
                 file["name"] = name.rpartition(".")[0] or name
-                file["id"] = file["filename"].rpartition("-")[2]
+                _, sep, fid = file["filename"].rpartition("-")
+                if not sep or len(fid) == 12:
+                    if "id" not in file:
+                        file["id"] = ""
+                    file["filename"] = file["name"]
+                else:
+                    file["id"] = fid
+                    file["filename"] = file["name"] + "-" + fid
             elif "id" in file:
                 file["name"] = file["filename"]
                 file["filename"] = "{}-{}".format(file["name"], file["id"])
diff --git a/gallery_dl/extractor/mangafox.py b/gallery_dl/extractor/mangafox.py
index d590753..827756a 100644
--- a/gallery_dl/extractor/mangafox.py
+++ b/gallery_dl/extractor/mangafox.py
@@ -30,7 +30,7 @@ class MangafoxChapterExtractor(ChapterExtractor):
     def metadata(self, page):
         manga, pos = text.extract(page, "<title>", "</title>")
         count, pos = text.extract(
-            page, ">", "<", page.find("</select>", pos) - 20)
+            page, ">", "<", page.find("</select>", pos) - 40)
         sid , pos = text.extract(page, "var series_id =", ";", pos)
         cid , pos = text.extract(page, "var chapter_id =", ";", pos)
 
@@ -49,9 +49,9 @@ class MangafoxChapterExtractor(ChapterExtractor):
         pnum = 1
         while True:
             url, pos = text.extract(page, '<img src="', '"')
-            yield text.ensure_http_scheme(url), None
+            yield text.ensure_http_scheme(text.unescape(url)), None
             url, pos = text.extract(page, ' src="', '"', pos)
-            yield text.ensure_http_scheme(url), None
+            yield text.ensure_http_scheme(text.unescape(url)), None
 
             pnum += 2
             page = self.request("{}/{}.html".format(self.urlbase, pnum)).text
diff --git a/gallery_dl/extractor/mangahere.py b/gallery_dl/extractor/mangahere.py
index e8ee861..8c94f04 100644
--- a/gallery_dl/extractor/mangahere.py
+++ b/gallery_dl/extractor/mangahere.py
@@ -37,7 +37,7 @@ class MangahereChapterExtractor(MangahereBase, ChapterExtractor):
     def metadata(self, page):
         pos = page.index("</select>")
-        count     , pos = text.extract(page, ">", "<", pos - 20)
+        count     , pos = text.extract(page, ">", "<", pos - 40)
         manga_id  , pos = text.extract(page, "series_id = ", ";", pos)
         chapter_id, pos = text.extract(page, "chapter_id = ", ";", pos)
         manga     , pos = text.extract(page, '"name":"', '"', pos)
@@ -61,9 +61,9 @@ class MangahereChapterExtractor(MangahereBase, ChapterExtractor):
 
         while True:
             url, pos = text.extract(page, '<img src="', '"')
-            yield text.ensure_http_scheme(url), None
+            yield text.ensure_http_scheme(text.unescape(url)), None
             url, pos = text.extract(page, ' src="', '"', pos)
-            yield text.ensure_http_scheme(url), None
+            yield text.ensure_http_scheme(text.unescape(url)), None
 
             pnum += 2
             page = self.request(self.url_fmt.format(self.part, pnum)).text
diff --git a/gallery_dl/extractor/mangaread.py b/gallery_dl/extractor/mangaread.py
index 4b017dc..6970b4f 100644
--- a/gallery_dl/extractor/mangaread.py
+++ b/gallery_dl/extractor/mangaread.py
@@ -92,9 +92,9 @@ class MangareadMangaExtractor(MangareadBase, MangaExtractor):
             "genres"  : list(text.extract_iter(
                 extr('class="genres-content">', "</div>"), '"tag">', "</a>")),
             "type"    : text.remove_html(
-                extr("Type </h5>\n</div>", "</div>")),
+                extr(" Type ", "\n</div>")),
             "release" : text.parse_int(text.remove_html(
-                extr("Release </h5>\n</div>", "</div>"))),
+                extr(" Release ", "\n</div>"))),
             "status"  : text.remove_html(
-                extr("Status </h5>\n</div>", "</div>")),
+                extr(" Status ", "\n</div>")),
         }
diff --git a/gallery_dl/extractor/nekohouse.py b/gallery_dl/extractor/nekohouse.py
new file mode 100644
index 0000000..fe9d512
--- /dev/null
+++ b/gallery_dl/extractor/nekohouse.py
@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://nekohouse.su/"""
+
+from .common import Extractor, Message
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?nekohouse\.su"
+USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)"
+
+
+class NekohouseExtractor(Extractor):
+    """Base class for nekohouse extractors"""
+    category = "nekohouse"
+    root = "https://nekohouse.su"
+
+
+class NekohousePostExtractor(NekohouseExtractor):
+    subcategory = "post"
+    directory_fmt = ("{category}", "{service}", "{username} ({user_id})",
+                     "{post_id} {date} {title[b:230]}")
+    filename_fmt = "{num:>02} {id|filename}.{extension}"
+    archive_fmt = "{service}_{user_id}_{post_id}_{hash}"
+    pattern = USER_PATTERN + r"/post/([^/?#]+)"
+    example = "https://nekohouse.su/SERVICE/user/12345/post/12345"
+
+    def items(self):
+        service, user_id, post_id = self.groups
+        url = "{}/{}/user/{}/post/{}".format(
+            self.root, service, user_id, post_id)
+        html = self.request(url).text
+
+        files = self._extract_files(html)
+        post = self._extract_post(html)
+        post["service"] = service
+        post["user_id"] = user_id
+        post["post_id"] = post_id
+        post["count"] = len(files)
+
+        yield Message.Directory, post
+        for post["num"], file in enumerate(files, 1):
+            url = file["url"]
+            text.nameext_from_url(url, file)
+            file["hash"] = file["filename"]
+            file.update(post)
+            if "name" in file:
+                text.nameext_from_url(file.pop("name"), file)
+            yield Message.Url, url, file
+
+    def _extract_post(self, html):
+        extr = text.extract_from(html)
+        return {
+            "username": text.unescape(extr(
+                'class="scrape__user-name', '</').rpartition(">")[2].strip()),
+            "title"   : text.unescape(extr(
+                'class="scrape__title', '</').rpartition(">")[2]),
+            "date"    : text.parse_datetime(extr(
+                'datetime="', '"')[:19], "%Y-%m-%d %H:%M:%S"),
+            "content" : text.unescape(extr(
+                'class="scrape__content">', "</div>").strip()),
+        }
+
+    def _extract_files(self, html):
+        files = []
+
+        extr = text.extract_from(text.extr(
+            html, 'class="scrape__files"', "<footer"))
+        while True:
+            file_id = extr('<a href="/post/', '"')
+            if not file_id:
+                break
+            files.append({
+                "id"  : file_id,
+                "url" : self.root + extr('href="', '"'),
+                "type": "file",
+            })
+
+        extr = text.extract_from(text.extr(
+            html, 'class="scrape__attachments"', "</ul>"))
+        while True:
+            url = extr('href="', '"')
+            if not url:
+                break
+            files.append({
+                "id"  : "",
+                "url" : self.root + url,
+                "name": text.unescape(extr('download="', '"')),
+                "type": "attachment",
+            })
+
+        return files
+
+
+class NekohouseUserExtractor(NekohouseExtractor):
+    subcategory = "user"
+    pattern = USER_PATTERN + r"/?(?:\?([^#]+))?(?:$|\?|#)"
+    example = "https://nekohouse.su/SERVICE/user/12345"
+
+    def items(self):
+        service, user_id, _ = self.groups
+        creator_url = "{}/{}/user/{}".format(self.root, service, user_id)
+        params = {"o": 0}
+
+        data = {"_extractor": NekohousePostExtractor}
+        while True:
+            html = self.request(creator_url, params=params).text
+
+            cnt = 0
+            for post in text.extract_iter(html, "<article", "</article>"):
+                cnt += 1
+                post_url = self.root + text.extr(post, '<a href="', '"')
+                yield Message.Queue, post_url, data
+
+            if cnt < 50:
+                return
+            params["o"] += 50
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index d3e40ee..7fe8869 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -27,8 +27,10 @@ class PixivExtractor(Extractor):
     filename_fmt = "{id}_p{num}.{extension}"
     archive_fmt = "{id}{suffix}.{extension}"
     cookies_domain = ".pixiv.net"
-    sanity_url = "https://s.pximg.net/common/images/limit_sanity_level_360.png"
-    mypixiv_url = "https://s.pximg.net/common/images/limit_mypixiv_360.png"
+    limit_url = "https://s.pximg.net/common/images/limit_"
+    # https://s.pximg.net/common/images/limit_sanity_level_360.png
+    # https://s.pximg.net/common/images/limit_unviewable_360.png
+    # https://s.pximg.net/common/images/limit_mypixiv_360.png
 
     def _init(self):
         self.api = PixivAppAPI(self)
@@ -117,16 +119,30 @@ class PixivExtractor(Extractor):
             ]
 
             url = meta_single_page["original_image_url"]
-            if url == self.sanity_url:
-                work["_ajax"] = True
-                self.log.warning("%s: 'limit_sanity_level' warning", work["id"])
-                if self.sanity_workaround:
-                    body = self._request_ajax("/illust/" + str(work["id"]))
-                    return self._extract_ajax(work, body)
+            if url.startswith(self.limit_url):
+                work_id = work["id"]
+                self.log.debug("%s: %s", work_id, url)
+
+                limit_type = url.rpartition("/")[2]
+                if limit_type in (
+                    "limit_",  # for '_extend_sanity()' inserts
+                    "limit_unviewable_360.png",
+                    "limit_sanity_level_360.png",
+                ):
+                    work["_ajax"] = True
+                    self.log.warning("%s: 'limit_sanity_level' warning", work_id)
+                    if self.sanity_workaround:
+                        body = self._request_ajax("/illust/" + str(work_id))
+                        return self._extract_ajax(work, body)
+
+                elif limit_type == "limit_mypixiv_360.png":
+                    work["_mypixiv"] = True
+                    self.log.warning("%s: 'My pixiv' locked", work_id)
 
-            elif url == self.mypixiv_url:
-                work["_mypixiv"] = True
-                self.log.warning("%s: 'My pixiv' locked", work["id"])
+                else:
+                    work["_mypixiv"] = True  # stop further processing
+                    self.log.error("%s: Unknown 'limit' URL type: %s",
+                                   work_id, limit_type)
 
             elif work["type"] != "ugoira":
                 return ({"url": url, "_fallback": self._fallback_image(url)},)
@@ -430,7 +446,7 @@ class PixivArtworksExtractor(PixivExtractor):
                 elif ajax_id > work_id:
                     index -= 1
                     self.log.debug("Inserting work %s", ajax_id)
-                    yield self._make_work(ajax_id, self.sanity_url, user)
+                    yield self._make_work(ajax_id, self.limit_url, user)
                 else:  # ajax_id < work_id
                     break
@@ -440,7 +456,7 @@ class PixivArtworksExtractor(PixivExtractor):
             while index >= 0:
                 ajax_id = ajax_ids[index]
                 self.log.debug("Inserting work %s", ajax_id)
-                yield self._make_work(ajax_id, self.sanity_url, user)
+                yield self._make_work(ajax_id, self.limit_url, user)
                 index -= 1
diff --git a/gallery_dl/extractor/pornpics.py b/gallery_dl/extractor/pornpics.py
index 83f3064..863ef3b 100644
--- a/gallery_dl/extractor/pornpics.py
+++ b/gallery_dl/extractor/pornpics.py
@@ -20,10 +20,6 @@ class PornpicsExtractor(Extractor):
     root = "https://www.pornpics.com"
     request_interval = (0.5, 1.5)
 
-    def __init__(self, match):
-        super().__init__(match)
-        self.item = match.group(1)
-
     def items(self):
         for gallery in self.galleries():
             gallery["_extractor"] = PornpicsGalleryExtractor
@@ -34,9 +30,11 @@ class PornpicsExtractor(Extractor):
         # fetch first 20 galleries from HTML
         # since '"offset": 0' does not return a JSON response
         page = self.request(url).text
-        for path in text.extract_iter(
+        for href in text.extract_iter(
                 page, 'class="rel-link" href="', '"'):
-            yield {"g_url": self.root + path}
+            if href[0] == "/":
+                href = self.root + href
+            yield {"g_url": href}
         del page
 
         params = {"offset": 20}
@@ -60,12 +58,12 @@ class PornpicsExtractor(Extractor):
 
 class PornpicsGalleryExtractor(PornpicsExtractor, GalleryExtractor):
     """Extractor for pornpics galleries"""
-    pattern = BASE_PATTERN + r"(/galleries/(?:[^/?#]+-)?(\d+))"
+    pattern = BASE_PATTERN + r"/galleries/((?:[^/?#]+-)?(\d+))"
    example = "https://www.pornpics.com/galleries/TITLE-12345/"
 
     def __init__(self, match):
-        PornpicsExtractor.__init__(self, match)
-        self.gallery_id = match.group(2)
+        url = "{}/galleries/{}/".format(self.root, match.group(1))
+        GalleryExtractor.__init__(self, match, url)
 
     items = GalleryExtractor.items
 
@@ -73,7 +71,7 @@ class PornpicsGalleryExtractor(PornpicsExtractor, GalleryExtractor):
         extr = text.extract_from(page)
 
         return {
-            "gallery_id": text.parse_int(self.gallery_id),
+            "gallery_id": text.parse_int(self.groups[1]),
             "slug"      : extr("/galleries/", "/").rpartition("-")[0],
             "title"     : text.unescape(extr("<h1>", "<")),
             "channel"   : text.split_html(extr(">Channel: ", '</div>')),
@@ -100,7 +98,7 @@ class PornpicsTagExtractor(PornpicsExtractor):
     example = "https://www.pornpics.com/tags/TAGS/"
 
     def galleries(self):
-        url = "{}/tags/{}/".format(self.root, self.item)
+        url = "{}/tags/{}/".format(self.root, self.groups[0])
         return self._pagination(url)
 
 
@@ -113,7 +111,7 @@ class PornpicsSearchExtractor(PornpicsExtractor):
     def galleries(self):
         url = self.root + "/search/srch.php"
         params = {
-            "q"     : self.item.replace("-", " "),
+            "q"     : self.groups[0].replace("-", " "),
             "lang"  : "en",
             "offset": 0,
         }
diff --git a/gallery_dl/extractor/rule34xyz.py b/gallery_dl/extractor/rule34xyz.py
index f1e7518..3b8d344 100644
--- a/gallery_dl/extractor/rule34xyz.py
+++ b/gallery_dl/extractor/rule34xyz.py
@@ -60,18 +60,22 @@ class Rule34xyzExtractor(BooruExtractor):
         post.pop("filesPreview", None)
         post.pop("tagsWithType", None)
         post["date"] = text.parse_datetime(
-            post["created"], "%Y-%m-%dT%H:%M:%S.%f")
+            post["created"][:19], "%Y-%m-%dT%H:%M:%S")
 
     def _tags(self, post, _):
         if post.get("tagsWithType") is None:
             post.update(self._fetch_post(post["id"]))
 
         tags = collections.defaultdict(list)
+        tagslist = []
         for tag in post["tagsWithType"]:
-            tags[tag["type"]].append(tag["value"])
+            value = tag["value"]
+            tagslist.append(value)
+            tags[tag["type"]].append(value)
 
         types = self.TAG_TYPES
         for type, values in tags.items():
             post["tags_" + types[type]] = values
+        post["tags"] = tagslist
 
     def _fetch_post(self, post_id):
         url = "{}/api/post/{}".format(self.root, post_id)
diff --git a/gallery_dl/extractor/saint.py b/gallery_dl/extractor/saint.py
index 1c62d75..5ec2443 100644
--- a/gallery_dl/extractor/saint.py
+++ b/gallery_dl/extractor/saint.py
@@ -81,6 +81,7 @@ class SaintMediaExtractor(SaintAlbumExtractor):
             else:  # /d/
                 file = {
                     "file"     : text.unescape(extr('<a href="', '"')),
+                    "id"       : album_id,
                     "id_dl"    : album_id,
                     "name"     : album_id,
                     "filename" : album_id,
diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py
index 97bad09..d15762d 100644
--- a/gallery_dl/extractor/shimmie2.py
+++ b/gallery_dl/extractor/shimmie2.py
@@ -74,10 +74,6 @@ BASE_PATTERN = Shimmie2Extractor.update({
         "pattern": r"(?:sizechange|giantess)booru\.com",
         "cookies": {"agreed": "true"},
     },
-    "tentaclerape": {
-        "root": "https://tentaclerape.net",
-        "pattern": r"tentaclerape\.net",
-    },
     "cavemanon": {
         "root": "https://booru.cavemanon.xyz",
         "pattern": r"booru\.cavemanon\.xyz",
diff --git a/gallery_dl/extractor/szurubooru.py b/gallery_dl/extractor/szurubooru.py
index b122f26..1713509 100644
--- a/gallery_dl/extractor/szurubooru.py
+++ b/gallery_dl/extractor/szurubooru.py
@@ -79,10 +79,6 @@ class SzurubooruExtractor(booru.BooruExtractor):
 
 
 BASE_PATTERN = SzurubooruExtractor.update({
-    "foalcon": {
-        "root": "https://booru.foalcon.com",
-        "pattern": r"booru\.foalcon\.com",
-    },
     "bcbnsfw": {
         "root": "https://booru.bcbnsfw.space",
         "pattern": r"booru\.bcbnsfw\.space",
@@ -104,7 +100,7 @@ class SzurubooruTagExtractor(SzurubooruExtractor):
     directory_fmt = ("{category}", "{search_tags}")
     archive_fmt = "t_{search_tags}_{id}_{version}"
     pattern = BASE_PATTERN + r"/posts(?:/query=([^/?#]*))?"
-    example = "https://booru.foalcon.com/posts/query=TAG"
+    example = "https://booru.bcbnsfw.space/posts/query=TAG"
 
     def __init__(self, match):
         SzurubooruExtractor.__init__(self, match)
@@ -127,7 +123,7 @@ class SzurubooruPostExtractor(SzurubooruExtractor):
     subcategory = "post"
     archive_fmt = "{id}_{version}"
     pattern = BASE_PATTERN + r"/post/(\d+)"
-    example = "https://booru.foalcon.com/post/12345"
+    example = "https://booru.bcbnsfw.space/post/12345"
 
     def posts(self):
         return (self._api_request("/post/" + self.groups[-1]),)
diff --git a/gallery_dl/extractor/toyhouse.py b/gallery_dl/extractor/toyhouse.py
index 44d87ee..cee0d9d 100644
--- a/gallery_dl/extractor/toyhouse.py
+++ b/gallery_dl/extractor/toyhouse.py
@@ -52,16 +52,18 @@ class ToyhouseExtractor(Extractor):
         return {
             "url": extr(needle, '"'),
             "date": text.parse_datetime(extr(
-                'Credits\n</h2>\n<div class="mb-1">', '<'),
+                '</h2>\n <div class="mb-1">', '<'),
                 "%d %b %Y, %I:%M:%S %p"),
             "artists": [
                 text.remove_html(artist)
                 for artist in extr(
-                    '<div class="artist-credit">', '</div>\n</div>').split(
-                    '<div class="artist-credit">')
+                    '<div class="artist-credit">',
+                    '</div>\n </div>').split(
+                    '<div class="artist-credit">')
             ],
             "characters": text.split_html(extr(
-                '<div class="image-characters', '</div>\n</div>'))[2:],
+                '<div class="image-characters',
+                '<div class="image-comments">'))[2:],
         }
 
     def _pagination(self, path):
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 090b11a..840e846 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -121,14 +121,7 @@ class TwitterExtractor(Extractor):
                 txt = data.get("full_text") or data.get("text") or ""
                 self.log.warning("'%s' (%s)", txt, data["id_str"])
 
-            files = []
-            if "extended_entities" in data:
-                self._extract_media(
-                    data, data["extended_entities"]["media"], files)
-            if "card" in tweet and self.cards:
-                self._extract_card(tweet, files)
-            if self.twitpic:
-                self._extract_twitpic(data, files)
+            files = self._extract_files(data, tweet)
             if not files and not self.textonly:
                 continue
 
@@ -143,6 +136,39 @@ class TwitterExtractor(Extractor):
                 text.nameext_from_url(url, file)
                 yield Message.Url, url, file
 
+    def _extract_files(self, data, tweet):
+        files = []
+
+        if "extended_entities" in data:
+            try:
+                self._extract_media(
+                    data, data["extended_entities"]["media"], files)
+            except Exception as exc:
+                self.log.debug("", exc_info=exc)
+                self.log.warning(
+                    "%s: Error while extracting media files (%s: %s)",
+                    data["id_str"], exc.__class__.__name__, exc)
+
+        if self.cards and "card" in tweet:
+            try:
+                self._extract_card(tweet, files)
+            except Exception as exc:
+                self.log.debug("", exc_info=exc)
+                self.log.warning(
+                    "%s: Error while extracting Card files (%s: %s)",
+                    data["id_str"], exc.__class__.__name__, exc)
+
+        if self.twitpic:
+            try:
+                self._extract_twitpic(data, files)
+            except Exception as exc:
+                self.log.debug("", exc_info=exc)
+                self.log.warning(
+                    "%s: Error while extracting TwitPic files (%s: %s)",
+                    data["id_str"], exc.__class__.__name__, exc)
+
+        return files
+
     def _extract_media(self, tweet, entities, files):
         for media in entities:
 
@@ -1039,7 +1065,7 @@ class TwitterAPI():
             else:
                 csrf_token = None
             if not csrf_token:
-                csrf_token = util.generate_token()
+                csrf_token = util.generate_token(80)
             cookies.set("ct0", csrf_token, domain=cookies_domain)
 
         auth_token = cookies.get("auth_token", domain=cookies_domain)
diff --git a/gallery_dl/extractor/urlgalleries.py b/gallery_dl/extractor/urlgalleries.py
index bb80055..ebfeb9d 100644
--- a/gallery_dl/extractor/urlgalleries.py
+++ b/gallery_dl/extractor/urlgalleries.py
@@ -15,12 +15,15 @@ class UrlgalleriesGalleryExtractor(GalleryExtractor):
     category = "urlgalleries"
     root = "https://urlgalleries.net"
     request_interval = (0.5, 1.5)
-    pattern = r"(?:https?://)(?:(\w+)\.)?urlgalleries\.net/(?:[\w-]+-)?(\d+)"
-    example = "https://BLOG.urlgalleries.net/gallery-12345/TITLE"
+    pattern = (r"(?:https?://)()(?:(\w+)\.)?urlgalleries\.net"
+               r"/(?:b/([^/?#]+)/)?(?:[\w-]+-)?(\d+)")
+    example = "https://urlgalleries.net/b/BLOG/gallery-12345/TITLE"
 
     def items(self):
-        blog, self.gallery_id = self.groups
-        url = "https://{}.urlgalleries.net/porn-gallery-{}/?a=10000".format(
+        _, blog_alt, blog, self.gallery_id = self.groups
+        if not blog:
+            blog = blog_alt
+        url = "https://urlgalleries.net/b/{}/porn-gallery-{}/?a=10000".format(
             blog, self.gallery_id)
 
         with self.request(url, allow_redirects=False, fatal=...) as response:
@@ -35,7 +38,7 @@ class UrlgalleriesGalleryExtractor(GalleryExtractor):
         data = self.metadata(page)
         data["count"] = len(imgs)
 
-        root = "https://{}.urlgalleries.net".format(blog)
+        root = "https://urlgalleries.net/b/" + blog
         yield Message.Directory, data
         for data["num"], img in enumerate(imgs, 1):
             page = self.request(root + img).text
diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py
index 922a591..1c0c172 100644
--- a/gallery_dl/extractor/vsco.py
+++ b/gallery_dl/extractor/vsco.py
@@ -70,7 +70,8 @@ class VscoExtractor(Extractor):
 
     def _extract_preload_state(self, url):
         page = self.request(url, notfound=self.subcategory).text
-        return util.json_loads(text.extr(page, "__PRELOADED_STATE__ = ", "<"))
+        return util.json_loads(text.extr(page, "__PRELOADED_STATE__ = ", "<")
+                               .replace('"prevPageToken":undefined,', ''))
 
     def _pagination(self, url, params, token, key, extra=None):
         headers = {
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
index 70ab259..008ae6e 100644
--- a/gallery_dl/extractor/webtoons.py
+++ b/gallery_dl/extractor/webtoons.py
@@ -102,8 +102,8 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
         else:
             episode = ""
 
-        if extr('<div class="author_area"', '\n'):
-            username = extr('/creator/', '"')
+        if extr('<span class="author"', '\n'):
+            username = extr('/u/', '"')
             author_name = extr('<span>', '</span>')
         else:
             username = author_name = ""
diff --git a/gallery_dl/extractor/weebcentral.py b/gallery_dl/extractor/weebcentral.py
index 39f998a..fc1badb 100644
--- a/gallery_dl/extractor/weebcentral.py
+++ b/gallery_dl/extractor/weebcentral.py
@@ -80,12 +80,12 @@ class WeebcentralChapterExtractor(WeebcentralBase, ChapterExtractor):
         results = []
 
         while True:
-            src = extr(' src="', '"')
+            src = extr('src="', '"')
             if not src:
                 break
             results.append((src, {
-                "width" : text.parse_int(extr(' width="' , '"')),
-                "height": text.parse_int(extr(' height="', '"')),
+                "width" : text.parse_int(extr('width="' , '"')),
+                "height": text.parse_int(extr('height="', '"')),
             }))
 
         return results
diff --git a/gallery_dl/extractor/xfolio.py b/gallery_dl/extractor/xfolio.py
new file mode 100644
index 0000000..a1a5be3
--- /dev/null
+++ b/gallery_dl/extractor/xfolio.py
@@ -0,0 +1,146 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://xfolio.jp/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+BASE_PATTERN = r"(?:https?://)?xfolio\.jp(?:/[^/?#]+)?"
+
+
+class XfolioExtractor(Extractor):
+    """Base class for xfolio extractors"""
+    category = "xfolio"
+    root = "https://xfolio.jp"
+    cookies_domain = ".xfolio.jp"
+    directory_fmt = ("{category}", "{creator_slug}", "{work_id}")
+    filename_fmt = "{work_id}_{image_id}.{extension}"
+    archive_fmt = "{work_id}_{image_id}"
+    request_interval = (0.5, 1.5)
+
+    def _init(self):
+        XfolioExtractor._init = Extractor._init
+        if not self.cookies_check(("xfolio_session",)):
+            self.log.error("'xfolio_session' cookie required")
+
+    def items(self):
+        data = {"_extractor": XfolioWorkExtractor}
+        for work in self.works():
+            yield Message.Queue, work, data
+
+    def request(self, url, **kwargs):
+        response = Extractor.request(self, url, **kwargs)
+
+        if "/system/recaptcha" in response.url:
+            raise exception.StopExtraction("Bot check / CAPTCHA page")
+
+        return response
+
+
+class XfolioWorkExtractor(XfolioExtractor):
+    subcategory = "work"
+    pattern = BASE_PATTERN + r"/portfolio/([^/?#]+)/works/(\d+)"
+    example = "https://xfolio.jp/portfolio/USER/works/12345"
+    ref_fmt = ("{}/fullscale_image?image_id={}&work_id={}")
+    url_fmt = ("{}/user_asset.php?id={}&work_id={}"
+               "&work_image_id={}&type=work_image")
+
+    def items(self):
+        creator, work_id = self.groups
+        url = "{}/portfolio/{}/works/{}".format(self.root, creator, work_id)
+        html = self.request(url).text
+
+        work = self._extract_data(html)
+        files = self._extract_files(html, work)
+        work["count"] = len(files)
+
+        yield Message.Directory, work
+        for work["num"], file in enumerate(files, 1):
+            file.update(work)
+            yield Message.Url, file["url"], file
+
+    def _extract_data(self, html):
+        creator, work_id = self.groups
+        extr = text.extract_from(html)
+        return {
+            "title"          : text.unescape(extr(
+                'property="og:title" content="', '"').rpartition(" - ")[0]),
+            "description"    : text.unescape(extr(
+                'property="og:description" content="', '"')),
+            "creator_id"     : extr(' data-creator-id="', '"'),
+            "creator_userid" : extr(' data-creator-user-id="', '"'),
+            "creator_name"   : extr(' data-creator-name="', '"'),
+            "creator_profile": text.unescape(extr(
+                ' data-creator-profile="', '"')),
+            "series_id"      : extr("/series/", '"'),
+            "creator_slug"   : creator,
+            "work_id"        : work_id,
+        }
+
+    def _extract_files(self, html, work):
+        files = []
+
+        work_id = work["work_id"]
+        for img in text.extract_iter(
+                html, 'class="article__wrap_img', "</div>"):
+            image_id = text.extr(img, "/fullscale_image?image_id=", "&")
+            if not image_id:
+                self.log.warning(
+                    "%s: 'fullscale_image' not available", work_id)
+                continue
+
+            files.append({
+                "image_id" : image_id,
+                "extension": "jpg",
+                "url": self.url_fmt.format(
+                    self.root, image_id, work_id, image_id),
+                "_http_headers": {"Referer": self.ref_fmt.format(
+                    self.root, image_id, work_id)},
+            })
+
+        return files
+
+
+class XfolioUserExtractor(XfolioExtractor):
subcategory = "user" + pattern = BASE_PATTERN + r"/portfolio/([^/?#]+)(?:/works)?/?(?:$|\?|#)" + example = "https://xfolio.jp/portfolio/USER" + + def works(self): + url = "{}/portfolio/{}/works".format(self.root, self.groups[0]) + + while True: + html = self.request(url).text + + for item in text.extract_iter( + html, '<div class="postItem', "</div>"): + yield text.extr(item, ' href="', '"') + + pager = text.extr(html, ' class="pager__list_next', "</li>") + url = text.extr(pager, ' href="', '"') + if not url: + return + url = text.unescape(url) + + +class XfolioSeriesExtractor(XfolioExtractor): + subcategory = "series" + pattern = BASE_PATTERN + r"/portfolio/([^/?#]+)/series/(\d+)" + example = "https://xfolio.jp/portfolio/USER/series/12345" + + def works(self): + creator, series_id = self.groups + url = "{}/portfolio/{}/series/{}".format(self.root, creator, series_id) + html = self.request(url).text + + return [ + text.extr(item, ' href="', '"') + for item in text.extract_iter( + html, 'class="listWrap--title">', "</a>") + ] diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py index 6dc9362..4d69d3d 100644 --- a/gallery_dl/extractor/xhamster.py +++ b/gallery_dl/extractor/xhamster.py @@ -20,8 +20,8 @@ class XhamsterExtractor(Extractor): category = "xhamster" def __init__(self, match): - Extractor.__init__(self, match) self.root = "https://" + match.group(1) + Extractor.__init__(self, match) class XhamsterGalleryExtractor(XhamsterExtractor): @@ -34,48 +34,48 @@ class XhamsterGalleryExtractor(XhamsterExtractor): pattern = BASE_PATTERN + r"(/photos/gallery/[^/?#]+)" example = "https://xhamster.com/photos/gallery/12345" - def __init__(self, match): - XhamsterExtractor.__init__(self, match) - self.path = match.group(2) - self.data = None - def items(self): data = self.metadata() yield Message.Directory, data for num, image in enumerate(self.images(), 1): url = image["imageURL"] image.update(data) + text.nameext_from_url(url, image) image["num"] = num - yield Message.Url, url, text.nameext_from_url(url, image) + image["extension"] = "webp" + del image["modelName"] + yield Message.Url, url, image def metadata(self): - self.data = self._data(self.root + self.path) - user = self.data["authorModel"] - imgs = self.data["photosGalleryModel"] + data = self.data = self._extract_data(self.root + self.groups[1]) + + gallery = data["galleryPage"] + info = gallery["infoProps"] + model = gallery["galleryModel"] + author = info["authorInfoProps"] return { "user": { - "id" : text.parse_int(user["id"]), - "url" : user["pageURL"], - "name" : user["name"], - "retired" : user["retired"], - "verified" : user["verified"], - "subscribers": user["subscribers"], + "id" : text.parse_int(model["userId"]), + "url" : author["authorLink"], + "name" : author["authorName"], + "verified" : True if author.get("verified") else False, + "subscribers": info["subscribeButtonProps"]["subscribers"], }, "gallery": { - "id" : text.parse_int(imgs["id"]), - "tags" : [c["name"] for c in imgs["categories"]], - "date" : text.parse_timestamp(imgs["created"]), - "views" : text.parse_int(imgs["views"]), - "likes" : text.parse_int(imgs["rating"]["likes"]), - "dislikes" : text.parse_int(imgs["rating"]["dislikes"]), - "title" : text.unescape(imgs["title"]), - "description": text.unescape(imgs["description"]), - "thumbnail" : imgs["thumbURL"], + "id" : text.parse_int(gallery["id"]), + "tags" : [t["label"] for t in info["categoriesTags"]], + "date" : text.parse_timestamp(model["created"]), + "views" : 
text.parse_int(model["views"]), + "likes" : text.parse_int(model["rating"]["likes"]), + "dislikes" : text.parse_int(model["rating"]["dislikes"]), + "title" : model["title"], + "description": model["description"], + "thumbnail" : model["thumbURL"], }, - "count": text.parse_int(imgs["quantity"]), + "count": text.parse_int(gallery["photosCount"]), } def images(self): @@ -83,17 +83,17 @@ class XhamsterGalleryExtractor(XhamsterExtractor): self.data = None while True: - for image in data["photosGalleryModel"]["photos"]: - del image["modelName"] - yield image + yield from data["photosGalleryModel"]["photos"] - pgntn = data["pagination"] - if pgntn["active"] == pgntn["maxPage"]: + pagination = data["galleryPage"]["paginationProps"] + if pagination["currentPageNumber"] >= pagination["lastPageNumber"]: return - url = pgntn["pageLinkTemplate"][:-3] + str(pgntn["next"]) - data = self._data(url) + url = (pagination["pageLinkTemplate"][:-3] + + str(pagination["currentPageNumber"] + 1)) + + data = self._extract_data(url) - def _data(self, url): + def _extract_data(self, url): page = self.request(url).text return util.json_loads(text.extr( page, "window.initials=", "</script>").rstrip("\n\r;")) @@ -105,12 +105,8 @@ class XhamsterUserExtractor(XhamsterExtractor): pattern = BASE_PATTERN + r"/users/([^/?#]+)(?:/photos)?/?(?:$|[?#])" example = "https://xhamster.com/users/USER/photos" - def __init__(self, match): - XhamsterExtractor.__init__(self, match) - self.user = match.group(2) - def items(self): - url = "{}/users/{}/photos".format(self.root, self.user) + url = "{}/users/{}/photos".format(self.root, self.groups[1]) data = {"_extractor": XhamsterGalleryExtractor} while url: |
