diff options
Diffstat (limited to 'gallery_dl/extractor/thehentaiworld.py')
| -rw-r--r-- | gallery_dl/extractor/thehentaiworld.py | 139 |
1 files changed, 139 insertions, 0 deletions
diff --git a/gallery_dl/extractor/thehentaiworld.py b/gallery_dl/extractor/thehentaiworld.py new file mode 100644 index 0000000..055d7d8 --- /dev/null +++ b/gallery_dl/extractor/thehentaiworld.py @@ -0,0 +1,139 @@ +# -*- coding: utf-8 -*- + +# Copyright 2025 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://thehentaiworld.com/""" + +from .common import Extractor, Message +from .. import text, util +import collections + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?thehentaiworld\.com" + + +class ThehentaiworldExtractor(Extractor): + """Base class for thehentaiworld extractors""" + category = "thehentaiworld" + root = "https://thehentaiworld.com" + filename_fmt = "{title} ({id}{num:?-//}).{extension}" + archive_fmt = "{id}_{num}" + request_interval = (0.5, 1.5) + + def items(self): + for url in self.posts(): + try: + post = self._extract_post(url) + except Exception as exc: + self.status |= 1 + self.log.warning("Failed to extract post %s (%s: %s)", + url, exc.__class__.__name__, exc) + continue + + if "file_urls" in post: + urls = post["file_urls"] + post["count"] = len(urls) + yield Message.Directory, post + for post["num"], url in enumerate(urls, 1): + text.nameext_from_url(url, post) + yield Message.Url, url, post + else: + yield Message.Directory, post + url = post["file_url"] + text.nameext_from_url(url, post) + yield Message.Url, url, post + + def _extract_post(self, url): + extr = text.extract_from(self.request(url).text) + + post = { + "num" : 0, + "count" : 1, + "title" : text.unescape(extr("<title>", "<").strip()), + "id" : text.parse_int(extr(" postid-", " ")), + "slug" : extr(" post-", '"'), + "tags" : extr('id="tagsHead">', "</ul>"), + "date" : text.parse_datetime(extr( + "<li>Posted: ", "<"), "%Y-%m-%d"), + } + + if "/videos/" in url: + post["type"] = "video" + post["width"] = post["height"] = 0 + post["votes"] = text.parse_int(extr("(<strong>", "</strong>")) + post["score"] = text.parse_float(extr("<strong>", "<")) + post["file_url"] = extr('<source src="', '"') + else: + post["type"] = "image" + post["width"] = text.parse_int(extr("<li>Size: ", " ")) + post["height"] = text.parse_int(extr("x ", "<")) + post["file_url"] = extr('a href="', '"') + post["votes"] = text.parse_int(extr("(<strong>", "</strong>")) + post["score"] = text.parse_float(extr("<strong>", "<")) + + if doujin := extr('<a id="prev-page"', "</div></div><"): + repl = text.re(r"-220x\d+\.").sub + post["file_urls"] = [ + repl(".", url) + for url in text.extract_iter( + doujin, 'class="border" src="', '"') + ] + + tags = collections.defaultdict(list) + pattern = text.re(r'<li><a class="([^"]*)" href="[^"]*">([^<]+)') + for tag_type, tag_name in pattern.findall(post["tags"]): + tags[tag_type].append(tag_name) + post["tags"] = tags_list = [] + for key, value in tags.items(): + tags_list.extend(value) + post[f"tags_{key}" if key else "tags_general"] = value + + return post + + def _pagination(self, endpoint): + base = f"{self.root}{endpoint}" + pnum = self.page_start + + while True: + url = base if pnum < 2 else f"{base}page/{pnum}/" + page = self.request(url).text + + yield from text.extract_iter(text.extr( + page, 'id="thumbContainer"', "<script"), ' href="', '"') + + if 'class="next"' not in page: + return + pnum += 1 + + +class ThehentaiworldPostExtractor(ThehentaiworldExtractor): + subcategory = "post" + pattern = (rf"{BASE_PATTERN}" + rf"(/(?:(?:3d-cgi-)?hentai-image|video)s/([^/?#]+))") + example = "https://thehentaiworld.com/hentai-images/SLUG/" + + def posts(self): + return (f"{self.root}{self.groups[0]}/",) + + +class ThehentaiworldTagExtractor(ThehentaiworldExtractor): + subcategory = "tag" + per_page = 24 + page_start = 1 + post_start = 0 + directory_fmt = ("{category}", "{search_tags}") + pattern = rf"{BASE_PATTERN}/tag/([^/?#]+)" + example = "https://thehentaiworld.com/tag/TAG/" + + def posts(self): + self.kwdict["search_tags"] = tag = self.groups[0] + return util.advance(self._pagination(f"/tag/{tag}/"), self.post_start) + + def skip(self, num): + pages, posts = divmod(num, self.per_page) + self.page_start += pages + self.post_start += posts + return num |
