# -*- coding: utf-8 -*- # Copyright 2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extractors for https://thehentaiworld.com/""" from .common import Extractor, Message from .. import text, util import collections BASE_PATTERN = r"(?:https?://)?(?:www\.)?thehentaiworld\.com" class ThehentaiworldExtractor(Extractor): """Base class for thehentaiworld extractors""" category = "thehentaiworld" root = "https://thehentaiworld.com" filename_fmt = "{title} ({id}{num:?-//}).{extension}" archive_fmt = "{id}_{num}" request_interval = (0.5, 1.5) def items(self): for url in self.posts(): try: post = self._extract_post(url) except Exception as exc: self.status |= 1 self.log.warning("Failed to extract post %s (%s: %s)", url, exc.__class__.__name__, exc) continue if "file_urls" in post: urls = post["file_urls"] post["count"] = len(urls) yield Message.Directory, "", post for post["num"], url in enumerate(urls, 1): text.nameext_from_url(url, post) yield Message.Url, url, post else: yield Message.Directory, "", post url = post["file_url"] text.nameext_from_url(url, post) yield Message.Url, url, post def _extract_post(self, url): extr = text.extract_from(self.request(url).text) post = { "num" : 0, "count" : 1, "title" : text.unescape(extr("