diff options
| author | 2025-03-29 07:20:04 -0400 | |
|---|---|---|
| committer | 2025-03-29 07:20:04 -0400 | |
| commit | 5ea6cce4fb40d2cc4f1d7849e44e6825ac2f3a73 (patch) | |
| tree | 2d7040d732323306b2227682068ed5c9e12d4bf0 /gallery_dl/extractor/imhentai.py | |
| parent | 68863e88e0e0d8c08a8631831c05c302527627b1 (diff) | |
| parent | 662e5ac868a5c1a3e7bc95b37054b3a0ca4db74f (diff) | |
Update upstream source from tag 'upstream/1.29.3'
Update to upstream version '1.29.3'
with Debian dir 131b9b3bdbc67af5fe84f139a5b499a550f7c22b
Diffstat (limited to 'gallery_dl/extractor/imhentai.py')
| -rw-r--r-- | gallery_dl/extractor/imhentai.py | 50 |
1 file changed, 36 insertions, 14 deletions
```diff
diff --git a/gallery_dl/extractor/imhentai.py b/gallery_dl/extractor/imhentai.py
index 0439f5b..1b0fba3 100644
--- a/gallery_dl/extractor/imhentai.py
+++ b/gallery_dl/extractor/imhentai.py
@@ -22,10 +22,15 @@ class ImhentaiExtractor(BaseExtractor):
 
         while True:
             page = self.request(url).text
+
+            pos = page.find('class="ranking_list"')
+            if pos >= 0:
+                page = page[:pos]
+
             extr = text.extract_from(page)
 
             while True:
-                gallery_id = extr('<a href="/gallery/', '"')
+                gallery_id = extr('href="/gallery/', '"')
                 if gallery_id == prev:
                     continue
                 if not gallery_id:
@@ -57,6 +62,18 @@ BASE_PATTERN = ImhentaiExtractor.update({
         "root": "https://hentairox.com",
         "pattern": r"(?:www\.)?hentairox\.com",
     },
+    "hentaifox": {
+        "root": "https://hentaifox.com",
+        "pattern": r"(?:www\.)?hentaifox\.com",
+    },
+    "hentaienvy": {
+        "root": "https://hentaienvy.com",
+        "pattern": r"(?:www\.)?hentaienvy\.com",
+    },
+    "hentaizap": {
+        "root": "https://hentaizap.com",
+        "pattern": r"(?:www\.)?hentaizap\.com",
+    },
 })
 
 
@@ -72,17 +89,20 @@ class ImhentaiGalleryExtractor(ImhentaiExtractor, GalleryExtractor):
 
     def metadata(self, page):
         extr = text.extract_from(page)
+        title = extr("<h1>", "<")
+        title_alt = extr('class="subtitle">', "<")
+        end = "</li>" if extr('<ul class="galleries_info', ">") else "</ul>"
 
         data = {
             "gallery_id": text.parse_int(self.gallery_id),
-            "title"     : text.unescape(extr("<h1>", "<")),
-            "title_alt" : text.unescape(extr('class="subtitle">', "<")),
-            "parody"    : self._split(extr(">Parodies", "</li>")),
-            "character" : self._split(extr(">Characters", "</li>")),
-            "tags"      : self._split(extr(">Tags", "</li>")),
-            "artist"    : self._split(extr(">Artists", "</li>")),
-            "group"     : self._split(extr(">Groups", "</li>")),
-            "language"  : self._split(extr(">Languages", "</li>")),
+            "title"     : text.unescape(title),
+            "title_alt" : text.unescape(title_alt),
+            "parody"    : self._split(extr(">Parodies", end)),
+            "character" : self._split(extr(">Characters", end)),
+            "tags"      : self._split(extr(">Tags", end)),
+            "artist"    : self._split(extr(">Artists", end)),
+            "group"     : self._split(extr(">Groups", end)),
+            "language"  : self._split(extr(">Languages", end)),
             "type"      : extr("href='/category/", "/"),
         }
 
@@ -94,10 +114,12 @@ class ImhentaiGalleryExtractor(ImhentaiExtractor, GalleryExtractor):
     def _split(self, html):
         results = []
         for tag in text.extract_iter(html, ">", "</a>"):
-            tag = tag.partition(" <span class='badge'>")[0]
-            if "<" in tag:
-                tag = text.remove_html(tag)
+            badge = ("badge'>" in tag or "class='badge" in tag)
+            tag = text.remove_html(tag)
+            if badge:
+                tag = tag.rpartition(" ")[0]
             results.append(tag)
+        results.sort()
         return results
 
     def images(self, page):
@@ -132,9 +154,9 @@ class ImhentaiTagExtractor(ImhentaiExtractor):
 class ImhentaiSearchExtractor(ImhentaiExtractor):
     """Extractor for imhentai search results"""
     subcategory = "search"
-    pattern = BASE_PATTERN + r"/search/?\?([^#]+)"
+    pattern = BASE_PATTERN + r"/search(/?\?[^#]+|/[^/?#]+/?)"
     example = "https://imhentai.xxx/search/?key=QUERY"
 
     def items(self):
-        url = self.root + "/search/?" + self.groups[-1]
+        url = self.root + "/search" + self.groups[-1]
         return self._pagination(url)
```
