# gallery_dl/extractor/ahottie.py

# -*- coding: utf-8 -*-

# Copyright 2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://ahottie.top/"""

from .common import Extractor, GalleryExtractor, Message
from .. import text

# Regex prefix prepended to every extractor 'pattern' in this module:
# an optional "http(s)://" scheme and an optional "www." subdomain,
# followed by the literal "ahottie.top" host name.
BASE_PATTERN = r"(?:https?://)?(?:www\.)?ahottie\.top"


class AhottieExtractor(Extractor):
    """Common base for the ahottie.top extractors"""
    category = "ahottie"
    root = "https://ahottie.top"

    def items(self):
        """Emit one Message.Queue entry per album from self.albums()"""
        for entry in self.albums():
            yield Message.Queue, entry["url"], entry

    def _pagination(self, url, params):
        """Yield one metadata dict per album found on the pages of 'url'

        'params' is used as query parameters and is modified in place:
        its "page" entry is normalized through text.parse_int() (with 1
        passed as the fallback argument) and incremented for every
        further page that gets requested.
        """
        params["page"] = text.parse_int(params.get("page"), 1)

        while True:
            html = self.request(url, params=params).text

            # every album is expected inside its own 'relative' <div>
            snippets = text.extract_iter(
                html, '<div class="relative">', '</div>')
            for snippet in snippets:
                href = text.extr(snippet, ' href="', '"')
                title = text.unquote(text.extr(snippet, ' alt="', '"'))
                timestamp = text.extr(snippet, ' datetime="', '"')
                yield {
                    "url"  : href,
                    "title": title,
                    "date" : self.parse_datetime_iso(timestamp),
                    "_extractor": AhottieGalleryExtractor,
                }

            # keep going only while the page advertises a rel="next" link
            if 'rel="next"' in html:
                params["page"] += 1
            else:
                return


class AhottieGalleryExtractor(GalleryExtractor, AhottieExtractor):
    """Extractor for a single ahottie.top album page"""
    directory_fmt = ("{category}", "{date:%Y-%m-%d} {title} ({gallery_id})")
    filename_fmt = "{num:>03}.{extension}"
    archive_fmt = "{gallery_id}_{num}_{filename}"
    pattern = BASE_PATTERN + r"(/albums/(\w+))"
    example = "https://ahottie.top/albums/1234567890"

    def metadata(self, page):
        """Collect album metadata from the HTML in 'page'

        Returns a dict with "gallery_id" (second regex group of
        'pattern'), "title", "date", and "tags".
        """
        # NOTE(review): 'extr' presumably consumes the page front to back,
        # so the calls below are kept in their original order -- confirm
        extr = text.extract_from(page)

        gallery_id = self.groups[1]
        # keep only the part of <title> before the last " | " separator
        raw_title = extr("<title>", "<")
        title = text.unescape(raw_title.rpartition(" | ")[0])
        date = self.parse_datetime_iso(extr('datetime="', '"'))
        # the first item produced by split_html() is discarded
        tags = text.split_html(extr('<i ', '</div>'))[1:]

        return {
            "gallery_id": gallery_id,
            "title": title,
            "date" : date,
            "tags" : tags,
        }

    def images(self, page):
        """Return a list of (url, data) tuples for the images in 'page'

        Every tuple shares the same 'data' dict object.
        """
        shared = {
            # NOTE(review): presumably disables the Referer header -- confirm
            "_http_headers" : {"Referer": None},
            "_http_validate": self._validate,
        }
        # begin searching just past the start of the first "<time " tag;
        # find() returning -1 makes this 0, i.e. the start of the page
        start = page.find("<time ") + 1
        sources = text.extract_iter(page, '" src="', '"', start)
        return [(src, shared) for src in sources]

    def _validate(self, response):
        """Return False for a 2421-byte 'image/jpeg' response, else True

        NOTE(review): that size/type pair presumably identifies a
        placeholder image served by the site -- confirm.
        """
        headers = response.headers
        if headers.get("content-length") != "2421":
            return True
        return headers.get("content-type") != "image/jpeg"


class AhottieTagExtractor(AhottieExtractor):
    """Extractor for the albums listed under an ahottie.top tag"""
    subcategory = "tag"
    pattern = BASE_PATTERN + r"/tags/([^/?#]+)"
    example = "https://ahottie.top/tags/TAG"

    def albums(self):
        """Return the paginated albums for the tag captured from the URL"""
        raw_tag = self.groups[0]
        # metadata stores the unquoted tag; the request URL keeps it as-is
        self.kwdict["search_tags"] = text.unquote(raw_tag)
        listing_url = f"{self.root}/tags/{raw_tag}"
        return self._pagination(listing_url, {})


class AhottieSearchExtractor(AhottieExtractor):
    """Extractor for ahottie.top search results"""
    subcategory = "search"
    pattern = BASE_PATTERN + r"/search/?\?([^#]+)"
    example = "https://ahottie.top/search?kw=QUERY"

    def albums(self):
        """Return the paginated albums matching the URL's query string"""
        query = text.parse_query(self.groups[0])
        # the "kw" parameter (None when absent) is exposed as metadata
        self.kwdict["search_tags"] = query.get("kw")
        search_url = f"{self.root}/search"
        return self._pagination(search_url, query)