gallery_dl/extractor/xasiat.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102

# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://www.xasiat.com"""

from .common import Extractor, Message
from .. import text
import time

BASE_PATTERN = r"(?:https?://)?(?:www\.)?xasiat\.com((?:/fr|/ja)?/albums"


class XasiatExtractor(Extractor):
    category = "xasiat"
    directory_fmt = ("{category}", "{title}")
    archive_fmt = "{album_url}_{num}"
    root = "https://www.xasiat.com"

    def items(self):
        data = {"_extractor": XasiatAlbumExtractor}
        for url in self.posts():
            yield Message.Queue, url, data

    def posts(self):
        return self._pagination(self.groups[0])

    def _pagination(self, path, pnum=1):
        url = f"{self.root}{path}/"
        find_posts = text.re(r'class="item  ">\s*<a href="([^"]+)').findall

        while True:
            params = {
                "mode": "async",
                "function": "get_block",
                "block_id": "list_albums_common_albums_list",
                "sort_by": "post_date",
                "from": pnum,
                "_": int(time.time() * 1000),
            }

            page = self.request(url, params=params).text
            yield from find_posts(page)

            if "<span>Next</span>" in page:
                return

            pnum += 1


class XasiatAlbumExtractor(XasiatExtractor):
    subcategory = "album"
    pattern = rf"{BASE_PATTERN}/(\d+)/[^/?#]+)"
    example = "https://www.xasiat.com/albums/12345/TITLE/"

    def items(self):
        path, album_id = self.groups
        url = f"{self.root}{path}/"
        response = self.request(url)
        extr = text.extract_from(response.text)

        title = extr("<h1>", "<")
        info = extr('class="info-content"', "</div>")
        images = extr('class="images"', "</div>")

        urls = list(text.extract_iter(images, 'href="', '"'))
        categories = text.re(r'categories/[^"]+\">\s*(.+)\s*</a').findall(info)
        data = {
            "title": text.unescape(title),
            "model": text.re(
                r'top_models1"></i>\s*(.+)\s*</span').findall(info),
            "tags": text.re(
                r'tags/[^"]+\">\s*(.+)\s*</a').findall(info),
            "album_category": categories[0] if categories else "",
            "album_url": response.url,
            "album_id": text.parse_int(album_id),
            "count": len(urls),
        }

        yield Message.Directory, "", data
        for data["num"], url in enumerate(urls, 1):
            yield Message.Url, url, text.nameext_from_url(url[:-1], data)


class XasiatTagExtractor(XasiatExtractor):
    subcategory = "tag"
    pattern = rf"{BASE_PATTERN}/tags/[^/?#]+)"
    example = "https://www.xasiat.com/albums/tags/TAG/"


class XasiatCategoryExtractor(XasiatExtractor):
    subcategory = "category"
    pattern = rf"{BASE_PATTERN}/categories/[^/?#]+)"
    example = "https://www.xasiat.com/albums/categories/CATEGORY/"


class XasiatModelExtractor(XasiatExtractor):
    subcategory = "model"
    pattern = rf"{BASE_PATTERN}/models/[^/?#]+)"
    example = "https://www.xasiat.com/albums/models/MODEL/"