gallery_dl/extractor/vichan.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109

# -*- coding: utf-8 -*-

# Copyright 2022-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for vichan imageboards"""

from .common import BaseExtractor, Message
from .. import text


class VichanExtractor(BaseExtractor):
    """Base class for vichan extractors"""
    # Common base category shared by every vichan site instance registered
    # through update() in this module; the thread and board extractors in
    # this file both derive from this class.
    basecategory = "vichan"


# Supported vichan site instances, keyed by category name. Each entry
# supplies the site's root URL and a regex matching its domain(s).
# The value returned by update() is used as the leading part of the URL
# patterns of the extractor classes in this module.
BASE_PATTERN = VichanExtractor.update({
    "8kun": {
        "root": "https://8kun.top",
        "pattern": r"8kun\.top",
    },
    "smugloli": {
        # No fixed root: the pattern accepts two domains (smuglo.li and
        # smugloli.net). Presumably the root is then derived from the
        # matched URL -- TODO confirm against BaseExtractor.
        "root": None,
        "pattern": r"smuglo(?:\.li|li\.net)",
    },
    "gurochan": {
        "root": "https://boards.guro.cx",
        "pattern": r"boards\.guro\.cx",
    },
})


class VichanThreadExtractor(VichanExtractor):
    """Extractor for vichan threads"""
    subcategory = "thread"
    directory_fmt = ("{category}", "{board}", "{thread} {title}")
    filename_fmt = "{time}{num:?-//} {filename}.{extension}"
    archive_fmt = "{board}_{thread}_{tim}"
    # Trailing groups: board name, then numeric thread ID.
    pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)"
    example = "https://8kun.top/a/res/12345.html"

    def items(self):
        """Yield a directory message followed by one URL message per file.

        Fetches '<root>/<board>/res/<thread>.json' and walks its "posts"
        list. Posts without a "filename" key are skipped. Files listed
        under a post's "extra_files" are yielded after the post's main
        file, numbered from 1 in the "num" field.
        """
        board = self.groups[-2]
        thread = self.groups[-1]
        url = f"{self.root}/{board}/res/{thread}.json"
        posts = self.request_json(url)["posts"]

        # Both "sub" (subject) and "com" (comment) may be absent from the
        # opening post, e.g. when it consists of an image only. Fall back
        # to an empty title instead of raising KeyError.
        op = posts[0]
        title = op.get("sub") or text.remove_html(op.get("com") or "")

        # 8kun serves its files from a separate media host
        process = (self._process_8kun if self.category == "8kun" else
                   self._process)
        data = {
            "board" : board,
            "thread": thread,
            "title" : text.unescape(title)[:50],
            "num"   : 0,
        }

        yield Message.Directory, "", data
        for post in posts:
            if "filename" not in post:
                continue

            # main file; merges thread metadata into 'post' ("num" == 0)
            yield process(post, data)

            # Additional files reuse the same 'post' dict: each entry's
            # fields are merged over the previous file's values, so the
            # yielded dict is only valid until the next one is produced.
            for num, filedata in enumerate(post.get("extra_files", ()), 1):
                post["num"] = num
                yield process(post, filedata)

    def _process(self, post, data):
        """Merge 'data' into 'post' and return a URL message for its file.

        Sets "extension" ("ext" without its first character) and "url"
        ('<root>/<board>/src/<tim><ext>') on 'post', which is modified
        in place and returned as the message's metadata.
        """
        post.update(data)
        ext = post["ext"]
        post["extension"] = ext[1:]
        post["url"] = url = \
            f"{self.root}/{post['board']}/src/{post['tim']}{ext}"
        return Message.Url, url, post

    def _process_8kun(self, post, data):
        """Variant of _process() building media.128ducks.com file URLs.

        'post' is modified in place and returned as the message's metadata.
        """
        post.update(data)
        ext = post["ext"]
        tim = post["tim"]

        # "tim" values longer than 16 characters are served from the
        # 'file_store' path; shorter ones use the per-board 'src' path.
        if len(tim) > 16:
            url = f"https://media.128ducks.com/file_store/{tim}{ext}"
        else:
            url = f"https://media.128ducks.com/{post['board']}/src/{tim}{ext}"

        post["url"] = url
        post["extension"] = ext[1:]
        return Message.Url, url, post


class VichanBoardExtractor(VichanExtractor):
    """Extractor for vichan boards"""
    subcategory = "board"
    # The last group captures the board name.
    pattern = BASE_PATTERN + r"/([^/?#]+)(?:/index|/catalog|/\d+|/?$)"
    example = "https://8kun.top/a/"

    def items(self):
        """Queue every thread listed in the board's 'threads.json'.

        Each thread entry is tagged with the number of the page it was
        listed on and handed to VichanThreadExtractor via a queue message.
        """
        board_url = f"{self.root}/{self.groups[-1]}"

        for page in self.request_json(f"{board_url}/threads.json"):
            for thread in page["threads"]:
                target = f"{board_url}/res/{thread['no']}.html"
                thread.update({
                    "page"      : page["page"],
                    "_extractor": VichanThreadExtractor,
                })
                yield Message.Queue, target, thread