path: root/gallery_dl/extractor/4chanarchives.py
# -*- coding: utf-8 -*-

# Copyright 2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://4chanarchives.com/"""

from .common import Extractor, Message
from .. import text


class _4chanarchivesThreadExtractor(Extractor):
    """Extractor for threads on 4chanarchives.com"""
    category = "4chanarchives"
    subcategory = "thread"
    root = "https://4chanarchives.com"
    directory_fmt = ("{category}", "{board}", "{thread} - {title}")
    filename_fmt = "{no}-{filename}.{extension}"
    archive_fmt = "{board}_{thread}_{no}"
    referer = False
    pattern = r"(?:https?://)?4chanarchives\.com/board/([^/?#]+)/thread/(\d+)"
    example = "https://4chanarchives.com/board/a/thread/12345/"

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.board, self.thread = match.groups()

    def items(self):
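        """Fetch the thread page and yield Directory/Url messages for its posts"""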
        url = "{}/board/{}/thread/{}".format(
            self.root, self.board, self.thread)
        page = self.request(url).text
        data = self.metadata(page)
        posts = self.posts(page)

        if not data["title"]:
            data["title"] = text.unescape(text.remove_html(
                posts[0]["com"]))[:50]

        for post in posts:
            post.update(data)
            yield Message.Directory, post
            if "url" in post:
                yield Message.Url, post["url"], post

    def metadata(self, page):
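        """Collect metadata shared by all posts: board, thread ID, and title"""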
        return {
            "board"     : self.board,
            "thread"    : self.thread,
            "title"     : text.unescape(text.extr(
                page, 'property="og:title" content="', '"')),
        }

    def posts(self, page):
        """Build a list of all post objects"""
        return [self.parse(html) for html in text.extract_iter(
            page, 'id="pc', '</blockquote>')]

    def parse(self, html):
        """Build post object by extracting data from an HTML post"""
        post = self._extract_post(html)
        if ">File: <" in html:
            self._extract_file(html, post)
            post["extension"] = post["url"].rpartition(".")[2]
        return post

    @staticmethod
    def _extract_post(html):
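        """Extract post number, name, timestamp, and comment from a post's HTML"""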
        extr = text.extract_from(html)
        return {
            "no"  : text.parse_int(extr('', '"')),
            "name": extr('class="name">', '<'),
            "time": extr('class="dateTime postNum" >', '<').rstrip(),
            "com" : text.unescape(
                html[html.find('<blockquote'):].partition(">")[2]),
        }

    @staticmethod
    def _extract_file(html, post):
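        """Add file URL, filename, size, and dimensions to 'post'"""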
        extr = text.extract_from(html, html.index(">File: <"))
        post["url"] = extr('href="', '"')
        post["filename"] = text.unquote(extr(">", "<").rpartition(".")[0])
        post["fsize"] = extr("(", ", ")
        post["w"] = text.parse_int(extr("", "x"))
        post["h"] = text.parse_int(extr("", ")"))


class _4chanarchivesBoardExtractor(Extractor):
    """Extractor for boards on 4chanarchives.com"""
    category = "4chanarchives"
    subcategory = "board"
    root = "https://4chanarchives.com"
    pattern = r"(?:https?://)?4chanarchives\.com/board/([^/?#]+)(?:/(\d+))?/?$"
    example = "https://4chanarchives.com/board/a/"

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.board, self.page = match.groups()

    def items(self):
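        """Paginate over board index pages and enqueue each thread URL"""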
        data = {"_extractor": _4chanarchivesThreadExtractor}
        pnum = text.parse_int(self.page, 1)
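        # HTML fragment that precedes each thread link on a board index page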
        needle = '''<span class="postNum desktop">
                        <span><a href="'''

        while True:
            url = "{}/board/{}/{}".format(self.root, self.board, pnum)
            page = self.request(url).text

            thread = None
            for thread in text.extract_iter(page, needle, '"'):
                yield Message.Queue, thread, data

            if thread is None:
                return
            pnum += 1