gallery_dl/extractor/mememuseum.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120

# -*- coding: utf-8 -*-

# Copyright 2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://meme.museum/"""

from .common import Extractor, Message
from .. import text


class MememuseumExtractor(Extractor):
    """Common functionality shared by all meme.museum extractors"""
    basecategory = "booru"
    category = "mememuseum"
    filename_fmt = "{category}_{id}_{md5}.{extension}"
    archive_fmt = "{id}"
    root = "https://meme.museum"

    def items(self):
        """Yield a Directory and a Url message for every post

        Each post dict returned by posts() is normalized in place
        ('id', 'width' and 'height' go through text.parse_int(),
        'tags' through text.unquote()) and then extended with the
        result of metadata() before it is handed out.
        """
        extra = self.metadata()
        numeric_keys = ("id", "width", "height")

        for post in self.posts():
            file_url = post["file_url"]

            for name in numeric_keys:
                post[name] = text.parse_int(post[name])
            post["tags"] = text.unquote(post["tags"])

            # extractor-wide metadata is merged last, so it overrides
            # per-post values stored under the same key
            post.update(extra)

            yield Message.Directory, post

            # evaluated only once the Directory message has been handed out
            info = text.nameext_from_url(file_url, post)
            yield Message.Url, file_url, info

    def metadata(self):
        """Return metadata that gets merged into every post

        The default is an empty tuple, which dict.update() accepts
        as "nothing to add".
        """
        return ()

    def posts(self):
        """Return an iterable of post dicts; empty by default

        items() reads the keys 'file_url', 'id', 'width', 'height'
        and 'tags' from every element.
        """
        return ()


class MememuseumTagExtractor(MememuseumExtractor):
    """Extractor for images from meme.museum by search-tags"""
    subcategory = "tag"
    directory_fmt = ("{category}", "{search_tags}")
    pattern = r"(?:https?://)?meme\.museum/post/list/([^/?#]+)"
    test = ("https://meme.museum/post/list/animated/1", {
        "pattern": r"https://meme\.museum/_images/\w+/\d+%20-%20",
        "count": ">= 30"
    })
    # NOTE(review): not referenced anywhere in this class;
    # presumably the number of posts per listing page - confirm
    per_page = 25

    def __init__(self, match):
        """Store the percent-decoded tag segment of the matched URL"""
        MememuseumExtractor.__init__(self, match)
        self.tags = text.unquote(match.group(1))

    def metadata(self):
        """Return the decoded search tags as 'search_tags'"""
        return {"search_tags": self.tags}

    def posts(self):
        """Yield one dict per post found on the tag listing pages

        Pages are requested one after another, starting at page 1,
        until a page yields no '>Next<' marker.
        """
        # self.tags was percent-decoded in __init__(); encode it again
        # before building the page URL so that decoded characters such
        # as '?', '#' or '%' are not interpreted as URL syntax
        tags_path = text.quote(self.tags)

        pnum = 1
        while True:
            url = "{}/post/list/{}/{}".format(self.root, tags_path, pnum)
            extr = text.extract_from(self.request(url).text)

            while True:
                # an empty result means there are no more posts on this page
                mime = extr("data-mime='", "'")
                if not mime:
                    break

                pid = extr("data-post-id='", "'")
                # the title attribute is expected to consist of exactly
                # three fields separated by ' // '; anything else raises
                # ValueError here
                tags, dimensions, size = extr("title='", "'").split(" // ")
                md5 = extr("/_thumbs/", "/")
                width, _, height = dimensions.partition("x")

                yield {
                    # the file extension is the part of the MIME type
                    # after its last '/'
                    "file_url": "{}/_images/{}/{}%20-%20{}.{}".format(
                        self.root, md5, pid, text.quote(tags),
                        mime.rpartition("/")[2]),
                    "id": pid, "md5": md5, "tags": tags,
                    "width": width, "height": height,
                    # the last character is dropped before parsing;
                    # presumably the 'B' of a unit like 'KB' - confirm
                    "size": text.parse_bytes(size[:-1]),
                }

            if not extr(">Next<", ">"):
                return
            pnum += 1


class MememuseumPostExtractor(MememuseumExtractor):
    """Extractor for a single meme.museum post, addressed by its ID"""
    subcategory = "post"
    pattern = r"(?:https?://)?meme\.museum/post/view/(\d+)"
    test = ("https://meme.museum/post/view/10243", {
        "pattern": r"https://meme\.museum/_images/105febebcd5ca791ee332adc4997"
                   r"1f78/10243%20-%20g%20beard%20open_source%20richard_stallm"
                   r"an%20stallman%20tagme%20text\.jpg",
        "keyword": "3c8009251480cf17248c08b2b194dc0c4d59580e",
        "content": "45565f3f141fc960a8ae1168b80e718a494c52d2",
    })

    def __init__(self, match):
        """Remember the numeric post ID captured by 'pattern'"""
        MememuseumExtractor.__init__(self, match)
        self.post_id = match.group(1)

    def posts(self):
        """Return a 1-tuple holding the data of the requested post"""
        page_url = "{}/post/view/{}".format(self.root, self.post_id)
        extr = text.extract_from(self.request(page_url).text)

        # The order of these calls is kept exactly as is; presumably
        # extract_from() moves forward through the page with every
        # call - confirm before reordering.
        tags = extr(": ", "<")
        md5 = extr("/_thumbs/", "/")
        file_url = self.root + extr("id='main_image' src='", "'")
        # attribute values may be wrapped in either kind of quote
        width = extr("data-width=", " ").strip("'\"")
        height = extr("data-height=", " ").strip("'\"")

        post = {
            "id": self.post_id,
            "tags": tags,
            "md5": md5,
            "file_url": file_url,
            "width": width,
            "height": height,
            # this method always reports a size of 0
            "size": 0,
        }
        return (post,)