gallery_dl/extractor/pictoa.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78

# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://pictoa.com/"""

from .common import Extractor, Message
from .. import text

# URL prefix shared by every pictoa pattern in this module; the pieces below
# are concatenated into a single regular expression string.
BASE_PATTERN = (
    r"(?:https?://)?"          # optional "http://" or "https://" scheme
    r"(?:[\w]+\.)?"            # optional single subdomain label, e.g. "www."
    r"pictoa\.com(?:\.de)?"    # "pictoa.com", optionally followed by ".de"
)


class PictoaExtractor(Extractor):
    """Base class for pictoa extractors"""
    # Identification shared by all subclasses in this module.
    category = "pictoa"
    root = "https://pictoa.com"

    # Output layout: one directory per album, one file per image id.
    # The placeholders are filled from the metadata dicts the subclasses yield.
    archive_fmt = "{id}"
    filename_fmt = "{id}.{extension}"
    directory_fmt = (
        "{category}",
        "{album_id} {album_title}",
    )


class PictoaImageExtractor(PictoaExtractor):
    """Extractor for single images from pictoa.com"""
    subcategory = "image"
    # Group 1: numeric album id (an optional "NAME-" slug before it is
    # skipped); group 2: numeric image id.
    pattern = BASE_PATTERN + r"/albums/(?:[\w-]+-)?(\d+)/(\d+)"
    example = "https://www.pictoa.com/albums/NAME-12345/12345.html"

    def items(self):
        """Yield a Directory message and a Url message for one image.

        The image page is requested under 'root' using only the numeric
        album and image ids; title and file URL are read from the page's
        'og:title' and 'og:image' meta properties.
        """
        album_id, image_id = self.groups

        url = "{}/albums/{}/{}.html".format(self.root, album_id, image_id)
        page = self.request(url).text
        og_title = text.extr(page, 'property="og:title" content="', '"')
        image_url = text.extr(page, 'property="og:image" content="', '"')

        # The og:title presumably looks like "<album title> #<n>"; keep
        # everything before the last " #". str.rpartition() returns an empty
        # head when the separator is missing, so fall back to the whole
        # title in that case instead of discarding it.
        album_title, separator, _ = og_title.rpartition(" #")
        if not separator:
            album_title = og_title

        data = {
            "album_id"   : album_id,
            "album_title": album_title,
            "id"         : image_id,
            "url"        : image_url,
        }

        # Adds filename/extension fields derived from the image URL.
        text.nameext_from_url(image_url, data)
        yield Message.Directory, data
        yield Message.Url, image_url, data


class PictoaAlbumExtractor(PictoaExtractor):
    """Extractor for image albums from pictoa.com"""
    subcategory = "album"
    # Group 1: numeric album id (an optional "NAME-" slug before it is
    # skipped). The dot before "html" is escaped so that it only matches a
    # literal "." rather than any character.
    pattern = BASE_PATTERN + r"/albums/(?:[\w-]+-)?(\d+)\.html"
    example = "https://www.pictoa.com/albums/NAME-12345.html"

    def items(self):
        """Yield a Queue message for every image page linked from the album.

        Follows the page's '<link rel="next">' URL until a page without one
        is reached.
        """
        album_id = self.groups[0]
        url = "{}/albums/{}.html".format(self.root, album_id)
        page = self.request(url).text

        # Metadata is taken from the first album page only and shared by
        # every queued image URL.
        album_data = {
            "album_id"   : album_id,
            "album_title": text.extr(page, "<h1>", "<"),
            # [1:] drops the first fragment of the related-categories list;
            # presumably the remainder of the opening <ol ...> tag or a
            # heading entry -- TODO confirm against a live page.
            "tags"       : text.split_html(text.extr(
                page, '<ol class="related-categories', '</ol>'))[1:],
            # Names the extractor class for the queued image page URLs.
            "_extractor" : PictoaImageExtractor,
        }

        while True:
            # Only links located between <main> and the "flag" span are
            # considered.
            container = text.extr(page, '<main>', '<span id="flag" >')
            for image_page_url in text.extract_iter(
                    container, '<a rel="nofollow" href="', '"'):
                yield Message.Queue, image_page_url, album_data

            next_url = text.extr(page, '<link rel="next" href="', '"')
            if not next_url:
                break
            page = self.request(next_url).text