1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
|
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://tumblrgallery.xyz/"""
from .common import GalleryExtractor
from .. import text
BASE_PATTERN = r"(?:https?://)?tumblrgallery\.xyz"
class TumblrgalleryExtractor(GalleryExtractor):
"""Base class for tumblrgallery extractors"""
category = "tumblrgallery"
filename_fmt = "{category}_{gallery_id}_{num:>03}_{id}.{extension}"
directory_fmt = ("{category}", "{gallery_id} {title}")
root = "https://tumblrgallery.xyz"
@staticmethod
def _urls_from_page(page):
return text.extract_iter(
page, '<div class="report"> <a class="xx-co-me" href="', '"')
@staticmethod
def _data_from_url(url):
filename = text.nameext_from_url(url)["filename"]
parts = filename.split("_")
try:
return {"id": parts[1] if parts[1] != "inline" else parts[2]}
except IndexError:
return {"id": filename}
class TumblrgalleryTumblrblogExtractor(TumblrgalleryExtractor):
"""Extractor for Tumblrblog on tumblrgallery.xyz"""
subcategory = "tumblrblog"
pattern = BASE_PATTERN + r"(/tumblrblog/gallery/(\d+)\.html)"
example = "https://tumblrgallery.xyz/tumblrblog/gallery/12345.html"
def __init__(self, match):
TumblrgalleryExtractor.__init__(self, match)
self.gallery_id = text.parse_int(match.group(2))
def metadata(self, page):
return {
"title" : text.unescape(text.extr(page, "<h1>", "</h1>")),
"gallery_id": self.gallery_id,
}
def images(self, _):
page_num = 1
while True:
url = "{}/tumblrblog/gallery/{}/{}.html".format(
self.root, self.gallery_id, page_num)
response = self.request(url, allow_redirects=False, fatal=False)
if response.status_code >= 300:
return
for url in self._urls_from_page(response.text):
yield url, self._data_from_url(url)
page_num += 1
class TumblrgalleryPostExtractor(TumblrgalleryExtractor):
"""Extractor for Posts on tumblrgallery.xyz"""
subcategory = "post"
pattern = BASE_PATTERN + r"(/post/(\d+)\.html)"
example = "https://tumblrgallery.xyz/post/12345.html"
def __init__(self, match):
TumblrgalleryExtractor.__init__(self, match)
self.gallery_id = text.parse_int(match.group(2))
def metadata(self, page):
return {
"title" : text.remove_html(
text.unescape(text.extr(page, "<title>", "</title>"))
).replace("_", "-"),
"gallery_id": self.gallery_id,
}
def images(self, page):
for url in self._urls_from_page(page):
yield url, self._data_from_url(url)
class TumblrgallerySearchExtractor(TumblrgalleryExtractor):
"""Extractor for Search result on tumblrgallery.xyz"""
subcategory = "search"
filename_fmt = "{category}_{num:>03}_{gallery_id}_{id}_{title}.{extension}"
directory_fmt = ("{category}", "{search_term}")
pattern = BASE_PATTERN + r"(/s\.php\?q=([^&#]+))"
example = "https://tumblrgallery.xyz/s.php?q=QUERY"
def __init__(self, match):
TumblrgalleryExtractor.__init__(self, match)
self.search_term = match.group(2)
def metadata(self, page):
return {
"search_term": self.search_term,
}
def images(self, _):
page_url = "s.php?q=" + self.search_term
while True:
page = self.request(self.root + "/" + page_url).text
for gallery_id in text.extract_iter(
page, '<div class="title"><a href="post/', '.html'):
url = "{}/post/{}.html".format(self.root, gallery_id)
post_page = self.request(url).text
for url in self._urls_from_page(post_page):
data = self._data_from_url(url)
data["gallery_id"] = gallery_id
data["title"] = text.remove_html(text.unescape(
text.extr(post_page, "<title>", "</title>")
)).replace("_", "-")
yield url, data
next_url = text.extr(
page, '</span> <a class="btn btn-primary" href="', '"')
if not next_url or page_url == next_url:
return
page_url = next_url
|