diff options
| author | 2019-07-02 04:33:45 -0400 | |
|---|---|---|
| committer | 2019-07-02 04:33:45 -0400 | |
| commit | 195c45911e79c33cf0bb986721365fb06df5a153 (patch) | |
| tree | ac0c9b6ef40bea7aa7ab0c5c3cb500eb510668fa /gallery_dl/extractor/paheal.py | |
Import Upstream version 1.8.7 (upstream/1.8.7)
Diffstat (limited to 'gallery_dl/extractor/paheal.py')
| -rw-r--r-- | gallery_dl/extractor/paheal.py | 120 |
1 files changed, 120 insertions, 0 deletions
# -*- coding: utf-8 -*-

# Reconstructed from the unified diff:
#   diff --git a/gallery_dl/extractor/paheal.py
#              b/gallery_dl/extractor/paheal.py
#   new file mode 100644, index 0000000..a4731d0, @@ -0,0 +1,120 @@

# Copyright 2018-2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract images from https://rule34.paheal.net/"""

from .common import Extractor, Message, SharedConfigMixin
from .. import text


class PahealExtractor(SharedConfigMixin, Extractor):
    """Base class for paheal extractors

    Subclasses supply the posts via get_posts() and, optionally, extra
    directory metadata via get_metadata(); items() turns both into the
    message stream.
    """
    basecategory = "booru"
    category = "paheal"
    filename_fmt = "{category}_{id}_{md5}.{extension}"
    archive_fmt = "{id}"
    # Every request URL in this module is built on this host, even though
    # the subclass patterns also match the rule63 and cosplay subdomains.
    root = "https://rule34.paheal.net"

    def items(self):
        """Yield a Version, a Directory, and one Url message per post

        Each post dict coming from get_posts() is normalized in place:
        "id", "width" and "height" are passed through text.parse_int()
        and "tags" through text.unquote(). The "file_url" value is used
        as the download URL.
        """
        yield Message.Version, 1
        yield Message.Directory, self.get_metadata()

        for data in self.get_posts():
            url = data["file_url"]
            for key in ("id", "width", "height"):
                data[key] = text.parse_int(data[key])
            data["tags"] = text.unquote(data["tags"])
            yield Message.Url, url, text.nameext_from_url(url, data)

    def get_metadata(self):
        """Return general metadata"""
        return {}

    def get_posts(self):
        """Return an iterable containing data of all relevant posts

        The base implementation has no posts. It returns an empty tuple
        (rather than falling through and returning None) so that items()
        can always iterate over the result.
        """
        return ()


class PahealTagExtractor(PahealExtractor):
    """Extractor for images from rule34.paheal.net by search-tags"""
    subcategory = "tag"
    directory_fmt = ("{category}", "{search_tags}")
    pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net"
               r"/post/list/([^/?&#]+)")
    test = ("https://rule34.paheal.net/post/list/k-on/1", {
        "pattern": r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20",
        "count": ">= 15"
    })
    # Not referenced anywhere else in this module.
    # NOTE(review): presumably the number of posts per result page -
    # confirm before relying on it.
    per_page = 70

    def __init__(self, match):
        """Store the URL-unquoted search tags captured by 'pattern'"""
        PahealExtractor.__init__(self, match)
        self.tags = text.unquote(match.group(1))

    def get_metadata(self):
        """Return the search tags used for the directory name"""
        return {"search_tags": self.tags}

    def get_posts(self):
        """Yield post data from all result pages of the tag search

        Pages are requested starting at page 1; iteration stops after
        the first page whose HTML does not contain '>Next<'.
        """
        pnum = 1
        while True:
            url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum)
            page = self.request(url).text

            # One fragment per thumbnail, starting right after the
            # 'thumb_' id prefix, so each fragment begins with the post ID.
            for post in text.extract_iter(
                    page, '<img id="thumb_', '>Image Only<'):
                yield self._extract_data(post)

            if ">Next<" not in page:
                return
            pnum += 1

    @staticmethod
    def _extract_data(post):
        """Build a post dict from one thumbnail HTML fragment

        'post' is a fragment as produced in get_posts(). The returned
        dict has the keys "id", "md5", "tags", "file_url", "width",
        "height" (all still strings here) and "size".

        Raises ValueError if the title attribute does not consist of
        exactly four ' // '-separated fields.
        """
        # The extractions are positional: each search starts where the
        # previous one ended, so their order must not be changed.
        pid , pos = text.extract(post, '', '"')
        data, pos = text.extract(post, 'title="', '"', pos)
        md5 , pos = text.extract(post, '/_thumbs/', '/', pos)
        url , pos = text.extract(post, '<a href="', '"', pos)

        # Title layout: tags // WIDTHxHEIGHT // size // <ignored field>
        tags, dimensions, size, _ = data.split(" // ")
        width, _, height = dimensions.partition("x")

        return {
            "id": pid, "md5": md5, "tags": tags, "file_url": url,
            "width": width, "height": height,
            # The last character of the size field is dropped before
            # parsing. NOTE(review): presumably a trailing unit letter
            # such as 'B' - confirm against the site markup.
            "size": text.parse_bytes(size[:-1]),
        }


class PahealPostExtractor(PahealExtractor):
    """Extractor for single images from rule34.paheal.net"""
    subcategory = "post"
    pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net"
               r"/post/view/(\d+)")
    test = ("https://rule34.paheal.net/post/view/481609", {
        "url": "1142779378f655ec0497d4c301836aa667f788b1",
        "keyword": "34e9e93d4fa6fa06fac1a56e78c9a52e8cd7b271",
        "content": "7b924bcf150b352ac75c9d281d061e174c851a11",
    })

    def __init__(self, match):
        """Store the numeric post ID captured by 'pattern'"""
        PahealExtractor.__init__(self, match)
        self.post_id = match.group(1)

    def get_posts(self):
        """Return a one-element tuple with the data of the single post

        The post page is fetched and scanned positionally. "size" is
        always 0 because no file size is extracted from this page.
        """
        url = "{}/post/view/{}".format(self.root, self.post_id)
        page = self.request(url).text

        # Tags are whatever sits between the first ': ' and the next '<'.
        # NOTE(review): presumably the page <title> - confirm.
        tags  , pos = text.extract(page, ": ", "<")
        md5   , pos = text.extract(page, "/_thumbs/", "/", pos)
        url   , pos = text.extract(page, "id='main_image' src='", "'", pos)
        width , pos = text.extract(page, "data-width='", "'", pos)
        height, pos = text.extract(page, "data-height='", "'", pos)

        return ({
            "id": self.post_id, "md5": md5, "tags": tags, "file_url": url,
            "width": width, "height": height, "size": 0,
        },)
