diff options
Diffstat (limited to 'gallery_dl/extractor/booru.py')
| -rw-r--r-- | gallery_dl/extractor/booru.py | 381 |
1 files changed, 191 insertions, 190 deletions
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index 0176d76..517df93 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -1,247 +1,248 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2020 Mike Fährmann +# Copyright 2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Base classes for extractors for danbooru and co""" +"""Extractors for *booru sites""" + +from .common import Extractor, Message, generate_extractors +from .. import text, util, exception -from .common import Extractor, Message, SharedConfigMixin -from .. import text, exception from xml.etree import ElementTree import collections -import datetime -import operator import re -class BooruExtractor(SharedConfigMixin, Extractor): - """Base class for all booru extractors""" +class BooruExtractor(Extractor): + """Base class for *booru extractors""" basecategory = "booru" filename_fmt = "{category}_{id}_{md5}.{extension}" - api_url = "" - post_url = "" - per_page = 50 - page_start = 1 - page_limit = None - sort = False + page_start = 0 + per_page = 100 - def __init__(self, match): - super().__init__(match) - self.params = {} - self.extags = self.post_url and self.config("tags", False) + def items(self): + self.login() + extended_tags = self.config("tags", False) + data = self.metadata() + for post in self.posts(): + try: + url = self._prepare_post(post, extended_tags) + except KeyError: + continue + post.update(data) + text.nameext_from_url(url, post) + yield Message.Directory, post + yield Message.Url, url, post def skip(self, num): pages = num // self.per_page - if self.page_limit and pages + self.page_start > self.page_limit: - pages = self.page_limit - self.page_start self.page_start += pages return pages * self.per_page - def items(self): - yield Message.Version, 1 - data = self.get_metadata() + def login(self): + """Login and set necessary cookies""" - self.reset_page() - while True: - images = self.parse_response( - self.request(self.api_url, params=self.params)) - - for image in images: - try: - url = self.get_file_url(image) - except KeyError: - continue - if url.startswith("/"): - url = text.urljoin(self.api_url, url) - image.update(data) - text.nameext_from_url(url, image) - if self.extags: - self.extended_tags(image) - yield Message.Directory, image - yield Message.Url, url, image - - if len(images) < self.per_page: - return - self.update_page(image) + def metadata(self): + """Return a dict with general metadata""" + return () - def reset_page(self): - """Initialize params to point to the first page""" - self.params["page"] = self.page_start + def posts(self): + """Return an iterable with post objects""" + return () - def update_page(self, data): - """Update params to point to the next page""" + def _prepare_post(self, post, extended_tags=False): + url = post["file_url"] + if url[0] == "/": + url = self.root + url + if extended_tags: + self._fetch_extended_tags(post) + post["date"] = text.parse_datetime( + post["created_at"], "%a %b %d %H:%M:%S %z %Y") + return url - def parse_response(self, response): - """Parse JSON API response""" - images = response.json() - if self.sort: - images.sort(key=operator.itemgetter("score", "id"), - reverse=True) - return images + def _fetch_extended_tags(self, post, page=None): + if not page: + url = "{}/index.php?page=post&s=view&id={}".format( + self.root, post["id"]) + page = self.request(url).text + html = text.extract(page, '<ul id="tag-', '</ul>')[0] + if html: + tags = collections.defaultdict(list) + pattern = re.compile( + r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S) + for tag_type, tag_name in pattern.findall(html): + tags[tag_type].append(text.unquote(tag_name)) + for key, value in tags.items(): + post["tags_" + key] = " ".join(value) + + def _api_request(self, params): + url = self.root + "/index.php?page=dapi&s=post&q=index" + return ElementTree.fromstring(self.request(url, params=params).text) + + def _pagination(self, params): + params["pid"] = self.page_start + params["limit"] = self.per_page - def get_metadata(self): - """Collect metadata for extractor-job""" - return {} + while True: + root = self._api_request(params) + for post in root: + yield post.attrib - @staticmethod - def get_file_url(image): - return image["file_url"] + if len(root) < self.per_page: + return + params["pid"] += 1 - def extended_tags(self, image, page=None): - """Retrieve extended tag information""" - if not page: - url = self.post_url.format(image["id"]) - page = self.request(url).text - tags = collections.defaultdict(list) - tags_html = text.extract(page, '<ul id="tag-', '</ul>')[0] - pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"']+)", re.S) - for tag_type, tag_name in pattern.findall(tags_html or ""): - tags[tag_type].append(text.unquote(tag_name)) - for key, value in tags.items(): - image["tags_" + key] = " ".join(value) - - -class XmlParserMixin(): - """Mixin for XML based API responses""" - def parse_response(self, response): - root = ElementTree.fromstring(response.text) - return [post.attrib for post in root] - - -class MoebooruPageMixin(): - """Pagination for Moebooru and Danbooru v1""" - def update_page(self, data): - if self.page_limit: - self.params["page"] = None - self.params["before_id"] = data["id"] - else: - self.params["page"] += 1 - - -class GelbooruPageMixin(): - """Pagination for Gelbooru-like sites""" - page_start = 0 - def reset_page(self): - self.params["pid"] = self.page_start +class BooruPostExtractor(BooruExtractor): + subcategory = "post" + archive_fmt = "{id}" + pattern_fmt = r"/index\.php\?page=post&s=view&id=(\d+)" - def update_page(self, data): - self.params["pid"] += 1 + def __init__(self, match): + BooruExtractor.__init__(self, match) + self.post_id = match.group(1) + def posts(self): + return self._pagination({"id": self.post_id}) -class TagMixin(): - """Extraction of images based on search-tags""" + +class BooruTagExtractor(BooruExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" + pattern_fmt = r"/index\.php\?page=post&s=list&tags=([^&#]+)" def __init__(self, match): - super().__init__(match) - self.tags = text.unquote(match.group("tags").replace("+", " ")) - self.params["tags"] = self.tags - self.params["limit"] = self.per_page + BooruExtractor.__init__(self, match) + self.tags = text.unquote(match.group(1).replace("+", " ")) - def get_metadata(self): + def metadata(self): return {"search_tags": self.tags} + def posts(self): + return self._pagination({"tags" : self.tags}) + -class PoolMixin(): - """Extraction of image-pools""" +class BooruPoolExtractor(BooruExtractor): subcategory = "pool" directory_fmt = ("{category}", "pool", "{pool}") archive_fmt = "p_{pool}_{id}" + pattern_fmt = r"/index\.php\?page=pool&s=show&id=(\d+)" def __init__(self, match): - super().__init__(match) - self.pool = match.group("pool") - self.params["tags"] = "pool:" + self.pool - self.params["limit"] = self.per_page - - def get_metadata(self): - return {"pool": text.parse_int(self.pool)} + BooruExtractor.__init__(self, match) + self.pool_id = match.group(1) + self.post_ids = () + def skip(self, num): + self.page_start += num + return num -class GelbooruPoolMixin(PoolMixin): - """Image-pool extraction for Gelbooru-like sites""" - per_page = 1 + def metadata(self): + url = "{}/index.php?page=pool&s=show&id={}".format( + self.root, self.pool_id) + page = self.request(url).text - def get_metadata(self): - page = self.request(self.pool_url.format(self.pool)).text - name, pos = text.extract(page, "<h3>Now Viewing: ", "</h3>") - if not name: - name, pos = text.extract(page, "<h4>Pool: ", "</h4>") + name, pos = text.extract(page, "<h4>Pool: ", "</h4>") if not name: raise exception.NotFoundError("pool") - self.posts = list(text.extract_iter( - page, 'class="thumb" id="p', '"', pos)) + self.post_ids = text.extract_iter( + page, 'class="thumb" id="p', '"', pos) return { - "pool": text.parse_int(self.pool), + "pool": text.parse_int(self.pool_id), "pool_name": text.unescape(name), - "count": len(self.posts), } - def reset_page(self): - self.index = self.page_start - self.update_page(None) - - def update_page(self, data): - try: - post = self.posts[self.index] - self.index += 1 - except IndexError: - post = "0" - self.params["tags"] = "id:" + post - - -class PostMixin(): - """Extraction of a single image-post""" - subcategory = "post" - archive_fmt = "{id}" - - def __init__(self, match): - super().__init__(match) - self.post = match.group("post") - self.params["tags"] = "id:" + self.post - - -class MoebooruPopularMixin(): - """Extraction and metadata handling for Moebooru and Danbooru v1""" - subcategory = "popular" - directory_fmt = ("{category}", "popular", "{scale}", "{date}") - archive_fmt = "P_{scale[0]}_{date}_{id}" - page_start = None - sort = True - - def __init__(self, match): - super().__init__(match) - self.params.update(text.parse_query(match.group("query"))) - self.scale = match.group("scale") - - def get_metadata(self, fmt="%Y-%m-%d"): - date = self.get_date() or datetime.date.today().isoformat() - scale = self.get_scale() or "day" - - if scale == "week": - date = datetime.date.fromisoformat(date) - date = (date - datetime.timedelta(days=date.weekday())).isoformat() - elif scale == "month": - date = date[:-3] - - return {"date": date, "scale": scale} - - def get_date(self): - if "year" in self.params: - return "{:>04}-{:>02}-{:>02}".format( - self.params["year"], - self.params.get("month", "01"), - self.params.get("day", "01")) - return None - - def get_scale(self): - if self.scale and self.scale.startswith("by_"): - return self.scale[3:] - return self.scale + def posts(self): + params = {} + for params["id"] in util.advance(self.post_ids, self.page_start): + for post in self._api_request(params): + yield post.attrib + + +EXTRACTORS = { + "rule34": { + "root": "https://rule34.xxx", + "test-tag": ( + ("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", { + "content": "97e4bbf86c3860be18de384d02d544251afe1d45", + "pattern": r"https?://.*rule34\.xxx/images/\d+/[0-9a-f]+\.jpg", + "count": 1, + }), + ), + "test-pool": ( + ("https://rule34.xxx/index.php?page=pool&s=show&id=179", { + "count": 3, + }), + ), + "test-post": ( + ("https://rule34.xxx/index.php?page=post&s=view&id=1995545", { + "content": "97e4bbf86c3860be18de384d02d544251afe1d45", + "options": (("tags", True),), + "keyword": { + "tags_artist": "danraku", + "tags_character": "kashima_(kantai_collection)", + "tags_copyright": "kantai_collection", + "tags_general": str, + "tags_metadata": str, + }, + }), + ), + }, + "safebooru": { + "root": "https://safebooru.org", + "test-tag": ( + ("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", { + "url": "17c61b386530cf4c30842c9f580d15ef1cd09586", + "content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb", + }), + ), + "test-pool": ( + ("https://safebooru.org/index.php?page=pool&s=show&id=11", { + "count": 5, + }), + ), + "test-post": ( + ("https://safebooru.org/index.php?page=post&s=view&id=1169132", { + "url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b", + "content": "93b293b27dabd198afafabbaf87c49863ac82f27", + "options": (("tags", True),), + "keyword": { + "tags_artist": "kawanakajima", + "tags_character": "heath_ledger ronald_mcdonald the_joker", + "tags_copyright": "dc_comics mcdonald's the_dark_knight", + "tags_general": str, + }, + }), + ), + }, + "realbooru": { + "root": "https://realbooru.com", + "test-tag": ( + ("https://realbooru.com/index.php?page=post&s=list&tags=wine", { + "count": ">= 64", + }), + ), + "test-pool": ( + ("https://realbooru.com/index.php?page=pool&s=show&id=1", { + "count": 3, + }), + ), + "test-post": ( + ("https://realbooru.com/index.php?page=post&s=view&id=668483", { + "url": "2421b5b0e15d5e20f9067090a8b0fd4114d3e7d9", + "content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da", + }), + ), + }, +} + +generate_extractors(EXTRACTORS, globals(), ( + BooruTagExtractor, + BooruPoolExtractor, + BooruPostExtractor, +)) |
