# -*- coding: utf-8 -*- # Copyright 2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extractors for *booru sites""" from .common import Extractor, Message, generate_extractors from .. import text, util, exception from xml.etree import ElementTree import collections import re class BooruExtractor(Extractor): """Base class for *booru extractors""" basecategory = "booru" filename_fmt = "{category}_{id}_{md5}.{extension}" page_start = 0 per_page = 100 def items(self): self.login() extended_tags = self.config("tags", False) data = self.metadata() for post in self.posts(): try: url = self._prepare_post(post, extended_tags) except KeyError: continue post.update(data) text.nameext_from_url(url, post) yield Message.Directory, post yield Message.Url, url, post def skip(self, num): pages = num // self.per_page self.page_start += pages return pages * self.per_page def login(self): """Login and set necessary cookies""" def metadata(self): """Return a dict with general metadata""" return () def posts(self): """Return an iterable with post objects""" return () def _prepare_post(self, post, extended_tags=False): url = post["file_url"] if url[0] == "/": url = self.root + url if extended_tags: self._fetch_extended_tags(post) post["date"] = text.parse_datetime( post["created_at"], "%a %b %d %H:%M:%S %z %Y") return url def _fetch_extended_tags(self, post, page=None): if not page: url = "{}/index.php?page=post&s=view&id={}".format( self.root, post["id"]) page = self.request(url).text html = text.extract(page, '

Pool: ", "") if not name: raise exception.NotFoundError("pool") self.post_ids = text.extract_iter( page, 'class="thumb" id="p', '"', pos) return { "pool": text.parse_int(self.pool_id), "pool_name": text.unescape(name), } def posts(self): params = {} for params["id"] in util.advance(self.post_ids, self.page_start): for post in self._api_request(params): yield post.attrib EXTRACTORS = { "rule34": { "root": "https://rule34.xxx", "test-tag": ( ("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", { "content": "97e4bbf86c3860be18de384d02d544251afe1d45", "pattern": r"https?://.*rule34\.xxx/images/\d+/[0-9a-f]+\.jpg", "count": 1, }), ), "test-pool": ( ("https://rule34.xxx/index.php?page=pool&s=show&id=179", { "count": 3, }), ), "test-post": ( ("https://rule34.xxx/index.php?page=post&s=view&id=1995545", { "content": "97e4bbf86c3860be18de384d02d544251afe1d45", "options": (("tags", True),), "keyword": { "tags_artist": "danraku", "tags_character": "kashima_(kantai_collection)", "tags_copyright": "kantai_collection", "tags_general": str, "tags_metadata": str, }, }), ), }, "safebooru": { "root": "https://safebooru.org", "test-tag": ( ("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", { "url": "17c61b386530cf4c30842c9f580d15ef1cd09586", "content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb", }), ), "test-pool": ( ("https://safebooru.org/index.php?page=pool&s=show&id=11", { "count": 5, }), ), "test-post": ( ("https://safebooru.org/index.php?page=post&s=view&id=1169132", { "url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b", "content": "93b293b27dabd198afafabbaf87c49863ac82f27", "options": (("tags", True),), "keyword": { "tags_artist": "kawanakajima", "tags_character": "heath_ledger ronald_mcdonald the_joker", "tags_copyright": "dc_comics mcdonald's the_dark_knight", "tags_general": str, }, }), ), }, "realbooru": { "root": "https://realbooru.com", "test-tag": ( ("https://realbooru.com/index.php?page=post&s=list&tags=wine", { "count": ">= 64", }), ), "test-pool": ( ("https://realbooru.com/index.php?page=pool&s=show&id=1", { "count": 3, }), ), "test-post": ( ("https://realbooru.com/index.php?page=post&s=view&id=668483", { "url": "2421b5b0e15d5e20f9067090a8b0fd4114d3e7d9", "content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da", }), ), }, } generate_extractors(EXTRACTORS, globals(), ( BooruTagExtractor, BooruPoolExtractor, BooruPostExtractor, ))