# -*- coding: utf-8 -*- # Copyright 2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extractors for *booru sites""" from .common import Extractor, Message, generate_extractors from .. import text, util, exception from xml.etree import ElementTree import collections import re class BooruExtractor(Extractor): """Base class for *booru extractors""" basecategory = "booru" filename_fmt = "{category}_{id}_{md5}.{extension}" page_start = 0 per_page = 100 def items(self): self.login() extended_tags = self.config("tags", False) data = self.metadata() for post in self.posts(): try: url = self._prepare_post(post, extended_tags) except KeyError: continue post.update(data) text.nameext_from_url(url, post) yield Message.Directory, post yield Message.Url, url, post def skip(self, num): pages = num // self.per_page self.page_start += pages return pages * self.per_page def login(self): """Login and set necessary cookies""" def metadata(self): """Return a dict with general metadata""" return () def posts(self): """Return an iterable with post objects""" return () def _prepare_post(self, post, extended_tags=False): url = post["file_url"] if url[0] == "/": url = self.root + url if extended_tags: self._fetch_extended_tags(post) post["date"] = text.parse_datetime( post["created_at"], "%a %b %d %H:%M:%S %z %Y") return url def _fetch_extended_tags(self, post, page=None): if not page: url = "{}/index.php?page=post&s=view&id={}".format( self.root, post["id"]) page = self.request(url).text html = text.extract(page, '