# -*- coding: utf-8 -*- # Copyright 2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extractors for *booru sites""" from .common import Extractor, Message, generate_extractors from .. import text, util, exception from xml.etree import ElementTree import collections import operator import re class BooruExtractor(Extractor): """Base class for *booru extractors""" basecategory = "booru" filename_fmt = "{category}_{id}_{md5}.{extension}" page_start = 0 per_page = 100 def items(self): self.login() data = self.metadata() tags = self.config("tags", False) for post in self.posts(): try: url = self._file_url(post) if url[0] == "/": url = self.root + url except (KeyError, TypeError): self.log.debug("Unable to fetch download URL for post %s " "(md5: %s)", post.get("id"), post.get("md5")) continue if tags: self._extended_tags(post) self._prepare(post) post.update(data) text.nameext_from_url(url, post) yield Message.Directory, post yield Message.Url, url, post def skip(self, num): pages = num // self.per_page self.page_start += pages return pages * self.per_page def login(self): """Login and set necessary cookies""" def metadata(self): """Return a dict with general metadata""" return () def posts(self): """Return an iterable with post objects""" return () _file_url = operator.itemgetter("file_url") @staticmethod def _prepare(post): post["date"] = text.parse_datetime( post["created_at"], "%a %b %d %H:%M:%S %z %Y") def _extended_tags(self, post, page=None): if not page: url = "{}/index.php?page=post&s=view&id={}".format( self.root, post["id"]) page = self.request(url).text html = text.extract(page, '