# -*- coding: utf-8 -*- # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extractors for https://www.xasiat.com""" from .common import Extractor, Message from .. import text import time BASE_PATTERN = r"(?:https?://)?(?:www\.)?xasiat\.com((?:/fr|/ja)?/albums" class XasiatExtractor(Extractor): category = "xasiat" directory_fmt = ("{category}", "{title}") archive_fmt = "{album_url}_{num}" root = "https://www.xasiat.com" def items(self): data = {"_extractor": XasiatAlbumExtractor} for url in self.posts(): yield Message.Queue, url, data def posts(self): return self._pagination(self.groups[0]) def _pagination(self, path, pnum=1): url = f"{self.root}{path}/" find_posts = text.re(r'class="item ">\s*Next" in page: return pnum += 1 class XasiatAlbumExtractor(XasiatExtractor): subcategory = "album" pattern = BASE_PATTERN + r"/(\d+)/[^/?#]+)" example = "https://www.xasiat.com/albums/12345/TITLE/" def items(self): path, album_id = self.groups url = f"{self.root}{path}/" response = self.request(url) extr = text.extract_from(response.text) title = extr("

\s(.+)\s\s(.+)\s\s(.+)\s

", "<") info = extr('class="info-content"', "") images = extr('class="images"', "") urls = list(text.extract_iter(images, 'href="', '"')) categories = text.re(r'categories/[^"]+\">\s*(.+)\s*\s*(.+)\s*\s*(.+)\s*