# -*- coding: utf-8 -*- # Copyright 2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extractors for https://www.bellazon.com/""" from .common import Extractor, Message from .. import text, exception BASE_PATTERN = r"(?:https?://)?(?:www\.)?bellazon\.com/main" class BellazonExtractor(Extractor): """Base class for bellazon extractors""" category = "bellazon" root = "https://www.bellazon.com/main" directory_fmt = ("{category}", "{thread[section]}", "{thread[title]} ({thread[id]})") filename_fmt = "{post[id]}_{num:>02}_{id}.{extension}" archive_fmt = "{post[id]}/{filename}" def items(self): extract_urls = text.re(r']*?href="([^"]+)".*?)').findall native = f"{self.root}/" for post in self.posts(): urls = extract_urls(post["content"]) data = {"post": post} post["count"] = data["count"] = len(urls) yield Message.Directory, data for data["num"], (info, url) in enumerate(urls, 1): url = text.unescape(url) if url.startswith(native): if not (alt := text.extr(info, ' alt="', '"')) or ( alt.startswith("post-") and "_thumb." in alt): name = url else: name = text.unescape(alt) dc = text.nameext_from_url(name, data.copy()) dc["id"] = text.extr(info, 'data-fileid="', '"') if ext := text.extr(info, 'data-fileext="', '"'): dc["extension"] = ext yield Message.Url, url, dc else: yield Message.Queue, url, data def _pagination(self, base, pnum=None): base = f"{self.root}{base}" if pnum is None: url = f"{base}/" pnum = 1 else: url = f"{base}/page/{pnum}/" pnum = None while True: page = self.request(url).text yield page if pnum is None or ' rel="next" ' not in page or text.extr( page, " rel=\"next\" data-page='", "'") == str(pnum): return pnum += 1 url = f"{base}/page/{pnum}/" def _parse_thread(self, page): schema = self._extract_jsonld(page) author = schema["author"] stats = schema["interactionStatistic"] url_t = schema["url"] url_a = author["url"] path = text.split_html(text.extr( page, '