# -*- coding: utf-8 -*-
# Copyright 2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://www.bellazon.com/"""
from .common import Extractor, Message
from .. import text, exception
# Regex fragment matching an optional http(s) scheme, an optional "www."
# and then "bellazon.com/main".
BASE_PATTERN = r"(?:https?://)?(?:www\.)?bellazon\.com/main"
class BellazonExtractor(Extractor):
"""Base class for bellazon extractors"""
# Category name used for this site.
category = "bellazon"
# Base URL: items() treats links starting with "<root>/" as native files,
# and _pagination() prepends it to the path it is given.
root = "https://www.bellazon.com/main"
# Format-string components built from "category" and "thread" metadata
# (presumably one output directory level per entry -- confirm against the
# Extractor base class).
directory_fmt = ("{category}", "{thread[section]}",
"{thread[title]} ({thread[id]})")
# Uses the "num", "id" and "extension" values that items() stores in the
# per-file metadata, plus the post's own id.
filename_fmt = "{post[id]}_{num:>02}_{id}.{extension}"
# Format string combining the post id and "filename" (presumably the key
# used to recognise already-downloaded files -- confirm).
archive_fmt = "{post[id]}/{filename}"
def items(self):
    """Yield messages for every link found in the extractor's posts.

    For each post returned by ``self.posts()`` (not defined in the
    visible part of this class -- presumably supplied by subclasses), a
    ``Message.Directory`` is emitted first, followed by one message per
    ``<a ...>...</a>`` element found in ``post["content"]``:

    - links whose URL starts with ``{root}/`` yield ``Message.Url``
      together with a per-file copy of the metadata;
    - every other link yields ``Message.Queue`` with the shared
      per-post metadata.
    """
    # The pattern must expose exactly two groups, because the loop below
    # unpacks every match as ``(info, url)``:
    #   group 1 -- everything between '<a ' and '</a>' (the anchor's
    #              attributes and its inner markup), later searched for
    #              alt / data-fileid / data-fileext
    #   group 2 -- the value of the href attribute
    extract_urls = text.re(r'<a ([^>]*?href="([^"]+)".*?)</a>').findall
    native = f"{self.root}/"

    for post in self.posts():
        urls = extract_urls(post["content"])
        data = {"post": post}
        # The link count is stored on the post and on the metadata dict.
        post["count"] = data["count"] = len(urls)

        yield Message.Directory, data
        # 'num' is written straight into the shared dict as the 1-based
        # position of the link within the post.
        for data["num"], (info, url) in enumerate(urls, 1):
            url = text.unescape(url)
            if url.startswith(native):
                # Use the alt text as the file name unless it is missing
                # or looks like a thumbnail placeholder ("post-..." and
                # containing "_thumb."); then fall back to the URL.
                if not (alt := text.extr(info, ' alt="', '"')) or (
                        alt.startswith("post-") and "_thumb." in alt):
                    name = url
                else:
                    name = text.unescape(alt)

                # Work on a copy so per-file keys do not leak into the
                # metadata shared by the other links of this post.
                dc = text.nameext_from_url(name, data.copy())
                dc["id"] = text.extr(info, 'data-fileid="', '"')
                # An explicit data-fileext attribute overrides whatever
                # extension was derived from the name.
                if ext := text.extr(info, 'data-fileext="', '"'):
                    dc["extension"] = ext
                yield Message.Url, url, dc
            else:
                yield Message.Queue, url, data
def _pagination(self, base, pnum=None):
    """Yield the response text of one or more pages below ``base``.

    ``base`` is appended to ``self.root``.  When ``pnum`` is given, only
    ``<base>/page/<pnum>/`` is requested and the generator stops after
    yielding it.  Otherwise iteration starts at ``<base>/`` (counted as
    page 1) and continues with ``/page/2/``, ``/page/3/``, ... until a
    page either lacks a ``rel="next"`` marker or its ``data-page`` value
    equals the number of the page that was just fetched.
    """
    base = f"{self.root}{base}"

    single_page = pnum is not None
    if single_page:
        url = f"{base}/page/{pnum}/"
    else:
        pnum = 1
        url = f"{base}/"

    while True:
        page = self.request(url).text
        yield page

        # An explicitly requested page is never followed by others.
        if single_page:
            return
        # No "next" link at all: this was the last page.
        if ' rel="next" ' not in page:
            return
        # A "next" link that carries the current page number also ends
        # the iteration.
        next_page = text.extr(page, " rel=\"next\" data-page='", "'")
        if next_page == str(pnum):
            return

        pnum += 1
        url = f"{base}/page/{pnum}/"
def _parse_thread(self, page):
schema = self._extract_jsonld(page)
author = schema["author"]
stats = schema["interactionStatistic"]
url_t = schema["url"]
url_a = author["url"]
path = text.split_html(text.extr(
page, '