# -*- coding: utf-8 -*- # Copyright 2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extractors for https://www.bellazon.com/""" from .common import Extractor, Message from .. import text, exception BASE_PATTERN = r"(?:https?://)?(?:www\.)?bellazon\.com/main" class BellazonExtractor(Extractor): """Base class for bellazon extractors""" category = "bellazon" root = "https://www.bellazon.com/main" directory_fmt = ("{category}", "{thread[section]}", "{thread[title]} ({thread[id]})") filename_fmt = "{post[id]}_{num:>02}_{id}_{filename}.{extension}" archive_fmt = "{post[id]}/{id}_{filename}" def items(self): native = (self.root + "/", self.root[6:] + "/") extract_urls = text.re( r'(?s)<(' r'(?:video .*?]*?src|a [^>]*?href)="([^"]+).*?' r'|img [^>]*?src="([^"]+)"[^>]*>' r')' ).findall if self.config("quoted", False): strip_quoted = None else: strip_quoted = text.re(r"(?s)

").sub for post in self.posts(): if strip_quoted is None: urls = extract_urls(post["content"]) else: urls = extract_urls(strip_quoted("", post["content"])) data = {"post": post} post["count"] = data["count"] = len(urls) yield Message.Directory, "", data data["num"] = data["num_internal"] = data["num_external"] = 0 for info, url, url_img in urls: url = text.unescape(url or url_img) if url.startswith(native): if ( "/uploads/emoticons/" in url or "/profile/" in url or "/topic/" in url ): continue data["num"] += 1 data["num_internal"] += 1 if not (alt := text.extr(info, ' alt="', '"')) or ( alt.startswith("post-") and "_thumb." in alt): dc = text.nameext_from_url(url, data.copy()) else: dc = data.copy() dc["name"] = name = text.unescape(alt) dc["filename"] = name.partition(".")[0] dc["id"] = text.extr(info, 'data-fileid="', '"') if ext := text.extr(info, 'data-fileext="', '"'): dc["extension"] = ext elif "/core/interface/file/attachment.php" in url: if not dc["id"]: dc["id"] = \ url.rpartition("?id=")[2].partition("&")[0] if name := text.extr(info, ">", "<").strip(): dc["name"] = name = text.unescape(name) text.nameext_from_name(name, dc) else: dc["extension"] = text.ext_from_url(url) if url[0] == "/": url = "https:" + url yield Message.Url, url, dc else: data["num"] += 1 data["num_external"] += 1 yield Message.Queue, url, data def _pagination(self, base, pnum=None): base = self.root + base if pnum is None: url = base + "/" pnum = 1 else: url = f"{base}/page/{pnum}/" pnum = None while True: page = self.request(url).text yield page if pnum is None or ' rel="next" ' not in page or text.extr( page, " rel=\"next\" data-page='", "'") == str(pnum): return pnum += 1 url = f"{base}/page/{pnum}/" def _pagination_reverse(self, base, pnum=None): base = self.root + base url = f"{base}/page/{'9999' if pnum is None else pnum}/" with self.request(url) as response: parts = response.url.rsplit("/", 3) pnum = text.parse_int(parts[2]) if parts[1] == "page" else 1 page = response.text while True: yield page pnum -= 1 if pnum > 1: url = f"{base}/page/{pnum}/" elif pnum == 1: url = base + "/" else: return page = self.request(url).text def _parse_thread(self, page): schema = self._extract_jsonld(page) author = schema["author"] stats = schema["interactionStatistic"] url_t = schema["url"] url_a = author.get("url") or "" path = text.split_html(text.extr( page, '
"))[2:-1] thread = { "url" : url_t, "path" : path, "title": schema["headline"], "views": stats[0]["userInteractionCount"], "posts": stats[1]["userInteractionCount"], "date" : self.parse_datetime_iso(schema["datePublished"]), "date_updated": self.parse_datetime_iso(schema["dateModified"]), "description" : text.unescape(schema["text"]).strip(), "section" : path[-2], "author" : author["name"], "author_url" : url_a, } thread["id"], _, thread["slug"] = \ url_t.rsplit("/", 2)[1].partition("-") if url_a: thread["author_id"], _, thread["author_slug"] = \ url_a.rsplit("/", 2)[1].partition("-") else: thread["author_id"] = thread["author_slug"] = "" return thread def _parse_post(self, html): extr = text.extract_from(html) post = { "id": extr('id="elComment_', '"'), "author_url": extr(" href='", "'"), "date": self.parse_datetime_iso(extr("datetime='", "'")), "content": extr("", "\n\t\t"), } if (pos := post["content"].find(">")) >= 0: post["content"] = post["content"][pos+1:].strip() if url_a := post["author_url"]: post["author_id"], _, post["author_slug"] = \ url_a.rsplit("/", 2)[1].partition("-") else: post["author_id"] = post["author_slug"] = "" return post class BellazonPostExtractor(BellazonExtractor): subcategory = "post" pattern = (BASE_PATTERN + r"(/topic/\d+-[\w-]+(?:/page/\d+)?)" r"/?#(?:findC|c)omment-(\d+)") example = "https://www.bellazon.com/main/topic/123-SLUG/#findComment-12345" def posts(self): path, post_id = self.groups page = self.request(self.root + path).text pos = page.find('id="elComment_' + post_id) if pos < 0: raise exception.NotFoundError("post") html = text.extract(page, "
", pos-100)[0] self.kwdict["thread"] = self._parse_thread(page) return (self._parse_post(html),) class BellazonThreadExtractor(BellazonExtractor): subcategory = "thread" pattern = BASE_PATTERN + r"(/topic/\d+-[\w-]+)(?:/page/(\d+))?" example = "https://www.bellazon.com/main/topic/123-SLUG/" def posts(self): if (order := self.config("order-posts")) and \ order[0] not in ("d", "r"): pages = self._pagination(*self.groups) reverse = False else: pages = self._pagination_reverse(*self.groups) reverse = True for page in pages: if "thread" not in self.kwdict: self.kwdict["thread"] = self._parse_thread(page) posts = text.extract_iter(page, "
") if reverse: posts = list(posts) posts.reverse() for html in posts: yield self._parse_post(html) class BellazonForumExtractor(BellazonExtractor): subcategory = "forum" pattern = BASE_PATTERN + r"(/forum/\d+-[\w-]+)(?:/page/(\d+))?" example = "https://www.bellazon.com/main/forum/123-SLUG/" def items(self): data = {"_extractor": BellazonThreadExtractor} for page in self._pagination(*self.groups): for row in text.extract_iter( page, '