diff options
| author | 2025-09-23 07:44:44 -0400 | |
|---|---|---|
| committer | 2025-09-23 07:44:44 -0400 | |
| commit | 291c04af647559317fc9f9f392ad43841ec509ad (patch) | |
| tree | 13a72906223927180001b362d086c82401cb7843 /gallery_dl/extractor/bellazon.py | |
| parent | 065386e00c7a6c8bbe4bb23a545a7fc7b2c09a4a (diff) | |
| parent | 42b62671fabfdcf983a9575221420d85f7fbcac1 (diff) | |
Update upstream source from tag 'upstream/1.30.8'
Update to upstream version '1.30.8'
with Debian dir 51367313d3355f7d0d16a754c5c63135fb3c72e2
Diffstat (limited to 'gallery_dl/extractor/bellazon.py')
| -rw-r--r-- | gallery_dl/extractor/bellazon.py | 85 |
1 file changed, 74 insertions, 11 deletions
diff --git a/gallery_dl/extractor/bellazon.py b/gallery_dl/extractor/bellazon.py index 5c9b9cd..5dcb6a5 100644 --- a/gallery_dl/extractor/bellazon.py +++ b/gallery_dl/extractor/bellazon.py @@ -20,32 +20,61 @@ class BellazonExtractor(Extractor): root = "https://www.bellazon.com/main" directory_fmt = ("{category}", "{thread[section]}", "{thread[title]} ({thread[id]})") - filename_fmt = "{post[id]}_{num:>02}_{id}.{extension}" - archive_fmt = "{post[id]}/{filename}" + filename_fmt = "{post[id]}_{num:>02}_{id}_{filename}.{extension}" + archive_fmt = "{post[id]}/{id}_{filename}" def items(self): - extract_urls = text.re(r'<a ([^>]*?href="([^"]+)".*?)</a>').findall - native = f"{self.root}/" + native = (f"{self.root}/", f"{self.root[6:]}/") + extract_urls = text.re( + r'(?s)<(' + r'(?:video .*?<source src|a [^>]*?href)="([^"]+).*?</a>' + r'|img [^>]*?src="([^"]+)"[^>]*>' + r')' + ).findall + + if self.config("quoted", False): + strip_quoted = None + else: + strip_quoted = text.re(r"(?s)<blockquote .*?</blockquote>").sub for post in self.posts(): - urls = extract_urls(post["content"]) + if strip_quoted is None: + urls = extract_urls(post["content"]) + else: + urls = extract_urls(strip_quoted("", post["content"])) + data = {"post": post} post["count"] = data["count"] = len(urls) yield Message.Directory, data - for data["num"], (info, url) in enumerate(urls, 1): - url = text.unescape(url) + data["num"] = 0 + for info, url, url_img in urls: + url = text.unescape(url or url_img) + if url.startswith(native): + if "/uploads/emoticons/" in url or "/profile/" in url: + continue + data["num"] += 1 if not (alt := text.extr(info, ' alt="', '"')) or ( alt.startswith("post-") and "_thumb." 
in alt): name = url else: name = text.unescape(alt) + dc = text.nameext_from_url(name, data.copy()) dc["id"] = text.extr(info, 'data-fileid="', '"') if ext := text.extr(info, 'data-fileext="', '"'): dc["extension"] = ext + elif "/core/interface/file/attachment.php" in url: + if not dc["id"]: + dc["id"] = url.rpartition("?id=")[2] + if name := text.extr(info, ">", "<").strip(): + text.nameext_from_url(name, dc) + + if url[0] == "/": + url = f"https:{url}" yield Message.Url, url, dc + else: yield Message.Queue, url, data @@ -70,6 +99,28 @@ class BellazonExtractor(Extractor): pnum += 1 url = f"{base}/page/{pnum}/" + def _pagination_reverse(self, base, pnum=None): + base = f"{self.root}{base}" + + url = f"{base}/page/9999/" # force redirect to highest page number + with self.request(url) as response: + parts = response.url.rsplit("/", 3) + pnum = text.parse_int(parts[2]) if parts[1] == "page" else 1 + page = response.text + + while True: + yield page + + pnum -= 1 + if pnum > 1: + url = f"{base}/page/{pnum}/" + elif pnum == 1: + url = f"{base}/" + else: + return + + page = self.request(url).text + def _parse_thread(self, page): schema = self._extract_jsonld(page) author = schema["author"] @@ -88,7 +139,7 @@ class BellazonExtractor(Extractor): "posts": stats[1]["userInteractionCount"], "date" : text.parse_datetime(schema["datePublished"]), "date_updated": text.parse_datetime(schema["dateModified"]), - "description" : text.unescape(schema["text"]), + "description" : text.unescape(schema["text"]).strip(), "section" : path[-2], "author" : author["name"], "author_url" : url_a, @@ -123,7 +174,7 @@ class BellazonExtractor(Extractor): class BellazonPostExtractor(BellazonExtractor): subcategory = "post" pattern = (rf"{BASE_PATTERN}(/topic/\d+-[\w-]+(?:/page/\d+)?)" - rf"/?#findComment-(\d+)") + rf"/?#(?:findC|c)omment-(\d+)") example = "https://www.bellazon.com/main/topic/123-SLUG/#findComment-12345" def posts(self): @@ -145,10 +196,22 @@ class 
BellazonThreadExtractor(BellazonExtractor): example = "https://www.bellazon.com/main/topic/123-SLUG/" def posts(self): - for page in self._pagination(*self.groups): + if (order := self.config("order-posts")) and \ + order[0] not in ("d", "r"): + pages = self._pagination(*self.groups) + reverse = False + else: + pages = self._pagination_reverse(*self.groups) + reverse = True + + for page in pages: if "thread" not in self.kwdict: self.kwdict["thread"] = self._parse_thread(page) - for html in text.extract_iter(page, "<article ", "</article>"): + posts = text.extract_iter(page, "<article ", "</article>") + if reverse: + posts = list(posts) + posts.reverse() + for html in posts: yield self._parse_post(html) |
