diff options
| author | 2025-07-31 01:22:01 -0400 | |
|---|---|---|
| committer | 2025-07-31 01:22:01 -0400 | |
| commit | a6e995c093de8aae2e91a0787281bb34c0b871eb (patch) | |
| tree | 2d79821b05300d34d8871eb6c9662b359a2de85d /gallery_dl/extractor/redbust.py | |
| parent | 7672a750cb74bf31e21d76aad2776367fd476155 (diff) | |
New upstream version 1.30.2 (upstream/1.30.2)
Diffstat (limited to 'gallery_dl/extractor/redbust.py')
| -rw-r--r-- | gallery_dl/extractor/redbust.py | 186 |
1 files changed, 186 insertions, 0 deletions
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://redbust.com/"""

from .common import GalleryExtractor, Extractor, Message
from .. import text

BASE_PATTERN = r"(?:https?://)?redbust\.com"


class RedbustExtractor(Extractor):
    """Base class for RedBust extractors"""
    category = "redbust"
    root = "https://redbust.com"
    filename_fmt = "{filename}.{extension}"

    def items(self):
        """Queue every URL from self.galleries() for the gallery extractor"""
        data = {"_extractor": RedbustGalleryExtractor}
        for url in self.galleries():
            yield Message.Queue, url, data

    def _pagination(self, path, page=None):
        """Yield the post URLs of a paginated listing

        'path' is the listing's path below the site root, with a leading
        and without a trailing slash. 'page' may hold the already
        downloaded HTML of the first listing page; when omitted, that
        page gets requested here.
        """
        base = f"{self.root}{path}/page/"
        if page is None:
            page = self.request(f"{self.root}{path}/").text

        pnum = 1
        while True:
            for post in text.extract_iter(
                    page, '<h2 class="post-title">', "rel="):
                yield text.extr(post, 'href="', '"')

            # continue only while the current page mentions the next one
            pnum += 1
            url = f"{base}{pnum}/"
            if url not in page:
                return
            page = self.request(url).text


class RedbustGalleryExtractor(GalleryExtractor, RedbustExtractor):
    """Extractor for RedBust galleries"""
    pattern = BASE_PATTERN + r"/([\w-]+)/?$"
    example = "https://redbust.com/TITLE/"

    def items(self):
        """Process the URL as a gallery or, without shortlink, a category"""
        url = f"{self.root}/{self.groups[0]}/"
        self.page = page = self.request(url).text

        self.gallery_id = gid = text.extr(
            page, "<link rel='shortlink' href='https://redbust.com/?p=", "'")

        if gid:
            self.page_url = False
            return GalleryExtractor.items(self)

        # no shortlink ID found: handle the page as a listing of galleries
        self.subcategory = "category"
        return self._items_category(page)

    def _items_category(self, _):
        """Queue all galleries of a category listing

        The first listing page was already stored in self.page by
        items(), so it is handed to _pagination() instead of being
        requested a second time.
        """
        data = {"_extractor": RedbustGalleryExtractor}
        for url in self._pagination("/" + self.groups[0], self.page):
            yield Message.Queue, url, data

    def metadata(self, _):
        """Collect gallery metadata from the HTML in self.page"""
        extr = text.extract_from(self.page)

        return {
            "gallery_id"  : self.gallery_id,
            "gallery_slug": self.groups[0],
            # keep every other element of the split category list
            "categories"  : text.split_html(extr(
                '<li class="category">', "</li>"))[::2],
            "title"       : text.unescape(extr('class="post-title">', "<")),
            "date"        : text.parse_datetime(
                extr('class="post-byline">', "<").strip(), "%B %d, %Y"),
            "views"       : text.parse_int(extr("</b>", "v").replace(",", "")),
            # skip the first element of the split tag list
            "tags"        : text.split_html(extr(
                'class="post-tags">', "</p"))[1:],
        }

    @staticmethod
    def _strip_size_suffix(src):
        """Return 'src' without a trailing '-<width>x<height>' size suffix

        Only a suffix of the exact form '-<digits>x<digits>.<ext>' at the
        end of the URL is removed; any other URL is returned unchanged.
        A plain check for "x" after the last "-" would also hit the "-"
        of 'wp-content' and mangle URLs like '.../uploads/example.jpg'.
        """
        path, dash, end = src.rpartition("-")
        size, dot, ext = end.rpartition(".")
        width, cross, height = size.partition("x")
        if dash and dot and cross and \
                width.isdecimal() and height.isdecimal():
            return f"{path}.{ext}"
        return src

    def images(self, _):
        """Return a list of (url, metadata) tuples for the gallery images"""
        results = []

        for img in text.extract_iter(self.page, "'><img ", ">"):
            if src := text.extr(img, 'src="', '"'):
                url = self._strip_size_suffix(src)
                # keep the listed URL as fallback when the suffix was removed
                data = None if src == url else {"_fallback": (src,)}
                results.append((url, data))

        if not results:
            # fallback for older galleries
            for path in text.extract_iter(
                    self.page, '<img src="/wp-content/uploads/', '"'):
                results.append(
                    (f"{self.root}/wp-content/uploads/{path}", None))

        return results


class RedbustTagExtractor(RedbustExtractor):
    """Extractor for RedBust tag searches"""
    subcategory = "tag"
    pattern = BASE_PATTERN + r"/tag/([\w-]+)"
    example = "https://redbust.com/tag/TAG/"

    def galleries(self):
        """Return an iterator over the post URLs listed for the tag"""
        return self._pagination("/tag/" + self.groups[0])


class RedbustArchiveExtractor(RedbustExtractor):
    """Extractor for RedBust monthly archive collections"""
    subcategory = "archive"
    pattern = BASE_PATTERN + r"(/\d{4}/\d{2})"
    example = "https://redbust.com/2010/01/"

    def galleries(self):
        """Return an iterator over the post URLs of the '/YYYY/MM' archive"""
        return self._pagination(self.groups[0])


class RedbustImageExtractor(RedbustExtractor):
    """Extractor for RedBust images"""
    subcategory = "image"
    directory_fmt = ("{category}", "{title}")
    pattern = BASE_PATTERN + r"/(?!tag/|\d{4}/)([\w-]+)/([\w-]+)/?$"
    example = "https://redbust.com/TITLE/SLUG/"

    @staticmethod
    def _largest_from_srcset(srcset):
        """Return the URL of the largest candidate in a 'srcset' value

        Candidates are compared by the number in their width ('640w') or
        density ('2x') descriptor. A candidate without a usable
        descriptor counts as 0, and ties go to the later candidate, so a
        srcset without descriptors still yields its last entry.
        Returns None when no candidate contains a URL.
        """
        best_url = None
        best_size = -1.0

        for candidate in srcset.split(", "):
            url, _, descriptor = candidate.strip().partition(" ")
            if not url:
                continue
            try:
                # drop the trailing unit character ('w' or 'x')
                size = float(descriptor.strip()[:-1])
            except ValueError:
                size = 0.0
            if size >= best_size:
                best_url = url
                best_size = size

        return best_url

    def items(self):
        """Yield the directory and URL messages for a single image page"""
        gallery_slug, image_slug = self.groups
        url = f"{self.root}/{gallery_slug}/{image_slug}/"
        page = self.request(url).text

        img_url = None

        # Look for the largest image in srcset first
        if srcset := text.extr(page, 'srcset="', '"'):
            img_url = self._largest_from_srcset(srcset)

        # Fallback to original extraction method
        if not img_url:
            if entry := text.extr(page, "entry-inner ", "alt="):
                img_url = text.extr(entry, "img src=", " ").strip("\"'")

        if not img_url:
            # nothing to download on this page
            return

        # 'num' is parsed from the text between the URL's last "-" and
        # the following "."
        end = img_url.rpartition("-")[2]
        data = text.nameext_from_url(img_url, {
            "title"       : text.unescape(text.extr(
                page, 'title="Return to ', '"')),
            "image_id"    : text.extr(
                page, "rel='shortlink' href='https://redbust.com/?p=", "'"),
            "gallery_slug": gallery_slug,
            "image_slug"  : image_slug,
            "num"         : text.parse_int(end.partition(".")[0]),
            "count"       : 1,
            "url"         : img_url,
        })

        yield Message.Directory, data
        yield Message.Url, img_url, data
