diff options
Diffstat (limited to 'gallery_dl/extractor/bunkr.py')
| -rw-r--r-- | gallery_dl/extractor/bunkr.py | 143 |
1 files changed, 108 insertions, 35 deletions
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 9022ffc..6c79d0a 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -8,9 +8,10 @@ """Extractors for https://bunkr.si/""" +from .common import Extractor from .lolisafe import LolisafeAlbumExtractor -from .. import text, config - +from .. import text, config, exception +import random if config.get(("extractor", "bunkr"), "tlds"): BASE_PATTERN = ( @@ -21,11 +22,28 @@ else: BASE_PATTERN = ( r"(?:bunkr:(?:https?://)?([^/?#]+)|" r"(?:https?://)?(?:app\.)?(bunkr+" - r"\.(?:s[kiu]|[cf]i|pk|ru|la|is|to|a[cx]" + r"\.(?:s[kiu]|[cf]i|p[hks]|ru|la|is|to|a[cx]" r"|black|cat|media|red|site|ws|org)))" ) +DOMAINS = [ + "bunkr.ac", + "bunkr.ci", + "bunkr.fi", + "bunkr.ph", + "bunkr.pk", + "bunkr.ps", + "bunkr.si", + "bunkr.sk", + "bunkr.ws", + "bunkr.black", + "bunkr.red", + "bunkr.media", + "bunkr.site", +] LEGACY_DOMAINS = { + "bunkr.ax", + "bunkr.cat", "bunkr.ru", "bunkrr.ru", "bunkr.su", @@ -34,6 +52,7 @@ LEGACY_DOMAINS = { "bunkr.is", "bunkr.to", } +CF_DOMAINS = set() class BunkrAlbumExtractor(LolisafeAlbumExtractor): @@ -49,45 +68,96 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): if domain not in LEGACY_DOMAINS: self.root = "https://" + domain + def request(self, url, **kwargs): + kwargs["allow_redirects"] = False + + while True: + try: + response = Extractor.request(self, url, **kwargs) + if response.status_code < 300: + return response + + # redirect + url = response.headers["Location"] + root, path = self._split(url) + if root not in CF_DOMAINS: + continue + self.log.debug("Redirect to known CF challenge domain '%s'", + root) + + except exception.HttpError as exc: + if exc.status != 403: + raise + + # CF challenge + root, path = self._split(url) + CF_DOMAINS.add(root) + self.log.debug("Added '%s' to CF challenge domains", root) + + try: + DOMAINS.remove(root.rpartition("/")[2]) + except ValueError: + pass + else: + if not DOMAINS: + raise exception.StopExtraction( + "All Bunkr domains require solving a CF challenge") + + # select alternative domain + root = "https://" + random.choice(DOMAINS) + self.log.debug("Trying '%s' as fallback", root) + url = root + path + def fetch_album(self, album_id): # album metadata page = self.request(self.root + "/a/" + self.album_id).text - info = text.split_html(text.extr( - page, "<h1", "</div>").partition(">")[2]) - count, _, size = info[1].split(None, 2) + title, size = text.split_html(text.extr( + page, "<h1", "</span>").partition(">")[2]) - pos = page.index('class="grid-images') - urls = list(text.extract_iter(page, '<a href="', '"', pos)) - - return self._extract_files(urls), { + items = list(text.extract_iter(page, "<!-- item -->", "<!-- -->")) + return self._extract_files(items), { "album_id" : self.album_id, - "album_name" : text.unescape(info[0]), - "album_size" : size[1:-1], - "count" : len(urls), - "_http_validate": self._validate, + "album_name" : title, + "album_size" : text.extr(size, "(", ")"), + "count" : len(items), } - def _extract_files(self, urls): - for url in urls: + def _extract_files(self, items): + for item in items: try: - url = self._extract_file(text.unescape(url)) + url = text.extr(item, ' href="', '"') + file = self._extract_file(text.unescape(url)) + + info = text.split_html(item) + file["name"] = info[0] + file["size"] = info[2] + file["date"] = text.parse_datetime( + info[-1], "%H:%M:%S %d/%m/%Y") + + yield file + except exception.StopExtraction: + raise except Exception as exc: self.log.error("%s: %s", exc.__class__.__name__, exc) - continue - yield {"file": text.unescape(url)} - - def _extract_file(self, url): - page = self.request(url).text - url = (text.extr(page, '<source src="', '"') or - text.extr(page, '<img src="', '"')) - - if not url: - url_download = text.rextract( - page, ' href="', '"', page.rindex("Download"))[0] - page = self.request(text.unescape(url_download)).text - url = text.unescape(text.rextract(page, ' href="', '"')[0]) - - return url + self.log.debug("", exc_info=exc) + + def _extract_file(self, webpage_url): + response = self.request(webpage_url) + page = response.text + file_url = (text.extr(page, '<source src="', '"') or + text.extr(page, '<img src="', '"')) + + if not file_url: + webpage_url = text.unescape(text.rextract( + page, ' href="', '"', page.rindex("Download"))[0]) + response = self.request(webpage_url) + file_url = text.rextract(response.text, ' href="', '"')[0] + + return { + "file" : text.unescape(file_url), + "_http_headers" : {"Referer": response.url}, + "_http_validate": self._validate, + } def _validate(self, response): if response.history and response.url.endswith("/maintenance-vid.mp4"): @@ -95,6 +165,10 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): return False return True + def _split(self, url): + pos = url.index("/", 8) + return url[:pos], url[pos:] + class BunkrMediaExtractor(BunkrAlbumExtractor): """Extractor for bunkr.si media links""" @@ -105,16 +179,15 @@ class BunkrMediaExtractor(BunkrAlbumExtractor): def fetch_album(self, album_id): try: - url = self._extract_file(self.root + self.album_id) + file = self._extract_file(self.root + album_id) except Exception as exc: self.log.error("%s: %s", exc.__class__.__name__, exc) return (), {} - return ({"file": text.unescape(url)},), { + return (file,), { "album_id" : "", "album_name" : "", "album_size" : -1, "description": "", "count" : 1, - "_http_validate": self._validate, } |
