about summary refs log tree commit diff stats
path: root/gallery_dl/extractor/bunkr.py
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/extractor/bunkr.py')
-rw-r--r--  gallery_dl/extractor/bunkr.py  143
1 file changed, 108 insertions, 35 deletions
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 9022ffc..6c79d0a 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -8,9 +8,10 @@
"""Extractors for https://bunkr.si/"""
+from .common import Extractor
from .lolisafe import LolisafeAlbumExtractor
-from .. import text, config
-
+from .. import text, config, exception
+import random
if config.get(("extractor", "bunkr"), "tlds"):
BASE_PATTERN = (
@@ -21,11 +22,28 @@ else:
BASE_PATTERN = (
r"(?:bunkr:(?:https?://)?([^/?#]+)|"
r"(?:https?://)?(?:app\.)?(bunkr+"
- r"\.(?:s[kiu]|[cf]i|pk|ru|la|is|to|a[cx]"
+ r"\.(?:s[kiu]|[cf]i|p[hks]|ru|la|is|to|a[cx]"
r"|black|cat|media|red|site|ws|org)))"
)
+DOMAINS = [
+ "bunkr.ac",
+ "bunkr.ci",
+ "bunkr.fi",
+ "bunkr.ph",
+ "bunkr.pk",
+ "bunkr.ps",
+ "bunkr.si",
+ "bunkr.sk",
+ "bunkr.ws",
+ "bunkr.black",
+ "bunkr.red",
+ "bunkr.media",
+ "bunkr.site",
+]
LEGACY_DOMAINS = {
+ "bunkr.ax",
+ "bunkr.cat",
"bunkr.ru",
"bunkrr.ru",
"bunkr.su",
@@ -34,6 +52,7 @@ LEGACY_DOMAINS = {
"bunkr.is",
"bunkr.to",
}
+CF_DOMAINS = set()
class BunkrAlbumExtractor(LolisafeAlbumExtractor):
@@ -49,45 +68,96 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
if domain not in LEGACY_DOMAINS:
self.root = "https://" + domain
+ def request(self, url, **kwargs):
+ kwargs["allow_redirects"] = False
+
+ while True:
+ try:
+ response = Extractor.request(self, url, **kwargs)
+ if response.status_code < 300:
+ return response
+
+ # redirect
+ url = response.headers["Location"]
+ root, path = self._split(url)
+ if root not in CF_DOMAINS:
+ continue
+ self.log.debug("Redirect to known CF challenge domain '%s'",
+ root)
+
+ except exception.HttpError as exc:
+ if exc.status != 403:
+ raise
+
+ # CF challenge
+ root, path = self._split(url)
+ CF_DOMAINS.add(root)
+ self.log.debug("Added '%s' to CF challenge domains", root)
+
+ try:
+ DOMAINS.remove(root.rpartition("/")[2])
+ except ValueError:
+ pass
+ else:
+ if not DOMAINS:
+ raise exception.StopExtraction(
+ "All Bunkr domains require solving a CF challenge")
+
+ # select alternative domain
+ root = "https://" + random.choice(DOMAINS)
+ self.log.debug("Trying '%s' as fallback", root)
+ url = root + path
+
def fetch_album(self, album_id):
# album metadata
page = self.request(self.root + "/a/" + self.album_id).text
- info = text.split_html(text.extr(
- page, "<h1", "</div>").partition(">")[2])
- count, _, size = info[1].split(None, 2)
+ title, size = text.split_html(text.extr(
+ page, "<h1", "</span>").partition(">")[2])
- pos = page.index('class="grid-images')
- urls = list(text.extract_iter(page, '<a href="', '"', pos))
-
- return self._extract_files(urls), {
+ items = list(text.extract_iter(page, "<!-- item -->", "<!-- -->"))
+ return self._extract_files(items), {
"album_id" : self.album_id,
- "album_name" : text.unescape(info[0]),
- "album_size" : size[1:-1],
- "count" : len(urls),
- "_http_validate": self._validate,
+ "album_name" : title,
+ "album_size" : text.extr(size, "(", ")"),
+ "count" : len(items),
}
- def _extract_files(self, urls):
- for url in urls:
+ def _extract_files(self, items):
+ for item in items:
try:
- url = self._extract_file(text.unescape(url))
+ url = text.extr(item, ' href="', '"')
+ file = self._extract_file(text.unescape(url))
+
+ info = text.split_html(item)
+ file["name"] = info[0]
+ file["size"] = info[2]
+ file["date"] = text.parse_datetime(
+ info[-1], "%H:%M:%S %d/%m/%Y")
+
+ yield file
+ except exception.StopExtraction:
+ raise
except Exception as exc:
self.log.error("%s: %s", exc.__class__.__name__, exc)
- continue
- yield {"file": text.unescape(url)}
-
- def _extract_file(self, url):
- page = self.request(url).text
- url = (text.extr(page, '<source src="', '"') or
- text.extr(page, '<img src="', '"'))
-
- if not url:
- url_download = text.rextract(
- page, ' href="', '"', page.rindex("Download"))[0]
- page = self.request(text.unescape(url_download)).text
- url = text.unescape(text.rextract(page, ' href="', '"')[0])
-
- return url
+ self.log.debug("", exc_info=exc)
+
+ def _extract_file(self, webpage_url):
+ response = self.request(webpage_url)
+ page = response.text
+ file_url = (text.extr(page, '<source src="', '"') or
+ text.extr(page, '<img src="', '"'))
+
+ if not file_url:
+ webpage_url = text.unescape(text.rextract(
+ page, ' href="', '"', page.rindex("Download"))[0])
+ response = self.request(webpage_url)
+ file_url = text.rextract(response.text, ' href="', '"')[0]
+
+ return {
+ "file" : text.unescape(file_url),
+ "_http_headers" : {"Referer": response.url},
+ "_http_validate": self._validate,
+ }
def _validate(self, response):
if response.history and response.url.endswith("/maintenance-vid.mp4"):
@@ -95,6 +165,10 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
return False
return True
+ def _split(self, url):
+ pos = url.index("/", 8)
+ return url[:pos], url[pos:]
+
class BunkrMediaExtractor(BunkrAlbumExtractor):
"""Extractor for bunkr.si media links"""
@@ -105,16 +179,15 @@ class BunkrMediaExtractor(BunkrAlbumExtractor):
def fetch_album(self, album_id):
try:
- url = self._extract_file(self.root + self.album_id)
+ file = self._extract_file(self.root + album_id)
except Exception as exc:
self.log.error("%s: %s", exc.__class__.__name__, exc)
return (), {}
- return ({"file": text.unescape(url)},), {
+ return (file,), {
"album_id" : "",
"album_name" : "",
"album_size" : -1,
"description": "",
"count" : 1,
- "_http_validate": self._validate,
}