summary | refs | log | tree | commit | diff | stats
path: root/gallery_dl/extractor/bbc.py
diff options
context:
space:
mode:
author: Unit 193 <unit193@unit193.net>, 2021-08-13 17:45:31 -0400
committer: Unit 193 <unit193@unit193.net>, 2021-08-13 17:45:31 -0400
commit: d50ba9cfe80f00e02ca9a4714f75699c00e67128 (patch)
tree: 01fe7b46370d5068b8c692ae5ea95cab4d734bd8 /gallery_dl/extractor/bbc.py
parent: 873d9a628e9412a79bdc64cd962470749de3425b (diff)
New upstream version 1.18.3. (ref: upstream/1.18.3)
Diffstat (limited to 'gallery_dl/extractor/bbc.py')
-rw-r--r-- gallery_dl/extractor/bbc.py 53
1 files changed, 39 insertions, 14 deletions
diff --git a/gallery_dl/extractor/bbc.py b/gallery_dl/extractor/bbc.py
index ace8a28..17b5f52 100644
--- a/gallery_dl/extractor/bbc.py
+++ b/gallery_dl/extractor/bbc.py
@@ -27,7 +27,7 @@ class BbcGalleryExtractor(GalleryExtractor):
test = (
("https://www.bbc.co.uk/programmes/p084qtzs/p085g9kg", {
"pattern": r"https://ichef\.bbci\.co\.uk"
- r"/images/ic/976x549_b/\w+\.jpg",
+ r"/images/ic/1920xn/\w+\.jpg",
"count": 37,
"keyword": {
"programme": "p084qtzs",
@@ -49,32 +49,57 @@ class BbcGalleryExtractor(GalleryExtractor):
}
def images(self, page):
+ width = self.config("width")
+ width = width - width % 16 if width else 1920
+ dimensions = "/{}xn/".format(width)
+
return [
- (imgset.rpartition(", ")[2].partition(" ")[0], None)
- for imgset in text.extract_iter(page, 'data-image-src-sets="', '"')
+ (src.replace("/320x180_b/", dimensions),
+ {"_fallback": self._fallback_urls(src, width)})
+ for src in text.extract_iter(page, 'data-image-src="', '"')
]
+ @staticmethod
+ def _fallback_urls(src, max_width):
+ front, _, back = src.partition("/320x180_b/")
+ for width in (1920, 1600, 1280, 976):
+ if width < max_width:
+ yield "{}/{}xn/{}".format(front, width, back)
+
class BbcProgrammeExtractor(Extractor):
"""Extractor for all galleries of a bbc programme"""
category = "bbc"
subcategory = "programme"
root = "https://www.bbc.co.uk"
- pattern = BASE_PATTERN + r"[^/?#]+/galleries)"
- test = ("https://www.bbc.co.uk/programmes/b006q2x0/galleries", {
- "pattern": BbcGalleryExtractor.pattern,
- "count": ">= 24",
- })
+ pattern = BASE_PATTERN + r"[^/?#]+/galleries)(?:/?\?page=(\d+))?"
+ test = (
+ ("https://www.bbc.co.uk/programmes/b006q2x0/galleries", {
+ "pattern": BbcGalleryExtractor.pattern,
+ "range": "1-50",
+ "count": ">= 50",
+ }),
+ ("https://www.bbc.co.uk/programmes/b006q2x0/galleries?page=40", {
+ "pattern": BbcGalleryExtractor.pattern,
+ "count": ">= 100",
+ }),
+ )
def __init__(self, match):
Extractor.__init__(self, match)
- self.galleries_url = self.root + match.group(1)
+ self.path, self.page = match.groups()
def items(self):
- page = self.request(self.galleries_url).text
data = {"_extractor": BbcGalleryExtractor}
+ params = {"page": text.parse_int(self.page, 1)}
+ galleries_url = self.root + self.path
- for programme_id in text.extract_iter(
- page, '<a href="https://www.bbc.co.uk/programmes/', '"'):
- url = "https://www.bbc.co.uk/programmes/" + programme_id
- yield Message.Queue, url, data
+ while True:
+ page = self.request(galleries_url, params=params).text
+ for programme_id in text.extract_iter(
+ page, '<a href="https://www.bbc.co.uk/programmes/', '"'):
+ url = "https://www.bbc.co.uk/programmes/" + programme_id
+ yield Message.Queue, url, data
+ if 'rel="next"' not in page:
+ return
+ params["page"] += 1