diff options
| author | 2021-08-13 17:45:31 -0400 | |
|---|---|---|
| committer | 2021-08-13 17:45:31 -0400 | |
| commit | d50ba9cfe80f00e02ca9a4714f75699c00e67128 (patch) | |
| tree | 01fe7b46370d5068b8c692ae5ea95cab4d734bd8 /gallery_dl/extractor/bbc.py | |
| parent | 873d9a628e9412a79bdc64cd962470749de3425b (diff) | |
New upstream version 1.18.3. (tag: upstream/1.18.3)
Diffstat (limited to 'gallery_dl/extractor/bbc.py')
| -rw-r--r-- | gallery_dl/extractor/bbc.py | 53 |
1 files changed, 39 insertions, 14 deletions
diff --git a/gallery_dl/extractor/bbc.py b/gallery_dl/extractor/bbc.py index ace8a28..17b5f52 100644 --- a/gallery_dl/extractor/bbc.py +++ b/gallery_dl/extractor/bbc.py @@ -27,7 +27,7 @@ class BbcGalleryExtractor(GalleryExtractor): test = ( ("https://www.bbc.co.uk/programmes/p084qtzs/p085g9kg", { "pattern": r"https://ichef\.bbci\.co\.uk" - r"/images/ic/976x549_b/\w+\.jpg", + r"/images/ic/1920xn/\w+\.jpg", "count": 37, "keyword": { "programme": "p084qtzs", @@ -49,32 +49,57 @@ class BbcGalleryExtractor(GalleryExtractor): } def images(self, page): + width = self.config("width") + width = width - width % 16 if width else 1920 + dimensions = "/{}xn/".format(width) + return [ - (imgset.rpartition(", ")[2].partition(" ")[0], None) - for imgset in text.extract_iter(page, 'data-image-src-sets="', '"') + (src.replace("/320x180_b/", dimensions), + {"_fallback": self._fallback_urls(src, width)}) + for src in text.extract_iter(page, 'data-image-src="', '"') ] + @staticmethod + def _fallback_urls(src, max_width): + front, _, back = src.partition("/320x180_b/") + for width in (1920, 1600, 1280, 976): + if width < max_width: + yield "{}/{}xn/{}".format(front, width, back) + class BbcProgrammeExtractor(Extractor): """Extractor for all galleries of a bbc programme""" category = "bbc" subcategory = "programme" root = "https://www.bbc.co.uk" - pattern = BASE_PATTERN + r"[^/?#]+/galleries)" - test = ("https://www.bbc.co.uk/programmes/b006q2x0/galleries", { - "pattern": BbcGalleryExtractor.pattern, - "count": ">= 24", - }) + pattern = BASE_PATTERN + r"[^/?#]+/galleries)(?:/?\?page=(\d+))?" 
+ test = ( + ("https://www.bbc.co.uk/programmes/b006q2x0/galleries", { + "pattern": BbcGalleryExtractor.pattern, + "range": "1-50", + "count": ">= 50", + }), + ("https://www.bbc.co.uk/programmes/b006q2x0/galleries?page=40", { + "pattern": BbcGalleryExtractor.pattern, + "count": ">= 100", + }), + ) def __init__(self, match): Extractor.__init__(self, match) - self.galleries_url = self.root + match.group(1) + self.path, self.page = match.groups() def items(self): - page = self.request(self.galleries_url).text data = {"_extractor": BbcGalleryExtractor} + params = {"page": text.parse_int(self.page, 1)} + galleries_url = self.root + self.path - for programme_id in text.extract_iter( - page, '<a href="https://www.bbc.co.uk/programmes/', '"'): - url = "https://www.bbc.co.uk/programmes/" + programme_id - yield Message.Queue, url, data + while True: + page = self.request(galleries_url, params=params).text + for programme_id in text.extract_iter( + page, '<a href="https://www.bbc.co.uk/programmes/', '"'): + url = "https://www.bbc.co.uk/programmes/" + programme_id + yield Message.Queue, url, data + if 'rel="next"' not in page: + return + params["page"] += 1 |
