diff options
| author | 2021-08-04 02:14:44 -0400 | |
|---|---|---|
| committer | 2021-08-04 02:14:44 -0400 | |
| commit | 873d9a628e9412a79bdc64cd962470749de3425b (patch) | |
| tree | 8cd421ef79a9fa784147fa888543216f0872357b /gallery_dl/extractor/bbc.py | |
| parent | 32de2b06db501c7de81678bce8e3e0c3e63d340c (diff) | |
New upstream version 1.18.2.upstream/1.18.2
Diffstat (limited to 'gallery_dl/extractor/bbc.py')
| -rw-r--r-- | gallery_dl/extractor/bbc.py | 80 |
1 files changed, 80 insertions, 0 deletions
# -*- coding: utf-8 -*-

# Recovered from a unified diff:
#   diff --git a/gallery_dl/extractor/bbc.py b/gallery_dl/extractor/bbc.py
#   new file mode 100644, index 0000000..ace8a28, @@ -0,0 +1,80 @@

# Copyright 2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://bbc.co.uk/"""

from .common import GalleryExtractor, Extractor, Message
from .. import text, util
import json

# Common URL prefix for both extractors. The capture group opened here is
# intentionally left unclosed; each extractor's pattern suffix closes it.
BASE_PATTERN = r"(?:https?://)?(?:www\.)?bbc\.co\.uk(/programmes/"


class BbcGalleryExtractor(GalleryExtractor):
    """Extractor for a programme gallery on bbc.co.uk"""
    category = "bbc"
    root = "https://www.bbc.co.uk"
    directory_fmt = ("{category}", "{path[0]}", "{path[1]}", "{path[2]}",
                     "{path[3:]:J - /}")
    filename_fmt = "{num:>02}.{extension}"
    archive_fmt = "{programme}_{num}"
    pattern = BASE_PATTERN + r"[^/?#]+(?!/galleries)(?:/[^/?#]+)?)$"
    test = (
        ("https://www.bbc.co.uk/programmes/p084qtzs/p085g9kg", {
            "pattern": r"https://ichef\.bbci\.co\.uk"
                       r"/images/ic/976x549_b/\w+\.jpg",
            "count": 37,
            "keyword": {
                "programme": "p084qtzs",
                "path": ["BBC One", "Doctor Who", "The Timeless Children"],
            },
        }),
        ("https://www.bbc.co.uk/programmes/p084qtzs"),
    )

    def metadata(self, page):
        """Collect gallery metadata from the JSON-LD block inside 'page'

        Returns a dict with
        - "programme": the 5th "/"-separated component of the gallery URL
        - "path": the "name" values of the JSON-LD "itemListElement"
          entries, passed through util.unique_sequence()
        """
        ld_json = text.extract(
            page, '<script type="application/ld+json">', '</script>')[0]
        data = json.loads(ld_json)

        # e.g. "https://www.bbc.co.uk/programmes/<id>/..." -> "<id>"
        programme = self.gallery_url.split("/")[4]
        names = (entry["name"] for entry in data["itemListElement"])

        return {
            "programme": programme,
            "path": list(util.unique_sequence(names)),
        }

    def images(self, page):
        """Return a list of (url, None) tuples for all images in 'page'"""
        results = []
        for srcset in text.extract_iter(
                page, 'data-image-src-sets="', '"'):
            # use the last ", "-separated entry of the set and
            # cut it off at its first space
            last_entry = srcset.rpartition(", ")[2]
            url = last_entry.partition(" ")[0]
            results.append((url, None))
        return results


class BbcProgrammeExtractor(Extractor):
    """Extractor for all galleries of a bbc programme"""
    category = "bbc"
    subcategory = "programme"
    root = "https://www.bbc.co.uk"
    pattern = BASE_PATTERN + r"[^/?#]+/galleries)"
    test = ("https://www.bbc.co.uk/programmes/b006q2x0/galleries", {
        "pattern": BbcGalleryExtractor.pattern,
        "count": ">= 24",
    })

    def __init__(self, match):
        """Store the galleries overview URL built from the matched path"""
        Extractor.__init__(self, match)
        path = match.group(1)
        self.galleries_url = self.root + path

    def items(self):
        """Yield a Queue message for each programme link on the page

        Every message carries BbcGalleryExtractor as its "_extractor".
        """
        response = self.request(self.galleries_url)
        page = response.text
        data = {"_extractor": BbcGalleryExtractor}

        programme_ids = text.extract_iter(
            page, '<a href="https://www.bbc.co.uk/programmes/', '"')
        for programme_id in programme_ids:
            url = "https://www.bbc.co.uk/programmes/" + programme_id
            yield Message.Queue, url, data
