# -*- coding: utf-8 -*-

# Copyright 2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://bbc.co.uk/"""

from .common import GalleryExtractor, Extractor, Message
from .. import text, util
import json

BASE_PATTERN = r"(?:https?://)?(?:www\.)?bbc\.co\.uk(/programmes/"


class BbcGalleryExtractor(GalleryExtractor):
    """Extractor for a programme gallery on bbc.co.uk"""
    category = "bbc"
    root = "https://www.bbc.co.uk"
    directory_fmt = ("{category}", "{path[0]}", "{path[1]}", "{path[2]}",
                     "{path[3:]:J - /}")
    filename_fmt = "{num:>02}.{extension}"
    archive_fmt = "{programme}_{num}"
    pattern = BASE_PATTERN + r"[^/?#]+(?!/galleries)(?:/[^/?#]+)?)$"
    test = (
        ("https://www.bbc.co.uk/programmes/p084qtzs/p085g9kg", {
            "pattern": r"https://ichef\.bbci\.co\.uk"
                       r"/images/ic/976x549_b/\w+\.jpg",
            "count": 37,
            "keyword": {
                "programme": "p084qtzs",
                "path": ["BBC One", "Doctor Who", "The Timeless Children"],
            },
        }),
        ("https://www.bbc.co.uk/programmes/p084qtzs"),
    )

    def metadata(self, page):
        # parse the JSON-LD breadcrumb list embedded in the gallery page
        data = json.loads(text.extract(
            page, '<script type="application/ld+json">', '</script>')[0])
        return {
            "programme": self.gallery_url.split("/")[4],
            "path": list(util.unique_sequence(
                element["name"]
                for element in data["itemListElement"]
            )),
        }

    def images(self, page):
        # take the last (largest) image URL from each
        # 'data-image-src-sets' attribute
        return [
            (imgset.rpartition(", ")[2].partition(" ")[0], None)
            for imgset in text.extract_iter(
                page, 'data-image-src-sets="', '"')
        ]


class BbcProgrammeExtractor(Extractor):
    """Extractor for all galleries of a bbc programme"""
    category = "bbc"
    subcategory = "programme"
    root = "https://www.bbc.co.uk"
    pattern = BASE_PATTERN + r"[^/?#]+/galleries)"
    test = ("https://www.bbc.co.uk/programmes/b006q2x0/galleries", {
        "pattern": BbcGalleryExtractor.pattern,
        "count": ">= 24",
    })

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.galleries_url = self.root + match.group(1)

    def items(self):
        page = self.request(self.galleries_url).text
        data = {"_extractor": BbcGalleryExtractor}

        # queue every programme gallery linked from the galleries overview
        for programme_id in text.extract_iter(
                page, '<a href="https://www.bbc.co.uk/programmes/', '"'):
            url = "https://www.bbc.co.uk/programmes/" + programme_id
            yield Message.Queue, url, data