diff options
Diffstat (limited to 'nikola/metadata_extractors.py')
| -rw-r--r-- | nikola/metadata_extractors.py | 274 |
1 files changed, 274 insertions, 0 deletions
diff --git a/nikola/metadata_extractors.py b/nikola/metadata_extractors.py new file mode 100644 index 0000000..2377dc2 --- /dev/null +++ b/nikola/metadata_extractors.py @@ -0,0 +1,274 @@ +# -*- coding: utf-8 -*- + +# Copyright © 2012-2020 Chris Warrick, Roberto Alsina and others. + +# Permission is hereby granted, free of charge, to any +# person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the +# Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the +# Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice +# shall be included in all copies or substantial portions of +# the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS +# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
"""Default metadata extractors and helper functions."""

import re
from enum import Enum
from io import StringIO

import natsort

from nikola.plugin_categories import MetadataExtractor
from nikola.utils import unslugify

__all__ = ('MetaCondition', 'MetaPriority', 'MetaSource', 'check_conditions')
# Instances of the built-in extractors, filled in by @_register_default.
_default_extractors = []
DEFAULT_EXTRACTOR_NAME = 'nikola'
# Set by classify_extractor() once an extractor named DEFAULT_EXTRACTOR_NAME is classified.
DEFAULT_EXTRACTOR = None


class MetaCondition(Enum):
    """Conditions for extracting metadata."""

    config_bool = 1
    config_present = 2
    extension = 3
    compiler = 4
    first_line = 5
    never = -1


class MetaPriority(Enum):
    """Priority of metadata.

    An extractor is used if and only if the higher-priority extractors returned nothing.
    """

    override = 1
    specialized = 2
    normal = 3
    fallback = 4


class MetaSource(Enum):
    """Source of metadata."""

    text = 1
    filename = 2


def check_conditions(post, filename: str, conditions: list, config: dict, source_text: str) -> bool:
    """Check the conditions for a metadata extractor.

    ``conditions`` is an iterable of ``(MetaCondition, argument)`` pairs.
    Return True only if every pair is satisfied (an empty iterable is
    trivially satisfied):

    * ``config_bool``: ``config[argument]`` exists and is truthy.
    * ``config_present``: ``argument`` is a key of ``config``.
    * ``extension``: ``filename`` ends with ``argument``.
    * ``compiler``: ``post`` is not None and ``post.compiler.name == argument``.
    * ``first_line``: ``source_text`` is non-empty and starts with ``argument``
      followed by a newline.
    * ``never``: always fails.
    """
    for ct, arg in conditions:
        # Each entry is guarded by ``ct == ...`` so only the check that
        # belongs to this condition type is actually evaluated.
        if any((
            ct == MetaCondition.config_bool and not config.get(arg, False),
            ct == MetaCondition.config_present and arg not in config,
            ct == MetaCondition.extension and not filename.endswith(arg),
            ct == MetaCondition.compiler and (post is None or post.compiler.name != arg),
            ct == MetaCondition.never
        )):
            return False
        elif ct == MetaCondition.first_line:
            if not source_text or not source_text.startswith(arg + '\n'):
                return False
    return True


def classify_extractor(extractor: MetadataExtractor, metadata_extractors_by: dict):
    """Classify an extractor and add it to the metadata_extractors_by dict.

    The extractor is registered under its priority, its source, its name and
    in the ``'all'`` list.  If its name equals ``DEFAULT_EXTRACTOR_NAME`` it
    also becomes the module-level ``DEFAULT_EXTRACTOR``.
    """
    global DEFAULT_EXTRACTOR
    if extractor.name == DEFAULT_EXTRACTOR_NAME:
        DEFAULT_EXTRACTOR = extractor
    metadata_extractors_by['priority'][extractor.priority].append(extractor)
    metadata_extractors_by['source'][extractor.source].append(extractor)
    metadata_extractors_by['name'][extractor.name] = extractor
    metadata_extractors_by['all'].append(extractor)


def load_defaults(site, metadata_extractors_by: dict):
    """Load default metadata extractors.

    Every extractor registered with ``_register_default`` gets ``site``
    assigned to its ``site`` attribute and is then classified into
    ``metadata_extractors_by``.
    """
    for extractor in _default_extractors:
        extractor.site = site
        classify_extractor(extractor, metadata_extractors_by)


def is_extractor(extractor) -> bool:  # pragma: no cover
    """Check if a given object is an instance of MetadataExtractor."""
    return isinstance(extractor, MetadataExtractor)


def default_metadata_extractors_by() -> dict:
    """Return the default metadata_extractors_by dictionary.

    The result has an empty list for every ``MetaPriority`` under
    ``'priority'`` and for every ``MetaSource`` under ``'source'``, an empty
    dict under ``'name'`` and an empty list under ``'all'``.
    """
    return {
        'priority': {priority: [] for priority in MetaPriority},
        'source': {source: [] for source in MetaSource},
        'name': {},
        'all': []
    }


def _register_default(extractor: type) -> type:
    """Register a default extractor.

    Used as a class decorator: an instance of the class is appended to
    ``_default_extractors`` and the class itself is returned unchanged.
    """
    _default_extractors.append(extractor())
    return extractor


@_register_default
class NikolaMetadata(MetadataExtractor):
    """Extractor for Nikola-style metadata."""

    name = 'nikola'
    source = MetaSource.text
    priority = MetaPriority.normal
    supports_write = True
    split_metadata_re = re.compile('\n\n')
    # Matches lines of the form ``.. key: value`` (leading whitespace allowed).
    nikola_re = re.compile(r'^\s*\.\. (.*?): (.*)')
    map_from = 'nikola'  # advertised in values mapping only

    def _extract_metadata_from_text(self, source_text: str) -> dict:
        """Extract metadata from text.

        Every line matching ``nikola_re`` contributes one key/value pair.
        Pairs with an empty value are skipped; a later line with the same
        key overrides an earlier one.
        """
        outdict = {}
        for line in source_text.split('\n'):
            match = self.nikola_re.match(line)
            if match:
                k, v = match.group(1), match.group(2)
                if v:
                    outdict[k] = v
        return outdict

    def write_metadata(self, metadata: dict, comment_wrap=False) -> str:
        """Write metadata in this extractor's format.

        Well-known keys are emitted first in a fixed order, followed by the
        remaining keys in natural sort order.  ``metadata`` is not modified.

        ``comment_wrap`` may be falsy (no wrapping), ``True`` (wrap in an
        HTML comment) or a pair of ``(opening, closing)`` strings.  The
        returned text always ends with a blank line.
        """
        # Work on a copy so that popping keys does not affect the caller.
        metadata = metadata.copy()
        order = ('title', 'slug', 'date', 'tags', 'category', 'link', 'description', 'type')
        f = '.. {0}: {1}'
        meta = []
        for k in order:
            if k in metadata:
                meta.append(f.format(k, metadata.pop(k)))
        # Leftover metadata (user-specified/non-default).
        for k in natsort.natsorted(metadata, alg=natsort.ns.F | natsort.ns.IC):
            meta.append(f.format(k, metadata[k]))
        data = '\n'.join(meta)
        if comment_wrap is True:
            comment_wrap = ('<!--', '-->')
        if comment_wrap:
            return '\n'.join((comment_wrap[0], data, comment_wrap[1], '', ''))
        else:
            return data + '\n\n'


@_register_default
class YAMLMetadata(MetadataExtractor):
    """Extractor for YAML metadata."""

    name = 'yaml'
    source = MetaSource.text
    conditions = ((MetaCondition.first_line, '---'),)
    requirements = [('ruamel.yaml', 'ruamel.yaml', 'YAML')]
    supports_write = True
    split_metadata_re = re.compile('\n---\n')
    map_from = 'yaml'
    priority = MetaPriority.specialized

    def _extract_metadata_from_text(self, source_text: str) -> dict:
        """Extract metadata from text.

        The first four characters of ``source_text`` are dropped before
        parsing (the ``first_line`` condition requires the text to start
        with ``'---\\n'``).  ``None`` values are replaced with ``''``.
        An empty YAML document yields an empty dict.
        """
        from ruamel.yaml import YAML
        yaml = YAML(typ='safe')
        meta = yaml.load(source_text[4:])
        if meta is None:
            # An empty document loads as None; iterating it would raise TypeError.
            return {}
        # We expect empty metadata to be '', not None
        for k in meta:
            if meta[k] is None:
                meta[k] = ''
        return meta

    def write_metadata(self, metadata: dict, comment_wrap=False) -> str:
        """Write metadata in this extractor's format.

        The YAML dump is enclosed between ``---`` lines.  ``comment_wrap`` is
        accepted for signature compatibility but is not used here.
        """
        from ruamel.yaml import YAML
        yaml = YAML(typ='safe')
        yaml.default_flow_style = False
        stream = StringIO()
        yaml.dump(metadata, stream)
        return '\n'.join(('---', stream.getvalue().strip(), '---', ''))


@_register_default
class TOMLMetadata(MetadataExtractor):
    """Extractor for TOML metadata."""

    name = 'toml'
    source = MetaSource.text
    conditions = ((MetaCondition.first_line, '+++'),)
    requirements = [('toml', 'toml', 'TOML')]
    supports_write = True
    split_metadata_re = re.compile('\n\\+\\+\\+\n')
    map_from = 'toml'
    priority = MetaPriority.specialized

    def _extract_metadata_from_text(self, source_text: str) -> dict:
        """Extract metadata from text.

        The first four characters of ``source_text`` are dropped before
        parsing (the ``first_line`` condition requires the text to start
        with ``'+++\\n'``).
        """
        import toml
        return toml.loads(source_text[4:])

    def write_metadata(self, metadata: dict, comment_wrap=False) -> str:
        """Write metadata in this extractor's format.

        The TOML dump is enclosed between ``+++`` lines.  ``comment_wrap`` is
        accepted for signature compatibility but is not used here.
        """
        import toml
        return '\n'.join(('+++', toml.dumps(metadata).strip(), '+++', ''))


@_register_default
class FilenameRegexMetadata(MetadataExtractor):
    """Extractor for filename metadata."""

    name = 'filename_regex'
    source = MetaSource.filename
    priority = MetaPriority.fallback
    conditions = [(MetaCondition.config_bool, 'FILE_METADATA_REGEXP')]

    def _extract_metadata_from_text(self, source_text: str) -> dict:
        """Extract metadata from text."""
        # This extractor does not use the source text, and as such, this method returns an empty dict.
        return {}

    def extract_filename(self, filename: str, lang: str) -> dict:
        """Try to read the metadata from the filename based on the given re.

        This requires to use symbolic group names in the pattern.
        The part to read the metadata from the filename based on a regular
        expression is taken from Pelican - pelican/readers.py

        Group names are lower-cased and stripped to form the metadata keys.
        If the ``FILE_METADATA_UNSLUGIFY_TITLES`` setting is truthy, a matched
        ``title`` group is passed through ``unslugify``.  An empty dict is
        returned when the pattern does not match.
        """
        match = re.match(self.site.config['FILE_METADATA_REGEXP'], filename)
        meta = {}

        if match:
            for key, value in match.groupdict().items():
                k = key.lower().strip()  # metadata must be lowercase
                # Optional groups that did not take part in the match are
                # reported as None; those are stored as-is rather than
                # being handed to unslugify (presumably string-only).
                if (k == 'title' and value is not None
                        and self.site.config['FILE_METADATA_UNSLUGIFY_TITLES']):
                    meta[k] = unslugify(value, lang, discard_numbers=False)
                else:
                    meta[k] = value

        return meta
