aboutsummaryrefslogtreecommitdiffstats
path: root/nikola/metadata_extractors.py
diff options
context:
space:
mode:
Diffstat (limited to 'nikola/metadata_extractors.py')
-rw-r--r--nikola/metadata_extractors.py274
1 files changed, 274 insertions, 0 deletions
diff --git a/nikola/metadata_extractors.py b/nikola/metadata_extractors.py
new file mode 100644
index 0000000..2377dc2
--- /dev/null
+++ b/nikola/metadata_extractors.py
@@ -0,0 +1,274 @@
+# -*- coding: utf-8 -*-
+
+# Copyright © 2012-2020 Chris Warrick, Roberto Alsina and others.
+
+# Permission is hereby granted, free of charge, to any
+# person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the
+# Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the
+# Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice
+# shall be included in all copies or substantial portions of
+# the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
+# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
+# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+"""Default metadata extractors and helper functions."""
+
+import re
+from enum import Enum
+from io import StringIO
+
+import natsort
+
+from nikola.plugin_categories import MetadataExtractor
+from nikola.utils import unslugify
+
+# Names exported by ``from nikola.metadata_extractors import *``.
+__all__ = ('MetaCondition', 'MetaPriority', 'MetaSource', 'check_conditions')
+# Instances of the built-in extractors: filled by _register_default() and
+# consumed by load_defaults().
+_default_extractors = []
+# Name of the extractor that classify_extractor() stores in DEFAULT_EXTRACTOR.
+DEFAULT_EXTRACTOR_NAME = 'nikola'
+# Stays None until classify_extractor() sees an extractor named DEFAULT_EXTRACTOR_NAME.
+DEFAULT_EXTRACTOR = None
+
+
+class MetaCondition(Enum):
+    """Kinds of conditions that decide whether an extractor applies.
+
+    Conditions are ``(MetaCondition, argument)`` pairs evaluated by
+    ``check_conditions``.
+    """
+
+    # The config option named by the argument must be truthy.
+    config_bool = 1
+    # The config option named by the argument must be present.
+    config_present = 2
+    # The filename must end with the argument.
+    extension = 3
+    # The post's compiler name must equal the argument.
+    compiler = 4
+    # The source text must start with the argument followed by a newline.
+    first_line = 5
+    # The condition is never satisfied.
+    never = -1
+
+
+class MetaPriority(Enum):
+    """Priority levels for metadata extractors.
+
+    An extractor is used if and only if the higher-priority extractors
+    returned nothing.  Members are declared from ``override`` down to
+    ``fallback``.
+    """
+
+    override = 1
+    specialized = 2
+    normal = 3
+    fallback = 4
+
+
+class MetaSource(Enum):
+    """Where an extractor reads its metadata from."""
+
+    # The text of the source file.
+    text = 1
+    # The name of the source file.
+    filename = 2
+
+
+def check_conditions(post, filename: str, conditions: list, config: dict, source_text: str) -> bool:
+    """Tell whether every condition of a metadata extractor is satisfied.
+
+    ``conditions`` holds ``(MetaCondition, argument)`` pairs.  The result is
+    ``False`` as soon as one pair fails, and ``True`` when none does (which
+    includes the case of no conditions at all).  Condition kinds that are not
+    handled below are ignored.
+    """
+    for kind, value in conditions:
+        if kind == MetaCondition.never:
+            return False
+        if kind == MetaCondition.config_bool:
+            failed = not config.get(value, False)
+        elif kind == MetaCondition.config_present:
+            failed = value not in config
+        elif kind == MetaCondition.extension:
+            failed = not filename.endswith(value)
+        elif kind == MetaCondition.compiler:
+            # Without a post there is no compiler to compare against.
+            failed = post is None or post.compiler.name != value
+        elif kind == MetaCondition.first_line:
+            # Missing or empty text cannot start with the marker line.
+            failed = not source_text or not source_text.startswith(value + '\n')
+        else:
+            failed = False
+        if failed:
+            return False
+    return True
+
+
+def classify_extractor(extractor: MetadataExtractor, metadata_extractors_by: dict):
+    """File an extractor under every index of ``metadata_extractors_by``.
+
+    The extractor is appended to the list for its priority, to the list for
+    its source and to the ``'all'`` list, and stored under its name.  When its
+    name equals ``DEFAULT_EXTRACTOR_NAME`` it also becomes the module-level
+    ``DEFAULT_EXTRACTOR``.
+    """
+    global DEFAULT_EXTRACTOR
+    if extractor.name == DEFAULT_EXTRACTOR_NAME:
+        DEFAULT_EXTRACTOR = extractor
+    # 'priority' and 'source' are both keyed by the attribute of the same name.
+    for index in ('priority', 'source'):
+        metadata_extractors_by[index][getattr(extractor, index)].append(extractor)
+    metadata_extractors_by['name'][extractor.name] = extractor
+    metadata_extractors_by['all'].append(extractor)
+
+
+def load_defaults(site, metadata_extractors_by: dict):
+    """Attach ``site`` to each built-in extractor and classify it.
+
+    Every instance in ``_default_extractors`` gets its ``site`` attribute set
+    and is then passed to ``classify_extractor``.
+    """
+    for default in _default_extractors:
+        default.site = site
+        classify_extractor(default, metadata_extractors_by)
+
+
+def is_extractor(extractor) -> bool:  # pragma: no cover
+    """Tell whether the given object is a ``MetadataExtractor`` instance."""
+    verdict = isinstance(extractor, MetadataExtractor)
+    return verdict
+
+
+def default_metadata_extractors_by() -> dict:
+    """Build a fresh, empty ``metadata_extractors_by`` mapping.
+
+    The ``'priority'`` and ``'source'`` entries get one empty list per
+    ``MetaPriority`` / ``MetaSource`` member; ``'name'`` starts as an empty
+    dict and ``'all'`` as an empty list.
+    """
+    return {
+        'priority': {priority: [] for priority in MetaPriority},
+        'source': {source: [] for source in MetaSource},
+        'name': {},
+        'all': [],
+    }
+
+
+def _register_default(extractor: type) -> type:
+    """Class decorator: instantiate ``extractor`` and remember the instance.
+
+    The new instance is appended to ``_default_extractors``; the class itself
+    is returned unchanged.
+    """
+    instance = extractor()
+    _default_extractors.append(instance)
+    return extractor
+
+
+@_register_default
+class NikolaMetadata(MetadataExtractor):
+    """Read and write Nikola's own ``.. key: value`` metadata lines."""
+
+    name = 'nikola'
+    source = MetaSource.text
+    priority = MetaPriority.normal
+    supports_write = True
+    split_metadata_re = re.compile('\n\n')
+    nikola_re = re.compile(r'^\s*\.\. (.*?): (.*)')
+    map_from = 'nikola'  # advertised in values mapping only
+
+    def _extract_metadata_from_text(self, source_text: str) -> dict:
+        """Collect every ``.. key: value`` pair with a non-empty value."""
+        matches = (self.nikola_re.match(line) for line in source_text.split('\n'))
+        # Lines that do not match, or whose value is empty, are skipped.
+        return {m.group(1): m.group(2) for m in matches if m and m.group(2)}
+
+    def write_metadata(self, metadata: dict, comment_wrap=False) -> str:
+        """Serialize ``metadata`` as ``.. key: value`` lines.
+
+        Well-known keys come first in a fixed order; the remaining keys follow
+        in natural sort order.  ``comment_wrap`` may be ``True`` (wrap in
+        ``<!--`` / ``-->``), a pair of opening/closing strings, or falsy (no
+        wrapping).  The caller's dict is not modified.
+        """
+        remaining = metadata.copy()
+        template = '.. {0}: {1}'
+        lines = []
+        for key in ('title', 'slug', 'date', 'tags', 'category', 'link', 'description', 'type'):
+            try:
+                value = remaining.pop(key)
+            except KeyError:
+                continue
+            lines.append(template.format(key, value))
+        # Leftover metadata (user-specified/non-default).
+        leftovers = natsort.natsorted(remaining.keys(), alg=natsort.ns.F | natsort.ns.IC)
+        lines.extend(template.format(key, remaining[key]) for key in leftovers)
+        body = '\n'.join(lines)
+        if comment_wrap is True:
+            comment_wrap = ('<!--', '-->')
+        if not comment_wrap:
+            return body + '\n\n'
+        return '\n'.join((comment_wrap[0], body, comment_wrap[1], '', ''))
+
+
+@_register_default
+class YAMLMetadata(MetadataExtractor):
+    """Extractor for YAML metadata delimited by ``---`` lines."""
+
+    name = 'yaml'
+    source = MetaSource.text
+    conditions = ((MetaCondition.first_line, '---'),)
+    requirements = [('ruamel.yaml', 'ruamel.yaml', 'YAML')]
+    supports_write = True
+    split_metadata_re = re.compile('\n---\n')
+    map_from = 'yaml'
+    priority = MetaPriority.specialized
+
+    def _extract_metadata_from_text(self, source_text: str) -> dict:
+        """Parse the YAML metadata block into a dict.
+
+        The first four characters (the ``---`` marker line) are skipped.
+        An empty block yields an empty dict, and keys whose value is ``None``
+        are normalized to ``''``.
+        """
+        from ruamel.yaml import YAML
+        yaml = YAML(typ='safe')
+        # An empty document loads as None; treat that as "no metadata"
+        # instead of failing when iterating below.
+        meta = yaml.load(source_text[4:]) or {}
+        # We expect empty metadata to be '', not None
+        for k in meta:
+            if meta[k] is None:
+                meta[k] = ''
+        return meta
+
+    def write_metadata(self, metadata: dict, comment_wrap=False) -> str:
+        """Serialize ``metadata`` as block-style YAML between ``---`` lines.
+
+        ``comment_wrap`` is accepted for interface compatibility and is not
+        used by this extractor.
+        """
+        from ruamel.yaml import YAML
+        yaml = YAML(typ='safe')
+        yaml.default_flow_style = False
+        stream = StringIO()
+        yaml.dump(metadata, stream)
+        return '\n'.join(('---', stream.getvalue().strip(), '---', ''))
+
+
+@_register_default
+class TOMLMetadata(MetadataExtractor):
+    """Extractor for TOML metadata delimited by ``+++`` lines."""
+
+    name = 'toml'
+    source = MetaSource.text
+    conditions = ((MetaCondition.first_line, '+++'),)
+    requirements = [('toml', 'toml', 'TOML')]
+    supports_write = True
+    split_metadata_re = re.compile('\n\\+\\+\\+\n')
+    map_from = 'toml'
+    priority = MetaPriority.specialized
+
+    def _extract_metadata_from_text(self, source_text: str) -> dict:
+        """Parse the TOML metadata block, skipping its first four characters."""
+        import toml
+        # The slice drops the leading '+++' marker line.
+        document = source_text[4:]
+        return toml.loads(document)
+
+    def write_metadata(self, metadata: dict, comment_wrap=False) -> str:
+        """Serialize ``metadata`` as TOML between ``+++`` lines.
+
+        ``comment_wrap`` is accepted for interface compatibility and is not
+        used by this extractor.
+        """
+        import toml
+        dumped = toml.dumps(metadata).strip()
+        return '\n'.join(('+++', dumped, '+++', ''))
+
+
+@_register_default
+class FilenameRegexMetadata(MetadataExtractor):
+    """Extractor that derives metadata from the file name via a regexp."""
+
+    name = 'filename_regex'
+    source = MetaSource.filename
+    priority = MetaPriority.fallback
+    conditions = [(MetaCondition.config_bool, 'FILE_METADATA_REGEXP')]
+
+    def _extract_metadata_from_text(self, source_text: str) -> dict:
+        """Return an empty dict: this extractor never reads the source text."""
+        return {}
+
+    def extract_filename(self, filename: str, lang: str) -> dict:
+        """Read metadata from ``filename`` using ``FILE_METADATA_REGEXP``.
+
+        The pattern has to use symbolic (named) groups; each group name,
+        lowercased and stripped, becomes a metadata key.  A ``title`` group is
+        passed through ``unslugify`` when ``FILE_METADATA_UNSLUGIFY_TITLES``
+        is enabled.  An empty dict is returned when the pattern does not match.
+
+        The approach is taken from Pelican - pelican/readers.py
+        """
+        match = re.match(self.site.config['FILE_METADATA_REGEXP'], filename)
+        if not match:
+            return {}
+
+        meta = {}
+        for group_name, value in match.groupdict().items():
+            key = group_name.lower().strip()  # metadata must be lowercase
+            # The unslugify option is only looked up for the title key.
+            pretty_title = key == 'title' and self.site.config['FILE_METADATA_UNSLUGIFY_TITLES']
+            meta[key] = unslugify(value, lang, discard_numbers=False) if pretty_title else value
+        return meta