1 files changed, 274 insertions, 0 deletions
diff --git a/nikola/metadata_extractors.py b/nikola/metadata_extractors.py
new file mode 100644
index 0000000..2377dc2
--- /dev/null
+++ b/nikola/metadata_extractors.py
@@ -0,0 +1,274 @@
+# -*- coding: utf-8 -*-
+
+# Copyright © 2012-2020 Chris Warrick, Roberto Alsina and others.
+
+# Permission is hereby granted, free of charge, to any
+# person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the
+# Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the
+# Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice
+# shall be included in all copies or substantial portions of
+# the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
+# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
+# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+"""Default metadata extractors and helper functions."""
+
+import re
+from enum import Enum
+from io import StringIO
+
+import natsort
+
+from nikola.plugin_categories import MetadataExtractor
+from nikola.utils import unslugify
+
+__all__ = ('MetaCondition', 'MetaPriority', 'MetaSource', 'check_conditions')
+_default_extractors = []
+DEFAULT_EXTRACTOR_NAME = 'nikola'
+DEFAULT_EXTRACTOR = None
+
+
+class MetaCondition(Enum):
+    """Conditions for extracting metadata."""
+
+    config_bool = 1
+    config_present = 2
+    extension = 3
+    compiler = 4
+    first_line = 5
+    never = -1
+
+
+class MetaPriority(Enum):
+    """Priority of metadata.
+
+    An extractor is used if and only if the higher-priority extractors returned nothing.
+    """
+
+    override = 1
+    specialized = 2
+    normal = 3
+    fallback = 4
+
+
+class MetaSource(Enum):
+    """Source of metadata."""
+
+    text = 1
+    filename = 2
+
+
+def check_conditions(post, filename: str, conditions: list, config: dict, source_text: str) -> bool:
+    """Check the conditions for a metadata extractor."""
+    for ct, arg in conditions:
+        if any((
+            ct == MetaCondition.config_bool and not config.get(arg, False),
+            ct == MetaCondition.config_present and arg not in config,
+            ct == MetaCondition.extension and not filename.endswith(arg),
+            ct == MetaCondition.compiler and (post is None or post.compiler.name != arg),
+            ct == MetaCondition.never
+        )):
+            return False
+        elif ct == MetaCondition.first_line:
+            if not source_text or not source_text.startswith(arg + '\n'):
+                return False
+    return True
+
+
+def classify_extractor(extractor: MetadataExtractor, metadata_extractors_by: dict):
+    """Classify an extractor and add it to the metadata_extractors_by dict."""
+    global DEFAULT_EXTRACTOR
+    if extractor.name == DEFAULT_EXTRACTOR_NAME:
+        DEFAULT_EXTRACTOR = extractor
+    metadata_extractors_by['priority'][extractor.priority].append(extractor)
+    metadata_extractors_by['source'][extractor.source].append(extractor)
+    metadata_extractors_by['name'][extractor.name] = extractor
+    metadata_extractors_by['all'].append(extractor)
+
+
+def load_defaults(site, metadata_extractors_by: dict):
+    """Load default metadata extractors."""
+    for extractor in _default_extractors:
+        extractor.site = site
+        classify_extractor(extractor, metadata_extractors_by)
+
+
+def is_extractor(extractor) -> bool:  # pragma: no cover
+    """Check if a given class is an extractor."""
+    return isinstance(extractor, MetadataExtractor)
+
+
+def default_metadata_extractors_by() -> dict:
+    """Return the default metadata_extractors_by dictionary."""
+    d = {
+        'priority': {},
+        'source': {},
+        'name': {},
+        'all': []
+    }
+
+    for i in MetaPriority:
+        d['priority'][i] = []
+    for i in MetaSource:
+        d['source'][i] = []
+
+    return d
+
+
+def _register_default(extractor: type) -> type:
+    """Register a default extractor."""
+    _default_extractors.append(extractor())
+    return extractor
+
+
+@_register_default
+class NikolaMetadata(MetadataExtractor):
+    """Extractor for Nikola-style metadata."""
+
+    name = 'nikola'
+    source = MetaSource.text
+    priority = MetaPriority.normal
+    supports_write = True
+    split_metadata_re = re.compile('\n\n')
+    nikola_re = re.compile(r'^\s*\.\. (.*?): (.*)')
+    map_from = 'nikola'  # advertised in values mapping only
+
+    def _extract_metadata_from_text(self, source_text: str) -> dict:
+        """Extract metadata from text."""
+        outdict = {}
+        for line in source_text.split('\n'):
+            match = self.nikola_re.match(line)
+            if match:
+                k, v = match.group(1), match.group(2)
+                if v:
+                    outdict[k] = v
+        return outdict
+
+    def write_metadata(self, metadata: dict, comment_wrap=False) -> str:
+        """Write metadata in this extractor’s format."""
+        metadata = metadata.copy()
+        order = ('title', 'slug', 'date', 'tags', 'category', 'link', 'description', 'type')
+        f = '.. {0}: {1}'
+        meta = []
+        for k in order:
+            try:
+                meta.append(f.format(k, metadata.pop(k)))
+            except KeyError:
+                pass
+        # Leftover metadata (user-specified/non-default).
+        for k in natsort.natsorted(list(metadata.keys()), alg=natsort.ns.F | natsort.ns.IC):
+            meta.append(f.format(k, metadata[k]))
+        data = '\n'.join(meta)
+        if comment_wrap is True:
+            comment_wrap = ('<!--', '-->')
+        if comment_wrap:
+            return '\n'.join((comment_wrap[0], data, comment_wrap[1], '', ''))
+        else:
+            return data + '\n\n'
+
+
+@_register_default
+class YAMLMetadata(MetadataExtractor):
+    """Extractor for YAML metadata."""
+
+    name = 'yaml'
+    source = MetaSource.text
+    conditions = ((MetaCondition.first_line, '---'),)
+    requirements = [('ruamel.yaml', 'ruamel.yaml', 'YAML')]
+    supports_write = True
+    split_metadata_re = re.compile('\n---\n')
+    map_from = 'yaml'
+    priority = MetaPriority.specialized
+
+    def _extract_metadata_from_text(self, source_text: str) -> dict:
+        """Extract metadata from text."""
+        from ruamel.yaml import YAML
+        yaml = YAML(typ='safe')
+        meta = yaml.load(source_text[4:])
+        # We expect empty metadata to be '', not None
+        for k in meta:
+            if meta[k] is None:
+                meta[k] = ''
+        return meta
+
+    def write_metadata(self, metadata: dict, comment_wrap=False) -> str:
+        """Write metadata in this extractor’s format."""
+        from ruamel.yaml import YAML
+        yaml = YAML(typ='safe')
+        yaml.default_flow_style = False
+        stream = StringIO()
+        yaml.dump(metadata, stream)
+        stream.seek(0)
+        return '\n'.join(('---', stream.read().strip(), '---', ''))
+
+
+@_register_default
+class TOMLMetadata(MetadataExtractor):
+    """Extractor for TOML metadata."""
+
+    name = 'toml'
+    source = MetaSource.text
+    conditions = ((MetaCondition.first_line, '+++'),)
+    requirements = [('toml', 'toml', 'TOML')]
+    supports_write = True
+    split_metadata_re = re.compile('\n\\+\\+\\+\n')
+    map_from = 'toml'
+    priority = MetaPriority.specialized
+
+    def _extract_metadata_from_text(self, source_text: str) -> dict:
+        """Extract metadata from text."""
+        import toml
+        return toml.loads(source_text[4:])
+
+    def write_metadata(self, metadata: dict, comment_wrap=False) -> str:
+        """Write metadata in this extractor’s format."""
+        import toml
+        return '\n'.join(('+++', toml.dumps(metadata).strip(), '+++', ''))
+
+
+@_register_default
+class FilenameRegexMetadata(MetadataExtractor):
+    """Extractor for filename metadata."""
+
+    name = 'filename_regex'
+    source = MetaSource.filename
+    priority = MetaPriority.fallback
+    conditions = [(MetaCondition.config_bool, 'FILE_METADATA_REGEXP')]
+
+    def _extract_metadata_from_text(self, source_text: str) -> dict:
+        """Extract metadata from text."""
+        # This extractor does not use the source text, and as such, this method returns an empty dict.
+        return {}
+
+    def extract_filename(self, filename: str, lang: str) -> dict:
+        """Try to read the metadata from the filename based on the given re.
+
+        This requires to use symbolic group names in the pattern.
+        The part to read the metadata from the filename based on a regular
+        expression is taken from Pelican - pelican/readers.py
+        """
+        match = re.match(self.site.config['FILE_METADATA_REGEXP'], filename)
+        meta = {}
+
+        if match:
+            for key, value in match.groupdict().items():
+                k = key.lower().strip()  # metadata must be lowercase
+                if k == 'title' and self.site.config['FILE_METADATA_UNSLUGIFY_TITLES']:
+                    meta[k] = unslugify(value, lang, discard_numbers=False)
+                else:
+                    meta[k] = value
+
+        return meta