diff options
Diffstat (limited to 'nikola/post.py')
| -rw-r--r-- | nikola/post.py | 288 |
1 files changed, 229 insertions, 59 deletions
diff --git a/nikola/post.py b/nikola/post.py index 809e5b7..5060583 100644 --- a/nikola/post.py +++ b/nikola/post.py @@ -27,32 +27,43 @@ from __future__ import unicode_literals, print_function import codecs import os +import re +import sys +import string +import unidecode import lxml.html -from . import utils +from .utils import to_datetime, slugify __all__ = ['Post'] +TEASER_REGEXP = re.compile('<!--\s*TEASER_END(:(.+))?\s*-->', re.IGNORECASE) + class Post(object): """Represents a blog post or web page.""" - def __init__(self, source_path, cache_folder, destination, use_in_feeds, - translations, default_lang, blog_url, messages, template_name, - file_metadata_regexp=None): + def __init__( + self, source_path, cache_folder, destination, use_in_feeds, + translations, default_lang, base_url, messages, template_name, + file_metadata_regexp=None, tzinfo=None + ): """Initialize post. - The base path is the .txt post file. From it we calculate + The source path is the .txt post file. From it we calculate the meta file, as well as any translations available, and the .html fragment file path. """ self.translated_to = set([default_lang]) + self.tags = '' + self.date = None self.prev_post = None self.next_post = None - self.blog_url = blog_url + self.base_url = base_url self.is_draft = False + self.is_mathjax = False self.source_path = source_path # posts/blah.txt self.post_name = os.path.splitext(source_path)[0] # posts/blah # cache/posts/blah.html @@ -63,24 +74,27 @@ class Post(object): self.default_lang = default_lang self.messages = messages self.template_name = template_name - if os.path.isfile(self.metadata_path): - with codecs.open(self.metadata_path, "r", "utf8") as meta_file: - meta_data = meta_file.readlines() - while len(meta_data) < 6: - meta_data.append("") - (default_title, default_pagename, self.date, self.tags, - self.link, default_description) = [x.strip() for x in - meta_data][:6] - else: - (default_title, default_pagename, self.date, self.tags, - self.link, default_description) = utils.get_meta( - self.source_path, file_metadata_regexp) + self.meta = get_meta(self, file_metadata_regexp) + + default_title = self.meta.get('title', '') + default_pagename = self.meta.get('slug', '') + default_description = self.meta.get('description', '') + + for k, v in self.meta.items(): + if k not in ['title', 'slug', 'description']: + if sys.version_info[0] == 2: + setattr(self, unidecode.unidecode(unicode(k)), v) # NOQA + else: + setattr(self, k, v) if not default_title or not default_pagename or not self.date: - raise OSError("You must set a title and slug and date! [%s]" % - source_path) + raise OSError("You must set a title (found '{0}'), a slug (found " + "'{1}') and a date (found '{2}')! [in file " + "{3}]".format(default_title, default_pagename, + self.date, source_path)) - self.date = utils.to_datetime(self.date) + # If timezone is set, build localized datetime. + self.date = to_datetime(self.date, tzinfo) self.tags = [x.strip() for x in self.tags.split(',')] self.tags = [_f for _f in self.tags if _f] @@ -89,45 +103,30 @@ class Post(object): self.is_draft = 'draft' in self.tags self.tags = [t for t in self.tags if t != 'draft'] + # If mathjax is a tag, then enable mathjax rendering support + self.is_mathjax = 'mathjax' in self.tags + self.pagenames = {} self.titles = {} self.descriptions = {} - # Load internationalized titles - # TODO: this has gotten much too complicated. Rethink. + + # Load internationalized metadata for lang in translations: if lang == default_lang: self.titles[lang] = default_title self.pagenames[lang] = default_pagename self.descriptions[lang] = default_description else: - metadata_path = self.metadata_path + "." + lang - source_path = self.source_path + "." + lang - if os.path.isfile(source_path): + if os.path.isfile(self.source_path + "." + lang): self.translated_to.add(lang) - try: - if os.path.isfile(metadata_path): - with codecs.open( - metadata_path, "r", "utf8") as meta_file: - meta_data = [x.strip() for x in - meta_file.readlines()] - while len(meta_data) < 6: - meta_data.append("") - self.titles[lang] = meta_data[0] or default_title - self.pagenames[lang] = meta_data[1] or\ - default_pagename - self.descriptions[lang] = meta_data[5] or\ - default_description - else: - ttitle, ppagename, tmp1, tmp2, tmp3, ddescription = \ - utils.get_meta(source_path, file_metadata_regexp) - self.titles[lang] = ttitle or default_title - self.pagenames[lang] = ppagename or default_pagename - self.descriptions[lang] = ddescription or\ - default_description - except: - self.titles[lang] = default_title - self.pagenames[lang] = default_pagename - self.descriptions[lang] = default_description + + meta = self.meta.copy() + meta.update(get_meta(self, file_metadata_regexp, lang)) + + # FIXME this only gets three pieces of metadata from the i18n files + self.titles[lang] = meta.get('title', default_title) + self.pagenames[lang] = meta.get('slug', default_pagename) + self.descriptions[lang] = meta.get('description', default_description) def title(self, lang): """Return localized title.""" @@ -164,12 +163,12 @@ class Post(object): """Return path to the translation's file, or to the original.""" file_name = self.base_path if lang != self.default_lang: - file_name_lang = file_name + ".%s" % lang + file_name_lang = '.'.join((file_name, lang)) if os.path.exists(file_name_lang): file_name = file_name_lang return file_name - def text(self, lang, teaser_only=False): + def text(self, lang, teaser_only=False, strip_html=False): """Read the post file for that language and return its contents""" file_name = self._translated_file_path(lang) @@ -177,22 +176,30 @@ class Post(object): data = post_file.read() if data: - data = lxml.html.make_links_absolute(data, self.permalink()) + data = lxml.html.make_links_absolute(data, self.permalink(lang=lang)) if data and teaser_only: e = lxml.html.fromstring(data) teaser = [] + teaser_str = self.messages[lang]["Read more"] + '...' flag = False for elem in e: elem_string = lxml.html.tostring(elem).decode('utf8') - if '<!-- TEASER_END -->' in elem_string.upper(): + match = TEASER_REGEXP.match(elem_string) + if match: flag = True + if match.group(2): + teaser_str = match.group(2) break teaser.append(elem_string) if flag: - teaser.append('<p><a href="%s">%s...</a></p>' % - (self.permalink(lang), - self.messages[lang]["Read more"])) + teaser.append('<p><a href="{0}">{1}</a></p>'.format( + self.permalink(lang), teaser_str)) data = ''.join(teaser) + + if data and strip_html: + content = lxml.html.fromstring(data) + data = content.text_content().strip() # No whitespace wanted. + return data def destination_path(self, lang, extension='.html'): @@ -206,9 +213,9 @@ class Post(object): pieces = list(os.path.split(self.translations[lang])) pieces += list(os.path.split(self.folder)) pieces += [self.pagenames[lang] + extension] - pieces = [_f for _f in pieces if _f] + pieces = [_f for _f in pieces if _f and _f != '.'] if absolute: - pieces = [self.blog_url] + pieces + pieces = [self.base_url] + pieces else: pieces = [""] + pieces link = "/".join(pieces) @@ -216,3 +223,166 @@ class Post(object): def source_ext(self): return os.path.splitext(self.source_path)[1] + +# Code that fetches metadata from different places + + +def re_meta(line, match=None): + """re.compile for meta""" + if match: + reStr = re.compile('^\.\. {0}: (.*)'.format(re.escape(match))) + else: + reStr = re.compile('^\.\. (.*?): (.*)') + result = reStr.findall(line.strip()) + if match and result: + return (match, result[0]) + elif not match and result: + return (result[0][0], result[0][1].strip()) + else: + return (None,) + + +def _get_metadata_from_filename_by_regex(filename, metadata_regexp): + """ + Tries to ried the metadata from the filename based on the given re. + This requires to use symbolic group names in the pattern. + + The part to read the metadata from the filename based on a regular + expression is taken from Pelican - pelican/readers.py + """ + match = re.match(metadata_regexp, filename) + meta = {} + + if match: + # .items() for py3k compat. + for key, value in match.groupdict().items(): + meta[key.lower()] = value # metadata must be lowercase + + return meta + + +def get_metadata_from_file(source_path, lang=None): + """Extracts metadata from the file itself, by parsing contents.""" + try: + if lang: + source_path = "{0}.{1}".format(source_path, lang) + with codecs.open(source_path, "r", "utf8") as meta_file: + meta_data = [x.strip() for x in meta_file.readlines()] + return _get_metadata_from_file(meta_data) + except Exception: # The file may not exist, for multilingual sites + return {} + + +def _get_metadata_from_file(meta_data): + """Parse file contents and obtain metadata. + + >>> g = _get_metadata_from_file + >>> list(g([]).values()) + [] + >>> str(g(["FooBar","======"])["title"]) + 'FooBar' + >>> str(g(["#FooBar"])["title"]) + 'FooBar' + >>> str(g([".. title: FooBar"])["title"]) + 'FooBar' + >>> 'title' in g(["",".. title: FooBar"]) + False + + """ + meta = {} + + re_md_title = re.compile(r'^{0}([^{0}].*)'.format(re.escape('#'))) + # Assuming rst titles are going to be at least 4 chars long + # otherwise this detects things like ''' wich breaks other markups. + re_rst_title = re.compile(r'^([{0}]{{4,}})'.format(re.escape( + string.punctuation))) + + for i, line in enumerate(meta_data): + if not line: + break + if 'title' not in meta: + match = re_meta(line, 'title') + if match[0]: + meta['title'] = match[1] + if 'title' not in meta: + if re_rst_title.findall(line) and i > 0: + meta['title'] = meta_data[i - 1].strip() + if 'title' not in meta: + if re_md_title.findall(line): + meta['title'] = re_md_title.findall(line)[0] + + match = re_meta(line) + if match[0]: + meta[match[0]] = match[1] + + return meta + + +def get_metadata_from_meta_file(path, lang=None): + """Takes a post path, and gets data from a matching .meta file.""" + meta_path = os.path.splitext(path)[0] + '.meta' + if lang: + meta_path += '.' + lang + if os.path.isfile(meta_path): + with codecs.open(meta_path, "r", "utf8") as meta_file: + meta_data = meta_file.readlines() + while len(meta_data) < 6: + meta_data.append("") + (title, slug, date, tags, link, description) = [ + x.strip() for x in meta_data][:6] + + meta = {} + + if title: + meta['title'] = title + if slug: + meta['slug'] = slug + if date: + meta['date'] = date + if tags: + meta['tags'] = tags + if link: + meta['link'] = link + if description: + meta['description'] = description + + return meta + else: + return {} + + +def get_meta(post, file_metadata_regexp=None, lang=None): + """Get post's meta from source. + + If ``file_metadata_regexp`` is given it will be tried to read + metadata from the filename. + If any metadata is then found inside the file the metadata from the + file will override previous findings. + """ + meta = {} + + meta.update(get_metadata_from_meta_file(post.metadata_path, lang)) + + if meta: + return meta + + if file_metadata_regexp is not None: + meta.update(_get_metadata_from_filename_by_regex(post.source_path, + file_metadata_regexp)) + + meta.update(get_metadata_from_file(post.source_path, lang)) + + if lang is None: + # Only perform these checks for the default language + + if 'slug' not in meta: + # If no slug is found in the metadata use the filename + meta['slug'] = slugify(os.path.splitext( + os.path.basename(post.source_path))[0]) + + if 'title' not in meta: + # If no title is found, use the filename without extension + meta['title'] = os.path.splitext( + os.path.basename(post.source_path))[0] + + return meta |
