Diffstat (limited to 'nikola/plugins/task/sitemap/__init__.py')
| -rw-r--r-- | nikola/plugins/task/sitemap/__init__.py | 106 |
1 file changed, 79 insertions, 27 deletions
diff --git a/nikola/plugins/task/sitemap/__init__.py b/nikola/plugins/task/sitemap/__init__.py
index 943e9b2..92d557d 100644
--- a/nikola/plugins/task/sitemap/__init__.py
+++ b/nikola/plugins/task/sitemap/__init__.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright © 2012-2014 Roberto Alsina and others.
+# Copyright © 2012-2015 Roberto Alsina and others.
 
 # Permission is hereby granted, free of charge, to any
 # person obtaining a copy of this software and associated
@@ -36,7 +36,7 @@ except ImportError:
     import urllib.robotparser as robotparser  # NOQA
 
 from nikola.plugin_categories import LateTask
-from nikola.utils import config_changed
+from nikola.utils import config_changed, apply_filters
 
 
 urlset_header = """<?xml version="1.0" encoding="UTF-8"?>
@@ -49,7 +49,7 @@ urlset_header = """<?xml version="1.0" encoding="UTF-8"?>
 
 loc_format = """ <url>
   <loc>{0}</loc>
-  <lastmod>{1}</lastmod>
+  <lastmod>{1}</lastmod>{2}
  </url>
 """
@@ -69,6 +69,9 @@ sitemap_format = """ <sitemap>
  </sitemap>
 """
 
+alternates_format = """\n <xhtml:link rel="alternate" hreflang="{0}" href="{1}" />"""
+
+
 sitemapindex_footer = "</sitemapindex>"
 
 
@@ -111,8 +114,10 @@ class Sitemap(LateTask):
             "strip_indexes": self.site.config["STRIP_INDEXES"],
             "index_file": self.site.config["INDEX_FILE"],
             "sitemap_include_fileless_dirs": self.site.config["SITEMAP_INCLUDE_FILELESS_DIRS"],
-            "mapped_extensions": self.site.config.get('MAPPED_EXTENSIONS', ['.html', '.htm', '.xml', '.rss']),
-            "robots_exclusions": self.site.config["ROBOTS_EXCLUSIONS"]
+            "mapped_extensions": self.site.config.get('MAPPED_EXTENSIONS', ['.atom', '.html', '.htm', '.xml', '.rss']),
+            "robots_exclusions": self.site.config["ROBOTS_EXCLUSIONS"],
+            "filters": self.site.config["FILTERS"],
+            "translations": self.site.config["TRANSLATIONS"],
         }
 
         output = kw['output_folder']
@@ -136,7 +141,17 @@ class Sitemap(LateTask):
                 lastmod = self.get_lastmod(root)
                 loc = urljoin(base_url, base_path + path)
                 if kw['index_file'] in files and kw['strip_indexes']:  # ignore folders when not stripping urls
-                    urlset[loc] = loc_format.format(loc, lastmod)
+                    post = self.site.post_per_file.get(path + kw['index_file'])
+                    if post and (post.is_draft or post.is_private or post.publish_later):
+                        continue
+                    alternates = []
+                    if post:
+                        for lang in kw['translations']:
+                            alt_url = post.permalink(lang=lang, absolute=True)
+                            if loc == alt_url:
+                                continue
+                            alternates.append(alternates_format.format(lang, alt_url))
+                    urlset[loc] = loc_format.format(loc, lastmod, ''.join(alternates))
                 for fname in files:
                     if kw['strip_indexes'] and fname == kw['index_file']:
                         continue  # We already mapped the folder
@@ -148,20 +163,30 @@ class Sitemap(LateTask):
                             continue
                         if not robot_fetch(path):
                             continue
+
+                        # read in binary mode to make ancient files work
+                        fh = open(real_path, 'rb')
+                        filehead = fh.read(1024)
+                        fh.close()
+
                         if path.endswith('.html') or path.endswith('.htm'):
-                            try:
-                                if u'<!doctype html' not in io.open(real_path, 'r', encoding='utf8').read(1024).lower():
-                                    # ignores "html" files without doctype
-                                    # alexa-verify, google-site-verification, etc.
-                                    continue
-                            except UnicodeDecodeError:
-                                # ignore ancient files
-                                # most non-utf8 files are worthless anyways
+                            """ ignores "html" files without doctype """
+                            if b'<!doctype html' not in filehead.lower():
                                 continue
-                        """ put RSS in sitemapindex[] instead of in urlset[], sitemap_path is included after it is generated """
-                        if path.endswith('.xml') or path.endswith('.rss'):
-                            filehead = io.open(real_path, 'r', encoding='utf8').read(512)
-                            if u'<rss' in filehead or (u'<urlset' in filehead and path != sitemap_path):
+
+                            """ ignores "html" files with noindex robot directives """
+                            robots_directives = [b'<meta content="noindex" name="robots"',
+                                                 b'<meta content="none" name="robots"',
+                                                 b'<meta name="robots" content="noindex"',
+                                                 b'<meta name="robots" content="none"']
+                            if any([robot_directive in filehead.lower() for robot_directive in robots_directives]):
+                                continue
+
+                        # put Atom and RSS in sitemapindex[] instead of in urlset[],
+                        # sitemap_path is included after it is generated
+                        if path.endswith('.xml') or path.endswith('.atom') or path.endswith('.rss'):
+                            known_elm_roots = (b'<feed', b'<rss', b'<urlset')
+                            if any([elm_root in filehead.lower() for elm_root in known_elm_roots]) and path != sitemap_path:
                                 path = path.replace(os.sep, '/')
                                 lastmod = self.get_lastmod(real_path)
                                 loc = urljoin(base_url, base_path + path)
@@ -175,7 +200,14 @@ class Sitemap(LateTask):
                         path = path.replace(os.sep, '/')
                         lastmod = self.get_lastmod(real_path)
                         loc = urljoin(base_url, base_path + path)
-                        urlset[loc] = loc_format.format(loc, lastmod)
+                        alternates = []
+                        if post:
+                            for lang in kw['translations']:
+                                alt_url = post.permalink(lang=lang, absolute=True)
+                                if loc == alt_url:
+                                    continue
+                                alternates.append(alternates_format.format(lang, alt_url))
+                        urlset[loc] = loc_format.format(loc, lastmod, '\n'.join(alternates))
 
         def robot_fetch(path):
             for rule in kw["robots_exclusions"]:
@@ -208,7 +240,27 @@ class Sitemap(LateTask):
         # to scan locations.
         def scan_locs_task():
             scan_locs()
-            return {'locations': list(urlset.keys()) + list(sitemapindex.keys())}
+
+            # Generate a list of file dependencies for the actual generation
+            # task, so rebuilds are triggered.  (Issue #1032)
+            output = kw["output_folder"]
+            file_dep = []
+
+            for i in urlset.keys():
+                p = os.path.join(output, urlparse(i).path.replace(base_path, '', 1))
+                if not p.endswith('sitemap.xml') and not os.path.isdir(p):
+                    file_dep.append(p)
+                if os.path.isdir(p) and os.path.exists(os.path.join(p, 'index.html')):
+                    file_dep.append(p + 'index.html')
+
+            for i in sitemapindex.keys():
+                p = os.path.join(output, urlparse(i).path.replace(base_path, '', 1))
+                if not p.endswith('sitemap.xml') and not os.path.isdir(p):
+                    file_dep.append(p)
+                if os.path.isdir(p) and os.path.exists(os.path.join(p, 'index.html')):
+                    file_dep.append(p + 'index.html')
+
+            return {'file_dep': file_dep}
 
         yield {
             "basename": "_scan_locs",
@@ -217,29 +269,29 @@ class Sitemap(LateTask):
         }
 
         yield self.group_task()
 
-        yield {
+        yield apply_filters({
            "basename": "sitemap",
            "name": sitemap_path,
            "targets": [sitemap_path],
            "actions": [(write_sitemap,)],
-           "uptodate": [config_changed(kw)],
+           "uptodate": [config_changed(kw, 'nikola.plugins.task.sitemap:write')],
            "clean": True,
            "task_dep": ["render_site"],
            "calc_dep": ["_scan_locs:sitemap"],
-        }
-        yield {
+        }, kw['filters'])
+        yield apply_filters({
            "basename": "sitemap",
            "name": sitemapindex_path,
            "targets": [sitemapindex_path],
            "actions": [(write_sitemapindex,)],
-           "uptodate": [config_changed(kw)],
+           "uptodate": [config_changed(kw, 'nikola.plugins.task.sitemap:write_index')],
            "clean": True,
            "file_dep": [sitemap_path]
-        }
+        }, kw['filters'])
 
     def get_lastmod(self, p):
         if self.site.invariant:
-            return '2014-01-01'
+            return '2038-01-01'
         else:
             return datetime.datetime.fromtimestamp(os.stat(p).st_mtime).isoformat().split('T')[0]
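
The centerpiece of the patch is the per-language alternates block: each sitemap <url> entry for a translated post now carries one <xhtml:link rel="alternate"> line per other language. The following is a minimal standalone sketch of that assembly; the URLs, language codes, and the permalinks dict are hypothetical stand-ins (in the plugin the URLs come from post.permalink(lang=lang, absolute=True) and the languages from kw['translations']):

# Templates copied from the patch; inputs below are hypothetical examples.
alternates_format = """\n <xhtml:link rel="alternate" hreflang="{0}" href="{1}" />"""

loc_format = """ <url>
  <loc>{0}</loc>
  <lastmod>{1}</lastmod>{2}
 </url>
"""

loc = 'https://example.com/posts/hello/'
permalinks = {
    'en': 'https://example.com/posts/hello/',
    'de': 'https://example.com/de/posts/hello/',
}

alternates = []
for lang, alt_url in permalinks.items():
    if alt_url == loc:
        continue  # a URL is never listed as its own alternate
    alternates.append(alternates_format.format(lang, alt_url))

# ''.join suffices because each alternates entry already starts with '\n'
print(loc_format.format(loc, '2015-01-01', ''.join(alternates)))

Note one asymmetry in the patch itself: the folder branch joins alternates with ''.join while the per-file branch uses '\n'.join; since alternates_format already begins with '\n', the latter leaves a blank line between consecutive alternates. The output is cosmetically different but equally valid XML.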

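The other substantive change is the switch from decoding files as UTF-8 (which raised UnicodeDecodeError on "ancient" non-UTF-8 files) to sniffing the first 1024 bytes in binary mode. Below is a condensed, self-contained restatement of that classification logic. classify() and its return values are illustrative names, not the plugin's API, and the handling of XML files that match none of the known feed roots is an assumption, since the hunk ends before that branch:

import os

ROBOTS_DIRECTIVES = [b'<meta content="noindex" name="robots"',
                     b'<meta content="none" name="robots"',
                     b'<meta name="robots" content="noindex"',
                     b'<meta name="robots" content="none"']
KNOWN_ELM_ROOTS = (b'<feed', b'<rss', b'<urlset')


def classify(real_path):
    """Return 'page' (urlset), 'feed' (sitemapindex), or None (skip)."""
    # Binary read avoids UnicodeDecodeError on legacy encodings.
    with open(real_path, 'rb') as fh:
        filehead = fh.read(1024).lower()
    ext = os.path.splitext(real_path)[-1]
    if ext in ('.html', '.htm'):
        if b'<!doctype html' not in filehead:
            return None  # verification stubs etc. without a doctype
        if any(directive in filehead for directive in ROBOTS_DIRECTIVES):
            return None  # page explicitly opted out of indexing
        return 'page'
    if ext in ('.xml', '.atom', '.rss'):
        if any(root in filehead for root in KNOWN_ELM_ROOTS):
            return 'feed'  # goes into sitemapindex[], not urlset[]
        return None  # assumed: other XML stays out of the urlset
    return 'page'

Reading the head once and reusing it for the doctype, noindex, and feed-root checks is what lets the patch drop the old try/except around io.open entirely.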