Diffstat (limited to 'nikola/plugins/task/sitemap/__init__.py')
-rw-r--r--  nikola/plugins/task/sitemap/__init__.py | 106
1 file changed, 79 insertions(+), 27 deletions(-)
diff --git a/nikola/plugins/task/sitemap/__init__.py b/nikola/plugins/task/sitemap/__init__.py
index 943e9b2..92d557d 100644
--- a/nikola/plugins/task/sitemap/__init__.py
+++ b/nikola/plugins/task/sitemap/__init__.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright © 2012-2014 Roberto Alsina and others.
+# Copyright © 2012-2015 Roberto Alsina and others.
# Permission is hereby granted, free of charge, to any
# person obtaining a copy of this software and associated
@@ -36,7 +36,7 @@ except ImportError:
import urllib.robotparser as robotparser # NOQA
from nikola.plugin_categories import LateTask
-from nikola.utils import config_changed
+from nikola.utils import config_changed, apply_filters
urlset_header = """<?xml version="1.0" encoding="UTF-8"?>
@@ -49,7 +49,7 @@ urlset_header = """<?xml version="1.0" encoding="UTF-8"?>
loc_format = """ <url>
<loc>{0}</loc>
- <lastmod>{1}</lastmod>
+ <lastmod>{1}</lastmod>{2}
</url>
"""
@@ -69,6 +69,9 @@ sitemap_format = """ <sitemap>
</sitemap>
"""
+alternates_format = """\n <xhtml:link rel="alternate" hreflang="{0}" href="{1}" />"""
+
+
sitemapindex_footer = "</sitemapindex>"
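
These templates compose per URL: loc_format takes the location, the last-modification date, and the joined alternates string, and alternates_format already carries its own leading newline and indentation. A minimal sketch of the composition, with invented URLs:

    # Hypothetical values, purely to show how the templates above combine.
    alt = alternates_format.format('es', 'https://example.com/es/posts/hello/')
    entry = loc_format.format('https://example.com/posts/hello/', '2015-01-10', alt)
    # entry now renders as:
    #  <url>
    #   <loc>https://example.com/posts/hello/</loc>
    #   <lastmod>2015-01-10</lastmod>
    #   <xhtml:link rel="alternate" hreflang="es" href="https://example.com/es/posts/hello/" />
    #  </url>
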
@@ -111,8 +114,10 @@ class Sitemap(LateTask):
"strip_indexes": self.site.config["STRIP_INDEXES"],
"index_file": self.site.config["INDEX_FILE"],
"sitemap_include_fileless_dirs": self.site.config["SITEMAP_INCLUDE_FILELESS_DIRS"],
- "mapped_extensions": self.site.config.get('MAPPED_EXTENSIONS', ['.html', '.htm', '.xml', '.rss']),
- "robots_exclusions": self.site.config["ROBOTS_EXCLUSIONS"]
+ "mapped_extensions": self.site.config.get('MAPPED_EXTENSIONS', ['.atom', '.html', '.htm', '.xml', '.rss']),
+ "robots_exclusions": self.site.config["ROBOTS_EXCLUSIONS"],
+ "filters": self.site.config["FILTERS"],
+ "translations": self.site.config["TRANSLATIONS"],
}
output = kw['output_folder']
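
The new '.atom' entry is what lets Atom feeds be scanned at all: mapped_extensions is the first gate applied to files found under the output folder. The gating code itself sits outside this hunk; a hedged sketch of the kind of check involved (function and variable names are assumptions, not from this diff):

    import os

    def is_mapped(fname, mapped_extensions):
        """Return True when the file's extension is one the sitemap maps."""
        return os.path.splitext(fname)[1] in mapped_extensions

    is_mapped('feed.atom', ['.atom', '.html', '.htm', '.xml', '.rss'])  # True
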
@@ -136,7 +141,17 @@ class Sitemap(LateTask):
lastmod = self.get_lastmod(root)
loc = urljoin(base_url, base_path + path)
if kw['index_file'] in files and kw['strip_indexes']: # ignore folders when not stripping urls
- urlset[loc] = loc_format.format(loc, lastmod)
+ post = self.site.post_per_file.get(path + kw['index_file'])
+ if post and (post.is_draft or post.is_private or post.publish_later):
+ continue
+ alternates = []
+ if post:
+ for lang in kw['translations']:
+ alt_url = post.permalink(lang=lang, absolute=True)
+ if loc == alt_url:
+ continue
+ alternates.append(alternates_format.format(lang, alt_url))
+ urlset[loc] = loc_format.format(loc, lastmod, ''.join(alternates))
for fname in files:
if kw['strip_indexes'] and fname == kw['index_file']:
continue # We already mapped the folder
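
The same lookup-and-filter pattern recurs for non-index files in a later hunk: resolve the path through site.post_per_file, skip drafts, private posts, and posts scheduled for later, then emit one alternate link per translation whose URL differs from the canonical location (the self-referential entry is deliberately dropped). A self-contained sketch of just the alternates collection, assuming a post object exposing permalink(lang=..., absolute=True) as used above:

    # Hedged sketch: gather alternate-language links for one post.
    def collect_alternates(post, loc, translations):
        alternates = []
        for lang in translations:
            alt_url = post.permalink(lang=lang, absolute=True)
            if alt_url != loc:  # skip the self-referential alternate
                alternates.append(alternates_format.format(lang, alt_url))
        return ''.join(alternates)
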
@@ -148,20 +163,30 @@ class Sitemap(LateTask):
continue
if not robot_fetch(path):
continue
+
+ # read in binary mode to make ancient files work
+ fh = open(real_path, 'rb')
+ filehead = fh.read(1024)
+ fh.close()
+
if path.endswith('.html') or path.endswith('.htm'):
- try:
- if u'<!doctype html' not in io.open(real_path, 'r', encoding='utf8').read(1024).lower():
- # ignores "html" files without doctype
- # alexa-verify, google-site-verification, etc.
- continue
- except UnicodeDecodeError:
- # ignore ancient files
- # most non-utf8 files are worthless anyways
+ """ ignores "html" files without doctype """
+ if b'<!doctype html' not in filehead.lower():
continue
- """ put RSS in sitemapindex[] instead of in urlset[], sitemap_path is included after it is generated """
- if path.endswith('.xml') or path.endswith('.rss'):
- filehead = io.open(real_path, 'r', encoding='utf8').read(512)
- if u'<rss' in filehead or (u'<urlset' in filehead and path != sitemap_path):
+
+ """ ignores "html" files with noindex robot directives """
+ robots_directives = [b'<meta content="noindex" name="robots"',
+ b'<meta content="none" name="robots"',
+ b'<meta name="robots" content="noindex"',
+ b'<meta name="robots" content="none"']
+ if any([robot_directive in filehead.lower() for robot_directive in robots_directives]):
+ continue
+
+ # put Atom and RSS in sitemapindex[] instead of in urlset[],
+ # sitemap_path is included after it is generated
+ if path.endswith('.xml') or path.endswith('.atom') or path.endswith('.rss'):
+ known_elm_roots = (b'<feed', b'<rss', b'<urlset')
+ if any([elm_root in filehead.lower() for elm_root in known_elm_roots]) and path != sitemap_path:
path = path.replace(os.sep, '/')
lastmod = self.get_lastmod(real_path)
loc = urljoin(base_url, base_path + path)
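
Reading the first kilobyte in binary replaces the old UnicodeDecodeError handling: doctype detection, the robots-meta check, and feed-root sniffing are all plain byte comparisons now, so non-UTF-8 files no longer need special-casing. A standalone sketch of the same sniffing (the file path is invented):

    # Hedged sketch: classify a file by its first 1024 bytes.
    with open('output/posts/hello/index.html', 'rb') as fh:
        filehead = fh.read(1024).lower()

    is_real_html = b'<!doctype html' in filehead
    is_noindex = any(d in filehead for d in (
        b'<meta content="noindex" name="robots"',
        b'<meta name="robots" content="noindex"'))
    is_feed_or_sitemap = any(r in filehead for r in (b'<feed', b'<rss', b'<urlset'))
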
@@ -175,7 +200,14 @@ class Sitemap(LateTask):
path = path.replace(os.sep, '/')
lastmod = self.get_lastmod(real_path)
loc = urljoin(base_url, base_path + path)
- urlset[loc] = loc_format.format(loc, lastmod)
+ alternates = []
+ if post:
+ for lang in kw['translations']:
+ alt_url = post.permalink(lang=lang, absolute=True)
+ if loc == alt_url:
+ continue
+ alternates.append(alternates_format.format(lang, alt_url))
+ urlset[loc] = loc_format.format(loc, lastmod, '\n'.join(alternates))
def robot_fetch(path):
for rule in kw["robots_exclusions"]:
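
robot_fetch, shown in context here, rests on the stdlib robotparser imported at the top of the module: each ROBOTS_EXCLUSIONS rule is checked against the path with can_fetch. A minimal sketch of that underlying API, with an invented exclusion:

    # Hedged sketch: how robotparser evaluates an exclusion rule.
    try:
        import robotparser  # Python 2
    except ImportError:
        import urllib.robotparser as robotparser  # Python 3

    rp = robotparser.RobotFileParser()
    rp.parse(['User-agent: *', 'Disallow: /private/'])
    rp.can_fetch('*', '/private/page.html')  # False
    rp.can_fetch('*', '/posts/hello.html')   # True
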
@@ -208,7 +240,27 @@ class Sitemap(LateTask):
# to scan locations.
def scan_locs_task():
scan_locs()
- return {'locations': list(urlset.keys()) + list(sitemapindex.keys())}
+
+ # Generate a list of file dependencies for the actual generation
+ # task, so rebuilds are triggered. (Issue #1032)
+ output = kw["output_folder"]
+ file_dep = []
+
+ for i in urlset.keys():
+ p = os.path.join(output, urlparse(i).path.replace(base_path, '', 1))
+ if not p.endswith('sitemap.xml') and not os.path.isdir(p):
+ file_dep.append(p)
+ if os.path.isdir(p) and os.path.exists(os.path.join(p, 'index.html')):
+ file_dep.append(p + 'index.html')
+
+ for i in sitemapindex.keys():
+ p = os.path.join(output, urlparse(i).path.replace(base_path, '', 1))
+ if not p.endswith('sitemap.xml') and not os.path.isdir(p):
+ file_dep.append(p)
+ if os.path.isdir(p) and os.path.exists(os.path.join(p, 'index.html')):
+ file_dep.append(p + 'index.html')
+
+ return {'file_dep': file_dep}
yield {
"basename": "_scan_locs",
@@ -217,29 +269,29 @@ class Sitemap(LateTask):
}
yield self.group_task()
- yield {
+ yield apply_filters({
"basename": "sitemap",
"name": sitemap_path,
"targets": [sitemap_path],
"actions": [(write_sitemap,)],
- "uptodate": [config_changed(kw)],
+ "uptodate": [config_changed(kw, 'nikola.plugins.task.sitemap:write')],
"clean": True,
"task_dep": ["render_site"],
"calc_dep": ["_scan_locs:sitemap"],
- }
- yield {
+ }, kw['filters'])
+ yield apply_filters({
"basename": "sitemap",
"name": sitemapindex_path,
"targets": [sitemapindex_path],
"actions": [(write_sitemapindex,)],
- "uptodate": [config_changed(kw)],
+ "uptodate": [config_changed(kw, 'nikola.plugins.task.sitemap:write_index')],
"clean": True,
"file_dep": [sitemap_path]
- }
+ }, kw['filters'])
def get_lastmod(self, p):
if self.site.invariant:
- return '2014-01-01'
+ return '2038-01-01'
else:
return datetime.datetime.fromtimestamp(os.stat(p).st_mtime).isoformat().split('T')[0]
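
Judging from the call sites above, apply_filters (newly imported from nikola.utils) takes a task dict plus the site's FILTERS mapping and attaches the matching filters to the task's targets, so the generated sitemap files pass through the same post-processing pipeline as rendered pages. A hedged usage sketch; the filter itself is an example, not something this diff configures:

    # Hedged sketch: with FILTERS = {'.xml': ["sed -i 's/  / /g' %s"]}
    # in conf.py, the sitemap target would be post-processed in place.
    task = {
        'basename': 'sitemap',
        'targets': ['output/sitemap.xml'],
        'actions': [(write_sitemap,)],
    }
    task = apply_filters(task, kw['filters'])
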