From b0b24795b24ee6809397fbbadf42f31f310a219f Mon Sep 17 00:00:00 2001
From: Agustin Henze <tin@sluc.org.ar>
Date: Wed, 8 Jul 2015 07:35:02 -0300
Subject: Imported Upstream version 7.6.0

---
 nikola/plugins/task/sitemap/__init__.py | 106 ++++++++++++++++++++++++--------
 1 file changed, 79 insertions(+), 27 deletions(-)

(limited to 'nikola/plugins/task/sitemap')
diff --git a/nikola/plugins/task/sitemap/__init__.py b/nikola/plugins/task/sitemap/__init__.py
index 943e9b2..92d557d 100644
--- a/nikola/plugins/task/sitemap/__init__.py
+++ b/nikola/plugins/task/sitemap/__init__.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright © 2012-2014 Roberto Alsina and others.
+# Copyright © 2012-2015 Roberto Alsina and others.
 
 # Permission is hereby granted, free of charge, to any
 # person obtaining a copy of this software and associated
@@ -36,7 +36,7 @@ except ImportError:
     import urllib.robotparser as robotparser  # NOQA
 
 from nikola.plugin_categories import LateTask
-from nikola.utils import config_changed
+from nikola.utils import config_changed, apply_filters
 
 
 urlset_header = """<?xml version="1.0" encoding="UTF-8"?>
@@ -49,7 +49,7 @@ urlset_header = """<?xml version="1.0" encoding="UTF-8"?>
 
 loc_format = """ <url>
   <loc>{0}</loc>
-  <lastmod>{1}</lastmod>
+  <lastmod>{1}</lastmod>{2}
  </url>
 """
 
@@ -69,6 +69,9 @@ sitemap_format = """ <sitemap>
  </sitemap>
 """
 
+alternates_format = """\n  <xhtml:link rel="alternate" hreflang="{0}" href="{1}" />"""
+
+
 sitemapindex_footer = "</sitemapindex>"
 
 
@@ -111,8 +114,10 @@ class Sitemap(LateTask):
             "strip_indexes": self.site.config["STRIP_INDEXES"],
             "index_file": self.site.config["INDEX_FILE"],
             "sitemap_include_fileless_dirs": self.site.config["SITEMAP_INCLUDE_FILELESS_DIRS"],
-            "mapped_extensions": self.site.config.get('MAPPED_EXTENSIONS', ['.html', '.htm', '.xml', '.rss']),
-            "robots_exclusions": self.site.config["ROBOTS_EXCLUSIONS"]
+            "mapped_extensions": self.site.config.get('MAPPED_EXTENSIONS', ['.atom', '.html', '.htm', '.xml', '.rss']),
+            "robots_exclusions": self.site.config["ROBOTS_EXCLUSIONS"],
+            "filters": self.site.config["FILTERS"],
+            "translations": self.site.config["TRANSLATIONS"],
         }
 
         output = kw['output_folder']
@@ -136,7 +141,17 @@ class Sitemap(LateTask):
                 lastmod = self.get_lastmod(root)
                 loc = urljoin(base_url, base_path + path)
                 if kw['index_file'] in files and kw['strip_indexes']:  # ignore folders when not stripping urls
-                    urlset[loc] = loc_format.format(loc, lastmod)
+                    post = self.site.post_per_file.get(path + kw['index_file'])
+                    if post and (post.is_draft or post.is_private or post.publish_later):
+                        continue
+                    alternates = []
+                    if post:
+                        for lang in kw['translations']:
+                            alt_url = post.permalink(lang=lang, absolute=True)
+                            if loc == alt_url:
+                                continue
+                            alternates.append(alternates_format.format(lang, alt_url))
+                    urlset[loc] = loc_format.format(loc, lastmod, ''.join(alternates))
                 for fname in files:
                     if kw['strip_indexes'] and fname == kw['index_file']:
                         continue  # We already mapped the folder
@@ -148,20 +163,30 @@ class Sitemap(LateTask):
                             continue
                         if not robot_fetch(path):
                             continue
+
+                        # read in binary mode to make ancient files work
+                        fh = open(real_path, 'rb')
+                        filehead = fh.read(1024)
+                        fh.close()
+
                         if path.endswith('.html') or path.endswith('.htm'):
-                            try:
-                                if u'<!doctype html' not in io.open(real_path, 'r', encoding='utf8').read(1024).lower():
-                                    # ignores "html" files without doctype
-                                    # alexa-verify, google-site-verification, etc.
-                                    continue
-                            except UnicodeDecodeError:
-                                # ignore ancient files
-                                # most non-utf8 files are worthless anyways
+                            """ ignores "html" files without doctype """
+                            if b'<!doctype html' not in filehead.lower():
                                 continue
-                        """ put RSS in sitemapindex[] instead of in urlset[], sitemap_path is included after it is generated """
-                        if path.endswith('.xml') or path.endswith('.rss'):
-                            filehead = io.open(real_path, 'r', encoding='utf8').read(512)
-                            if u'<rss' in filehead or (u'<urlset' in filehead and path != sitemap_path):
+
+                            """ ignores "html" files with noindex robot directives """
+                            robots_directives = [b'<meta content="noindex" name="robots"',
+                                                 b'<meta content="none" name="robots"',
+                                                 b'<meta name="robots" content="noindex"',
+                                                 b'<meta name="robots" content="none"']
+                            if any([robot_directive in filehead.lower() for robot_directive in robots_directives]):
+                                continue
+
+                        # put Atom and RSS in sitemapindex[] instead of in urlset[],
+                        # sitemap_path is included after it is generated
+                        if path.endswith('.xml') or path.endswith('.atom') or path.endswith('.rss'):
+                            known_elm_roots = (b'<feed', b'<rss', b'<urlset')
+                            if any([elm_root in filehead.lower() for elm_root in known_elm_roots]) and path != sitemap_path:
                                 path = path.replace(os.sep, '/')
                                 lastmod = self.get_lastmod(real_path)
                                 loc = urljoin(base_url, base_path + path)
@@ -175,7 +200,14 @@ class Sitemap(LateTask):
                         path = path.replace(os.sep, '/')
                         lastmod = self.get_lastmod(real_path)
                         loc = urljoin(base_url, base_path + path)
-                        urlset[loc] = loc_format.format(loc, lastmod)
+                        alternates = []
+                        if post:
+                            for lang in kw['translations']:
+                                alt_url = post.permalink(lang=lang, absolute=True)
+                                if loc == alt_url:
+                                    continue
+                                alternates.append(alternates_format.format(lang, alt_url))
+                        urlset[loc] = loc_format.format(loc, lastmod, '\n'.join(alternates))
 
         def robot_fetch(path):
             for rule in kw["robots_exclusions"]:
@@ -208,7 +240,27 @@ class Sitemap(LateTask):
         # to scan locations.
         def scan_locs_task():
             scan_locs()
-            return {'locations': list(urlset.keys()) + list(sitemapindex.keys())}
+
+            # Generate a list of file dependencies for the actual generation
+            # task, so rebuilds are triggered.  (Issue #1032)
+            output = kw["output_folder"]
+            file_dep = []
+
+            for i in urlset.keys():
+                p = os.path.join(output, urlparse(i).path.replace(base_path, '', 1))
+                if not p.endswith('sitemap.xml') and not os.path.isdir(p):
+                    file_dep.append(p)
+                if os.path.isdir(p) and os.path.exists(os.path.join(p, 'index.html')):
+                    file_dep.append(p + 'index.html')
+
+            for i in sitemapindex.keys():
+                p = os.path.join(output, urlparse(i).path.replace(base_path, '', 1))
+                if not p.endswith('sitemap.xml') and not os.path.isdir(p):
+                    file_dep.append(p)
+                if os.path.isdir(p) and os.path.exists(os.path.join(p, 'index.html')):
+                    file_dep.append(p + 'index.html')
+
+            return {'file_dep': file_dep}
 
         yield {
             "basename": "_scan_locs",
@@ -217,29 +269,29 @@ class Sitemap(LateTask):
         }
 
         yield self.group_task()
-        yield {
+        yield apply_filters({
             "basename": "sitemap",
             "name": sitemap_path,
             "targets": [sitemap_path],
             "actions": [(write_sitemap,)],
-            "uptodate": [config_changed(kw)],
+            "uptodate": [config_changed(kw, 'nikola.plugins.task.sitemap:write')],
             "clean": True,
             "task_dep": ["render_site"],
             "calc_dep": ["_scan_locs:sitemap"],
-        }
-        yield {
+        }, kw['filters'])
+        yield apply_filters({
             "basename": "sitemap",
             "name": sitemapindex_path,
             "targets": [sitemapindex_path],
             "actions": [(write_sitemapindex,)],
-            "uptodate": [config_changed(kw)],
+            "uptodate": [config_changed(kw, 'nikola.plugins.task.sitemap:write_index')],
             "clean": True,
             "file_dep": [sitemap_path]
-        }
+        }, kw['filters'])
 
     def get_lastmod(self, p):
         if self.site.invariant:
-            return '2014-01-01'
+            return '2038-01-01'
         else:
             return datetime.datetime.fromtimestamp(os.stat(p).st_mtime).isoformat().split('T')[0]
 
-- 
cgit v1.2.3