Diffstat (limited to 'nikola/plugins/task/sitemap/__init__.py')
-rw-r--r--  nikola/plugins/task/sitemap/__init__.py  | 124
1 file changed, 95 insertions, 29 deletions
diff --git a/nikola/plugins/task/sitemap/__init__.py b/nikola/plugins/task/sitemap/__init__.py
index 147bd50..beac6cb 100644
--- a/nikola/plugins/task/sitemap/__init__.py
+++ b/nikola/plugins/task/sitemap/__init__.py
@@ -30,14 +30,16 @@
 import datetime
 import os
 try:
     from urlparse import urljoin, urlparse
+    import robotparser as robotparser
 except ImportError:
     from urllib.parse import urljoin, urlparse  # NOQA
+    import urllib.robotparser as robotparser  # NOQA
 
 from nikola.plugin_categories import LateTask
 from nikola.utils import config_changed
 
-header = """<?xml version="1.0" encoding="UTF-8"?>
+urlset_header = """<?xml version="1.0" encoding="UTF-8"?>
 <urlset
     xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
@@ -45,13 +47,29 @@ header = """<?xml version="1.0" encoding="UTF-8"?>
     http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
 """
 
-url_format = """ <url>
+loc_format = """ <url>
     <loc>{0}</loc>
     <lastmod>{1}</lastmod>
 </url>
 """
 
-get_lastmod = lambda p: datetime.datetime.fromtimestamp(os.stat(p).st_mtime).isoformat().split('T')[0]
+urlset_footer = "</urlset>"
+
+sitemapindex_header = """<?xml version="1.0" encoding="UTF-8"?>
+<sitemapindex
+    xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
+    http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
+"""
+
+sitemap_format = """ <sitemap>
+    <loc>{0}</loc>
+    <lastmod>{1}</lastmod>
+ </sitemap>
+"""
+
+sitemapindex_footer = "</sitemapindex>"
 
 
 def get_base_path(base):
@@ -80,12 +98,12 @@ def get_base_path(base):
 
 
 class Sitemap(LateTask):
-    """Generate google sitemap."""
+    """Generate a sitemap."""
 
     name = "sitemap"
 
     def gen_tasks(self):
-        """Generate Google sitemap."""
+        """Generate a sitemap."""
         kw = {
             "base_url": self.site.config["BASE_URL"],
             "site_url": self.site.config["SITE_URL"],
@@ -93,28 +111,32 @@ class Sitemap(LateTask):
             "strip_indexes": self.site.config["STRIP_INDEXES"],
             "index_file": self.site.config["INDEX_FILE"],
             "sitemap_include_fileless_dirs": self.site.config["SITEMAP_INCLUDE_FILELESS_DIRS"],
-            "mapped_extensions": self.site.config.get('MAPPED_EXTENSIONS', ['.html', '.htm', '.xml'])
+            "mapped_extensions": self.site.config.get('MAPPED_EXTENSIONS', ['.html', '.htm', '.xml', '.rss']),
+            "robots_exclusions": self.site.config["ROBOTS_EXCLUSIONS"]
         }
-        output_path = kw['output_folder']
-        sitemap_path = os.path.join(output_path, "sitemap.xml")
-        base_path = get_base_path(kw['base_url'])
-        locs = {}
         output = kw['output_folder']
         base_url = kw['base_url']
         mapped_exts = kw['mapped_extensions']
+        output_path = kw['output_folder']
+        sitemapindex_path = os.path.join(output_path, "sitemapindex.xml")
+        sitemap_path = os.path.join(output_path, "sitemap.xml")
+        base_path = get_base_path(kw['base_url'])
+        sitemapindex = {}
+        urlset = {}
+
         def scan_locs():
-            for root, dirs, files in os.walk(output):
+            for root, dirs, files in os.walk(output, followlinks=True):
                 if not dirs and not files and not kw['sitemap_include_fileless_dirs']:
                     continue  # Totally empty, not on sitemap
                 path = os.path.relpath(root, output)
                 # ignore the current directory.
                 path = (path.replace(os.sep, '/') + '/').replace('./', '')
-                lastmod = get_lastmod(root)
+                lastmod = self.get_lastmod(root)
                 loc = urljoin(base_url, base_path + path)
                 if kw['index_file'] in files and kw['strip_indexes']:
                     # ignore folders when not stripping urls
-                    locs[loc] = url_format.format(loc, lastmod)
+                    urlset[loc] = loc_format.format(loc, lastmod)
                 for fname in files:
                     if kw['strip_indexes'] and fname == kw['index_file']:
                         continue  # We already mapped the folder
@@ -124,38 +146,68 @@ class Sitemap(LateTask):
                     if path.endswith(kw['index_file']) and kw['strip_indexes']:
                         # ignore index files when stripping urls
                         continue
+                    if not robot_fetch(path):
+                        continue
                     if path.endswith('.html') or path.endswith('.htm'):
-                        if not u'<!doctype html' in codecs.open(real_path, 'r', 'utf8').read(1024).lower():
-                            # ignores "html" files without doctype
-                            # alexa-verify, google-site-verification, etc.
+                        try:
+                            if u'<!doctype html' not in codecs.open(real_path, 'r', 'utf8').read(1024).lower():
+                                # ignores "html" files without doctype
+                                # alexa-verify, google-site-verification, etc.
+                                continue
+                        except UnicodeDecodeError:
+                            # ignore ancient files
+                            # most non-utf8 files are worthless anyways
                             continue
-                    if path.endswith('.xml'):
-                        if not u'<rss' in codecs.open(real_path, 'r', 'utf8').read(512):
-                            # ignores all XML files except those presumed to be RSS
+                    """ put RSS in sitemapindex[] instead of in urlset[], sitemap_path is included after it is generated """
+                    if path.endswith('.xml') or path.endswith('.rss'):
+                        if u'<rss' in codecs.open(real_path, 'r', 'utf8').read(512) or u'<urlset' and path != sitemap_path:
+                            path = path.replace(os.sep, '/')
+                            lastmod = self.get_lastmod(real_path)
+                            loc = urljoin(base_url, base_path + path)
+                            sitemapindex[loc] = sitemap_format.format(loc, lastmod)
                             continue
+                        else:
+                            continue  # ignores all XML files except those presumed to be RSS
                     post = self.site.post_per_file.get(path)
-                    if post and (post.is_draft or post.is_retired or post.publish_later):
+                    if post and (post.is_draft or post.is_private or post.publish_later):
                         continue
                     path = path.replace(os.sep, '/')
-                    lastmod = get_lastmod(real_path)
+                    lastmod = self.get_lastmod(real_path)
                     loc = urljoin(base_url, base_path + path)
-                    locs[loc] = url_format.format(loc, lastmod)
+                    urlset[loc] = loc_format.format(loc, lastmod)
+
+        def robot_fetch(path):
+            for rule in kw["robots_exclusions"]:
+                robot = robotparser.RobotFileParser()
+                robot.parse(["User-Agent: *", "Disallow: {0}".format(rule)])
+                if not robot.can_fetch("*", '/' + path):
+                    return False  # not robot food
+            return True
 
         def write_sitemap():
             # Have to rescan, because files may have been added between
             # task dep scanning and task execution
             with codecs.open(sitemap_path, 'wb+', 'utf8') as outf:
-                outf.write(header)
-                for k in sorted(locs.keys()):
-                    outf.write(locs[k])
-                outf.write("</urlset>")
+                outf.write(urlset_header)
+                for k in sorted(urlset.keys()):
+                    outf.write(urlset[k])
+                outf.write(urlset_footer)
+            sitemap_url = urljoin(base_url, base_path + "sitemap.xml")
+            sitemapindex[sitemap_url] = sitemap_format.format(sitemap_url, self.get_lastmod(sitemap_path))
+
+        def write_sitemapindex():
+            with codecs.open(sitemapindex_path, 'wb+', 'utf8') as outf:
+                outf.write(sitemapindex_header)
+                for k in sorted(sitemapindex.keys()):
+                    outf.write(sitemapindex[k])
+                outf.write(sitemapindex_footer)
 
         # Yield a task to calculate the dependencies of the sitemap
         # Other tasks can depend on this output, instead of having
         # to scan locations.
         def scan_locs_task():
             scan_locs()
-            return {'locations': list(locs.keys())}
+            return {'locations': list(urlset.keys()) + list(sitemapindex.keys())}
 
         yield {
             "basename": "_scan_locs",
@@ -164,7 +216,7 @@
         }
         yield self.group_task()
 
-        task = {
+        yield {
             "basename": "sitemap",
             "name": sitemap_path,
             "targets": [sitemap_path],
@@ -174,7 +226,21 @@ class Sitemap(LateTask):
             "task_dep": ["render_site"],
             "calc_dep": ["_scan_locs:sitemap"],
         }
-        yield task
+        yield {
+            "basename": "sitemap",
+            "name": sitemapindex_path,
+            "targets": [sitemapindex_path],
+            "actions": [(write_sitemapindex,)],
+            "uptodate": [config_changed(kw)],
+            "clean": True,
+            "file_dep": [sitemap_path]
+        }
+
+    def get_lastmod(self, p):
+        if self.site.invariant:
+            return '2014-01-01'
+        else:
+            return datetime.datetime.fromtimestamp(os.stat(p).st_mtime).isoformat().split('T')[0]
 
 if __name__ == '__main__':
     import doctest
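
The patch reads a new ROBOTS_EXCLUSIONS option from the site configuration. A conf.py entry could look like the sketch below; the paths are invented for illustration and are not taken from the commit:

    # conf.py -- illustrative values only
    # Each entry is treated as a "Disallow" rule; matching output paths are
    # left out of the generated sitemap.xml / sitemapindex.xml.
    ROBOTS_EXCLUSIONS = ["/archive", "/drafts"]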
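
The exclusion check itself builds a tiny synthetic robots.txt per rule and asks the standard-library parser whether a wildcard user agent may fetch the path. A minimal standalone sketch of that technique (Python 3 import spelling; the sample paths and rules are made up, only the parsing pattern mirrors the patch):

    import urllib.robotparser as robotparser

    def robot_fetch(path, robots_exclusions):
        """Return True unless some exclusion rule disallows '/path' for '*'."""
        for rule in robots_exclusions:
            robot = robotparser.RobotFileParser()
            # Feed a two-line synthetic robots.txt containing just this rule.
            robot.parse(["User-Agent: *", "Disallow: {0}".format(rule)])
            if not robot.can_fetch("*", '/' + path):
                return False  # excluded, so leave it out of the sitemap
        return True

    print(robot_fetch("posts/index.html", ["/archive"]))   # True
    print(robot_fetch("archive/2013.html", ["/archive"]))  # False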
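
With this change, RSS feeds and the generated sitemap.xml are listed in sitemapindex.xml rather than in the urlset. A quick rendering of one index entry with the sitemap_format template added by the patch; the URL and date below are invented for illustration:

    sitemap_format = """ <sitemap>
        <loc>{0}</loc>
        <lastmod>{1}</lastmod>
     </sitemap>
    """
    print(sitemap_format.format("http://example.com/rss.xml", "2014-01-01"))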
