Diffstat (limited to 'nikola/plugins/task/sitemap/__init__.py')
-rw-r--r--  nikola/plugins/task/sitemap/__init__.py  | 124
1 file changed, 95 insertions, 29 deletions
diff --git a/nikola/plugins/task/sitemap/__init__.py b/nikola/plugins/task/sitemap/__init__.py
index 147bd50..beac6cb 100644
--- a/nikola/plugins/task/sitemap/__init__.py
+++ b/nikola/plugins/task/sitemap/__init__.py
@@ -30,14 +30,16 @@
 import datetime
 import os
 try:
     from urlparse import urljoin, urlparse
+    import robotparser as robotparser
 except ImportError:
     from urllib.parse import urljoin, urlparse  # NOQA
+    import urllib.robotparser as robotparser  # NOQA
 
 from nikola.plugin_categories import LateTask
 from nikola.utils import config_changed
 
-header = """<?xml version="1.0" encoding="UTF-8"?>
+urlset_header = """<?xml version="1.0" encoding="UTF-8"?>
 <urlset
     xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
@@ -45,13 +47,29 @@ header = """<?xml version="1.0" encoding="UTF-8"?>
     http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
 """
 
-url_format = """ <url>
+loc_format = """ <url>
     <loc>{0}</loc>
     <lastmod>{1}</lastmod>
 </url>
 """
 
-get_lastmod = lambda p: datetime.datetime.fromtimestamp(os.stat(p).st_mtime).isoformat().split('T')[0]
+urlset_footer = "</urlset>"
+
+sitemapindex_header = """<?xml version="1.0" encoding="UTF-8"?>
+<sitemapindex
+    xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
+    http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
+"""
+
+sitemap_format = """ <sitemap>
+    <loc>{0}</loc>
+    <lastmod>{1}</lastmod>
+ </sitemap>
+"""
+
+sitemapindex_footer = "</sitemapindex>"
 
 
 def get_base_path(base):
@@ -80,12 +98,12 @@ def get_base_path(base):
 
 
 class Sitemap(LateTask):
-    """Generate google sitemap."""
+    """Generate a sitemap."""
 
     name = "sitemap"
 
     def gen_tasks(self):
-        """Generate Google sitemap."""
+        """Generate a sitemap."""
         kw = {
             "base_url": self.site.config["BASE_URL"],
             "site_url": self.site.config["SITE_URL"],
@@ -93,28 +111,32 @@ class Sitemap(LateTask):
             "strip_indexes": self.site.config["STRIP_INDEXES"],
             "index_file": self.site.config["INDEX_FILE"],
             "sitemap_include_fileless_dirs": self.site.config["SITEMAP_INCLUDE_FILELESS_DIRS"],
-            "mapped_extensions": self.site.config.get('MAPPED_EXTENSIONS', ['.html', '.htm', '.xml'])
+            "mapped_extensions": self.site.config.get('MAPPED_EXTENSIONS', ['.html', '.htm', '.xml', '.rss']),
+            "robots_exclusions": self.site.config["ROBOTS_EXCLUSIONS"]
         }
-        output_path = kw['output_folder']
-        sitemap_path = os.path.join(output_path, "sitemap.xml")
-        base_path = get_base_path(kw['base_url'])
-        locs = {}
         output = kw['output_folder']
         base_url = kw['base_url']
         mapped_exts = kw['mapped_extensions']
+        output_path = kw['output_folder']
+        sitemapindex_path = os.path.join(output_path, "sitemapindex.xml")
+        sitemap_path = os.path.join(output_path, "sitemap.xml")
+        base_path = get_base_path(kw['base_url'])
+        sitemapindex = {}
+        urlset = {}
+
         def scan_locs():
-            for root, dirs, files in os.walk(output):
+            for root, dirs, files in os.walk(output, followlinks=True):
                 if not dirs and not files and not kw['sitemap_include_fileless_dirs']:
                     continue  # Totally empty, not on sitemap
                 path = os.path.relpath(root, output)
                 # ignore the current directory.
                 path = (path.replace(os.sep, '/') + '/').replace('./', '')
-                lastmod = get_lastmod(root)
+                lastmod = self.get_lastmod(root)
                 loc = urljoin(base_url, base_path + path)
                 if kw['index_file'] in files and kw['strip_indexes']:
                     # ignore folders when not stripping urls
-                    locs[loc] = url_format.format(loc, lastmod)
+                    urlset[loc] = loc_format.format(loc, lastmod)
                 for fname in files:
                     if kw['strip_indexes'] and fname == kw['index_file']:
                         continue  # We already mapped the folder
@@ -124,38 +146,68 @@ class Sitemap(LateTask):
                     if path.endswith(kw['index_file']) and kw['strip_indexes']:
                         # ignore index files when stripping urls
                         continue
+                    if not robot_fetch(path):
+                        continue
                     if path.endswith('.html') or path.endswith('.htm'):
-                        if not u'<!doctype html' in codecs.open(real_path, 'r', 'utf8').read(1024).lower():
-                            # ignores "html" files without doctype
-                            # alexa-verify, google-site-verification, etc.
+                        try:
+                            if u'<!doctype html' not in codecs.open(real_path, 'r', 'utf8').read(1024).lower():
+                                # ignores "html" files without doctype
+                                # alexa-verify, google-site-verification, etc.
+                                continue
+                        except UnicodeDecodeError:
+                            # ignore ancient files
+                            # most non-utf8 files are worthless anyways
                             continue
-                    if path.endswith('.xml'):
-                        if not u'<rss' in codecs.open(real_path, 'r', 'utf8').read(512):
-                            # ignores all XML files except those presumed to be RSS
+                    """ put RSS in sitemapindex[] instead of in urlset[], sitemap_path is included after it is generated """
+                    if path.endswith('.xml') or path.endswith('.rss'):
+                        if u'<rss' in codecs.open(real_path, 'r', 'utf8').read(512) or u'<urlset' and path != sitemap_path:
+                            path = path.replace(os.sep, '/')
+                            lastmod = self.get_lastmod(real_path)
+                            loc = urljoin(base_url, base_path + path)
+                            sitemapindex[loc] = sitemap_format.format(loc, lastmod)
                             continue
+                        else:
+                            continue  # ignores all XML files except those presumed to be RSS
                     post = self.site.post_per_file.get(path)
-                    if post and (post.is_draft or post.is_retired or post.publish_later):
+                    if post and (post.is_draft or post.is_private or post.publish_later):
                         continue
                     path = path.replace(os.sep, '/')
-                    lastmod = get_lastmod(real_path)
+                    lastmod = self.get_lastmod(real_path)
                     loc = urljoin(base_url, base_path + path)
-                    locs[loc] = url_format.format(loc, lastmod)
+                    urlset[loc] = loc_format.format(loc, lastmod)
+
+        def robot_fetch(path):
+            for rule in kw["robots_exclusions"]:
+                robot = robotparser.RobotFileParser()
+                robot.parse(["User-Agent: *", "Disallow: {0}".format(rule)])
+                if not robot.can_fetch("*", '/' + path):
+                    return False  # not robot food
+            return True
 
         def write_sitemap():
             # Have to rescan, because files may have been added between
             # task dep scanning and task execution
             with codecs.open(sitemap_path, 'wb+', 'utf8') as outf:
-                outf.write(header)
-                for k in sorted(locs.keys()):
-                    outf.write(locs[k])
-                outf.write("</urlset>")
+                outf.write(urlset_header)
+                for k in sorted(urlset.keys()):
+                    outf.write(urlset[k])
+                outf.write(urlset_footer)
+            sitemap_url = urljoin(base_url, base_path + "sitemap.xml")
+            sitemapindex[sitemap_url] = sitemap_format.format(sitemap_url, self.get_lastmod(sitemap_path))
+
+        def write_sitemapindex():
+            with codecs.open(sitemapindex_path, 'wb+', 'utf8') as outf:
+                outf.write(sitemapindex_header)
+                for k in sorted(sitemapindex.keys()):
+                    outf.write(sitemapindex[k])
+                outf.write(sitemapindex_footer)
 
         # Yield a task to calculate the dependencies of the sitemap
         # Other tasks can depend on this output, instead of having
         # to scan locations.
         def scan_locs_task():
             scan_locs()
-            return {'locations': list(locs.keys())}
+            return {'locations': list(urlset.keys()) + list(sitemapindex.keys())}
 
         yield {
             "basename": "_scan_locs",
@@ -164,7 +216,7 @@
         }
         yield self.group_task()
 
-        task = {
+        yield {
             "basename": "sitemap",
             "name": sitemap_path,
             "targets": [sitemap_path],
@@ -174,7 +226,21 @@ class Sitemap(LateTask):
             "task_dep": ["render_site"],
             "calc_dep": ["_scan_locs:sitemap"],
         }
-        yield task
+        yield {
+            "basename": "sitemap",
+            "name": sitemapindex_path,
+            "targets": [sitemapindex_path],
+            "actions": [(write_sitemapindex,)],
+            "uptodate": [config_changed(kw)],
+            "clean": True,
+            "file_dep": [sitemap_path]
+        }
+
+    def get_lastmod(self, p):
+        if self.site.invariant:
+            return '2014-01-01'
+        else:
+            return datetime.datetime.fromtimestamp(os.stat(p).st_mtime).isoformat().split('T')[0]
 
 if __name__ == '__main__':
     import doctest
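
The patch reads a new ROBOTS_EXCLUSIONS option from the site configuration. A conf.py entry could look like the sketch below; the paths are invented for illustration and are not taken from the commit:

    # conf.py -- illustrative values only
    # Each entry is treated as a "Disallow" rule; matching output paths are
    # left out of the generated sitemap.xml / sitemapindex.xml.
    ROBOTS_EXCLUSIONS = ["/archive", "/drafts"]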
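
The exclusion check itself builds a tiny synthetic robots.txt per rule and asks the standard-library parser whether a wildcard user agent may fetch the path. A minimal standalone sketch of that technique (Python 3 import spelling; the sample paths and rules are made up, only the parsing pattern mirrors the patch):

    import urllib.robotparser as robotparser

    def robot_fetch(path, robots_exclusions):
        """Return True unless some exclusion rule disallows '/path' for '*'."""
        for rule in robots_exclusions:
            robot = robotparser.RobotFileParser()
            # Feed a two-line synthetic robots.txt containing just this rule.
            robot.parse(["User-Agent: *", "Disallow: {0}".format(rule)])
            if not robot.can_fetch("*", '/' + path):
                return False  # excluded, so leave it out of the sitemap
        return True

    print(robot_fetch("posts/index.html", ["/archive"]))   # True
    print(robot_fetch("archive/2013.html", ["/archive"]))  # False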
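
With this change, RSS feeds and the generated sitemap.xml are listed in sitemapindex.xml rather than in the urlset. A quick rendering of one index entry with the sitemap_format template added by the patch; the URL and date below are invented for illustration:

    sitemap_format = """ <sitemap>
        <loc>{0}</loc>
        <lastmod>{1}</lastmod>
     </sitemap>
    """
    print(sitemap_format.format("http://example.com/rss.xml", "2014-01-01"))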
