author     Unit 193 <unit193@unit193.net>  2021-02-03 19:17:00 -0500
committer  Unit 193 <unit193@unit193.net>  2021-02-03 19:17:00 -0500
commit     3a0d66f07b112b6d2bdc2b57bbf717a89a351ce6 (patch)
tree       a7cf56282e54f05785243bc1e903d6594f2c06ba /nikola/plugins/task/sitemap.py
parent     787b97a4cb24330b36f11297c6d3a7a473a907d0 (diff)
New upstream version 8.1.2 (tag: upstream/8.1.2)
Diffstat (limited to 'nikola/plugins/task/sitemap.py')
-rw-r--r--  nikola/plugins/task/sitemap.py  318
1 file changed, 318 insertions, 0 deletions
diff --git a/nikola/plugins/task/sitemap.py b/nikola/plugins/task/sitemap.py
new file mode 100644
index 0000000..8bbaa63
--- /dev/null
+++ b/nikola/plugins/task/sitemap.py
@@ -0,0 +1,318 @@
+# -*- coding: utf-8 -*-
+
+# Copyright © 2012-2020 Roberto Alsina and others.
+
+# Permission is hereby granted, free of charge, to any
+# person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the
+# Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the
+# Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice
+# shall be included in all copies or substantial portions of
+# the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
+# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
+# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+"""Generate a sitemap."""
+
+import datetime
+import io
+import os
+import urllib.robotparser as robotparser
+from urllib.parse import urljoin, urlparse
+
+import dateutil.tz
+
+from nikola.plugin_categories import LateTask
+from nikola.utils import apply_filters, config_changed, encodelink
+
+
+urlset_header = """<?xml version="1.0" encoding="UTF-8"?>
+<urlset
+    xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
+    xmlns:xhtml="http://www.w3.org/1999/xhtml"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
+                http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
+"""
+
+loc_format = """ <url>
+  <loc>{0}</loc>
+  <lastmod>{1}</lastmod>{2}
+ </url>
+"""
+
+urlset_footer = "</urlset>"
+
+sitemapindex_header = """<?xml version="1.0" encoding="UTF-8"?>
+<sitemapindex
+    xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
+    xmlns:xhtml="http://www.w3.org/1999/xhtml"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
+                http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
+"""
+
+sitemap_format = """ <sitemap>
+  <loc>{0}</loc>
+  <lastmod>{1}</lastmod>
+ </sitemap>
+"""
+
+alternates_format = """\n <xhtml:link rel="alternate" hreflang="{0}" href="{1}" />"""
+
+
+sitemapindex_footer = "</sitemapindex>"
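+
+# For illustration only: loc_format.format('https://example.com/blog/',
+# '2021-02-03T19:17:00Z', '') renders as (URL and date made up):
+#  <url>
+#   <loc>https://example.com/blog/</loc>
+#   <lastmod>2021-02-03T19:17:00Z</lastmod>
+#  </url>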
+
+
+def get_base_path(base):
+    """Return the path of a base URL if it contains one.
+
+    >>> get_base_path('http://some.site') == '/'
+    True
+    >>> get_base_path('http://some.site/') == '/'
+    True
+    >>> get_base_path('http://some.site/some/sub-path') == '/some/sub-path/'
+    True
+    >>> get_base_path('http://some.site/some/sub-path/') == '/some/sub-path/'
+    True
+    """
+    # first parse the base_url for some path
+    base_parsed = urlparse(base)
+
+    if not base_parsed.path:
+        sub_path = ''
+    else:
+        sub_path = base_parsed.path
+    if sub_path.endswith('/'):
+        return sub_path
+    else:
+        return sub_path + '/'
+
+
+class Sitemap(LateTask):
+    """Generate a sitemap."""
+
+    name = "sitemap"
+
+    def gen_tasks(self):
+        """Generate a sitemap."""
+        kw = {
+            "base_url": self.site.config["BASE_URL"],
+            "site_url": self.site.config["SITE_URL"],
+            "output_folder": self.site.config["OUTPUT_FOLDER"],
+            "strip_indexes": self.site.config["STRIP_INDEXES"],
+            "index_file": self.site.config["INDEX_FILE"],
+            "mapped_extensions": self.site.config.get('MAPPED_EXTENSIONS', ['.atom', '.html', '.htm', '.php', '.xml', '.rss']),
+            "robots_exclusions": self.site.config["ROBOTS_EXCLUSIONS"],
+            "filters": self.site.config["FILTERS"],
+            "translations": self.site.config["TRANSLATIONS"],
+            "tzinfo": self.site.config['__tzinfo__'],
+            "sitemap_plugin_revision": 1,
+        }
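+        # These settings come from the site's conf.py; illustrative values
+        # (not defaults) might look like:
+        #     BASE_URL = "https://example.com/blog/"
+        #     ROBOTS_EXCLUSIONS = ["/archive"]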
+
+        output = kw['output_folder']
+        base_url = kw['base_url']
+        mapped_exts = kw['mapped_extensions']
+
+        output_path = kw['output_folder']
+        sitemapindex_path = os.path.join(output_path, "sitemapindex.xml")
+        sitemap_path = os.path.join(output_path, "sitemap.xml")
+        base_path = get_base_path(kw['base_url'])
+        sitemapindex = {}
+        urlset = {}
+
+        def scan_locs():
+            """Scan site locations."""
+            for root, dirs, files in os.walk(output, followlinks=True):
+                if not dirs and not files:
+                    continue  # Totally empty, not on sitemap
+                path = os.path.relpath(root, output)
+                # ignore the current directory.
+                if path == '.':
+                    path = syspath = ''
+                else:
+                    syspath = path + os.sep
+                    path = path.replace(os.sep, '/') + '/'
+                lastmod = self.get_lastmod(root)
+                loc = urljoin(base_url, base_path + path)
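+                # e.g. an output subdirectory 'blog/' would map to the
+                # hypothetical loc 'https://example.com/blog/' when BASE_URL
+                # is 'https://example.com/'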
+                if kw['index_file'] in files and kw['strip_indexes']:  # ignore folders when not stripping urls
+                    post = self.site.post_per_file.get(syspath + kw['index_file'])
+                    if post and (post.is_draft or post.is_private or post.publish_later):
+                        continue
+                    alternates = []
+                    if post:
+                        for lang in post.translated_to:
+                            alt_url = post.permalink(lang=lang, absolute=True)
+                            if encodelink(loc) == alt_url:
+                                continue
+                            alternates.append(alternates_format.format(lang, alt_url))
+                    urlset[loc] = loc_format.format(encodelink(loc), lastmod, ''.join(alternates))
+                for fname in files:
+                    if kw['strip_indexes'] and fname == kw['index_file']:
+                        continue  # We already mapped the folder
+                    if os.path.splitext(fname)[-1] in mapped_exts:
+                        real_path = os.path.join(root, fname)
+                        path = syspath = os.path.relpath(real_path, output)
+                        if path.endswith(kw['index_file']) and kw['strip_indexes']:
+                            # ignore index files when stripping urls
+                            continue
+                        if not robot_fetch(path):
+                            continue
+
+                        # read in binary mode to make ancient files work
+                        with open(real_path, 'rb') as fh:
+                            filehead = fh.read(1024)
+
+                        if path.endswith('.html') or path.endswith('.htm') or path.endswith('.php'):
+                            # Ignores "html" files without doctype
+                            if b'<!doctype html' not in filehead.lower():
+                                continue
+
+                            # Ignores "html" files with noindex robot directives
+                            robots_directives = [b'<meta content=noindex name=robots',
+                                                 b'<meta content=none name=robots',
+                                                 b'<meta name=robots content=noindex',
+                                                 b'<meta name=robots content=none']
+                            lowquothead = filehead.lower().decode('utf-8', 'ignore').replace('"', '').encode('utf-8')
+                            if any([robot_directive in lowquothead for robot_directive in robots_directives]):
+                                continue
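+                            # (lowercasing and stripping quotes means e.g. the tag
+                            # '<meta name="ROBOTS" content="noindex">' still matches
+                            # the directive list above)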
+
+                        # put Atom and RSS in sitemapindex[] instead of in urlset[],
+                        # sitemap_path is included after it is generated
+                        if path.endswith('.xml') or path.endswith('.atom') or path.endswith('.rss'):
+                            known_elm_roots = (b'<feed', b'<rss', b'<urlset')
+                            if any([elm_root in filehead.lower() for elm_root in known_elm_roots]) and path != sitemap_path:
+                                path = path.replace(os.sep, '/')
+                                lastmod = self.get_lastmod(real_path)
+                                loc = urljoin(base_url, base_path + path)
+                                sitemapindex[loc] = sitemap_format.format(encodelink(loc), lastmod)
+                                continue
+                            else:
+                                continue  # ignores all XML files except those presumed to be RSS
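+                        # (so a file whose head contains e.g. '<rss version="2.0">'
+                        # is treated as a feed and listed as a sub-sitemap, while
+                        # any other XML file is skipped entirely)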
+                        post = self.site.post_per_file.get(syspath)
+                        if post and (post.is_draft or post.is_private or post.publish_later):
+                            continue
+                        path = path.replace(os.sep, '/')
+                        lastmod = self.get_lastmod(real_path)
+                        loc = urljoin(base_url, base_path + path)
+                        alternates = []
+                        if post:
+                            for lang in post.translated_to:
+                                alt_url = post.permalink(lang=lang, absolute=True)
+                                if encodelink(loc) == alt_url:
+                                    continue
+                                alternates.append(alternates_format.format(lang, alt_url))
+                        # join with '' (not '\n'): alternates_format already starts
+                        # with a newline, matching the folder branch above
+                        urlset[loc] = loc_format.format(encodelink(loc), lastmod, ''.join(alternates))
+
+        def robot_fetch(path):
+            """Check if robots can fetch a file."""
+            for rule in kw["robots_exclusions"]:
+                robot = robotparser.RobotFileParser()
+                robot.parse(["User-Agent: *", "Disallow: {0}".format(rule)])
+                if not robot.can_fetch("*", '/' + path):
+                    return False  # not robot food
+            return True
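+        # For example (hypothetical config): with ROBOTS_EXCLUSIONS = ['/archive'],
+        # robot_fetch('archive/index.html') parses "Disallow: /archive" and
+        # returns False, so nothing under /archive reaches the sitemap.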
+
+        def write_sitemap():
+            """Write sitemap to file."""
+            # Have to rescan, because files may have been added between
+            # task dep scanning and task execution
+            with io.open(sitemap_path, 'w+', encoding='utf8') as outf:
+                outf.write(urlset_header)
+                for k in sorted(urlset.keys()):
+                    outf.write(urlset[k])
+                outf.write(urlset_footer)
+            sitemap_url = urljoin(base_url, base_path + "sitemap.xml")
+            sitemapindex[sitemap_url] = sitemap_format.format(sitemap_url, self.get_lastmod(sitemap_path))
+
+        def write_sitemapindex():
+            """Write sitemap index."""
+            with io.open(sitemapindex_path, 'w+', encoding='utf8') as outf:
+                outf.write(sitemapindex_header)
+                for k in sorted(sitemapindex.keys()):
+                    outf.write(sitemapindex[k])
+                outf.write(sitemapindex_footer)
+
+        def scan_locs_task():
+            """Yield a task to calculate the dependencies of the sitemap.
+
+            Other tasks can depend on this output, instead of having
+            to scan locations.
+            """
+            scan_locs()
+
+            # Generate a list of file dependencies for the actual generation
+            # task, so rebuilds are triggered. (Issue #1032)
+            output = kw["output_folder"]
+            file_dep = []
+
+            for i in urlset.keys():
+                p = os.path.join(output, urlparse(i).path.replace(base_path, '', 1))
+                if not p.endswith('sitemap.xml') and not os.path.isdir(p):
+                    file_dep.append(p)
+                if os.path.isdir(p) and os.path.exists(os.path.join(p, 'index.html')):
+                    file_dep.append(p + 'index.html')
+
+            for i in sitemapindex.keys():
+                p = os.path.join(output, urlparse(i).path.replace(base_path, '', 1))
+                if not p.endswith('sitemap.xml') and not os.path.isdir(p):
+                    file_dep.append(p)
+                if os.path.isdir(p) and os.path.exists(os.path.join(p, 'index.html')):
+                    file_dep.append(p + 'index.html')
+
+            return {'file_dep': file_dep}
+
+        yield {
+            "basename": "_scan_locs",
+            "name": "sitemap",
+            "actions": [(scan_locs_task)]
+        }
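+        # (doit resolves the "calc_dep": ["_scan_locs:sitemap"] entry below by
+        # running the task above and merging the returned 'file_dep' into the
+        # sitemap task's dependencies)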
+
+        yield self.group_task()
+        yield apply_filters({
+            "basename": "sitemap",
+            "name": sitemap_path,
+            "targets": [sitemap_path],
+            "actions": [(write_sitemap,)],
+            "uptodate": [config_changed(kw, 'nikola.plugins.task.sitemap:write')],
+            "clean": True,
+            "task_dep": ["render_site"],
+            "calc_dep": ["_scan_locs:sitemap"],
+        }, kw['filters'])
+        yield apply_filters({
+            "basename": "sitemap",
+            "name": sitemapindex_path,
+            "targets": [sitemapindex_path],
+            "actions": [(write_sitemapindex,)],
+            "uptodate": [config_changed(kw, 'nikola.plugins.task.sitemap:write_index')],
+            "clean": True,
+            "file_dep": [sitemap_path]
+        }, kw['filters'])
+
+    def get_lastmod(self, p):
+        """Get last modification date."""
+        if self.site.invariant:
+            return '2038-01-01'
+        else:
+            # RFC 3339 (web ISO 8601 profile) represented in UTC with Zulu
+            # zone designator as recommended for sitemaps. Second and
+            # microsecond precision is stripped for compatibility.
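+            # (yields e.g. '2021-02-04T00:17:00Z'; illustrative value)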
+            lastmod = datetime.datetime.utcfromtimestamp(os.stat(p).st_mtime).replace(tzinfo=dateutil.tz.gettz('UTC'), second=0, microsecond=0).isoformat().replace('+00:00', 'Z')
+            return lastmod
+
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()