From 3a0d66f07b112b6d2bdc2b57bbf717a89a351ce6 Mon Sep 17 00:00:00 2001 From: Unit 193 Date: Wed, 3 Feb 2021 19:17:00 -0500 Subject: New upstream version 8.1.2. --- nikola/plugins/task/sitemap.py | 318 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 318 insertions(+) create mode 100644 nikola/plugins/task/sitemap.py (limited to 'nikola/plugins/task/sitemap.py') diff --git a/nikola/plugins/task/sitemap.py b/nikola/plugins/task/sitemap.py new file mode 100644 index 0000000..8bbaa63 --- /dev/null +++ b/nikola/plugins/task/sitemap.py @@ -0,0 +1,318 @@ +# -*- coding: utf-8 -*- + +# Copyright © 2012-2020 Roberto Alsina and others. + +# Permission is hereby granted, free of charge, to any +# person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the +# Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the +# Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice +# shall be included in all copies or substantial portions of +# the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS +# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
def get_base_path(base):
    """Extract the path component of a base URL, always slash-terminated.

    A URL that carries no path at all yields the root path ``'/'``.

    >>> get_base_path('http://some.site') == '/'
    True
    >>> get_base_path('http://some.site/') == '/'
    True
    >>> get_base_path('http://some.site/some/sub-path') == '/some/sub-path/'
    True
    >>> get_base_path('http://some.site/some/sub-path/') == '/some/sub-path/'
    True
    """
    url_path = urlparse(base).path
    # An empty path needs no special case: appending '/' to '' gives the root,
    # and a path that already ends with a slash is returned untouched.
    return url_path if url_path.endswith('/') else url_path + '/'
kw['output_folder'] + sitemapindex_path = os.path.join(output_path, "sitemapindex.xml") + sitemap_path = os.path.join(output_path, "sitemap.xml") + base_path = get_base_path(kw['base_url']) + sitemapindex = {} + urlset = {} + + def scan_locs(): + """Scan site locations.""" + for root, dirs, files in os.walk(output, followlinks=True): + if not dirs and not files: + continue # Totally empty, not on sitemap + path = os.path.relpath(root, output) + # ignore the current directory. + if path == '.': + path = syspath = '' + else: + syspath = path + os.sep + path = path.replace(os.sep, '/') + '/' + lastmod = self.get_lastmod(root) + loc = urljoin(base_url, base_path + path) + if kw['index_file'] in files and kw['strip_indexes']: # ignore folders when not stripping urls + post = self.site.post_per_file.get(syspath + kw['index_file']) + if post and (post.is_draft or post.is_private or post.publish_later): + continue + alternates = [] + if post: + for lang in post.translated_to: + alt_url = post.permalink(lang=lang, absolute=True) + if encodelink(loc) == alt_url: + continue + alternates.append(alternates_format.format(lang, alt_url)) + urlset[loc] = loc_format.format(encodelink(loc), lastmod, ''.join(alternates)) + for fname in files: + if kw['strip_indexes'] and fname == kw['index_file']: + continue # We already mapped the folder + if os.path.splitext(fname)[-1] in mapped_exts: + real_path = os.path.join(root, fname) + path = syspath = os.path.relpath(real_path, output) + if path.endswith(kw['index_file']) and kw['strip_indexes']: + # ignore index files when stripping urls + continue + if not robot_fetch(path): + continue + + # read in binary mode to make ancient files work + with open(real_path, 'rb') as fh: + filehead = fh.read(1024) + + if path.endswith('.html') or path.endswith('.htm') or path.endswith('.php'): + # Ignores "html" files without doctype + if b'