# -*- coding: utf-8 -*-
# Copyright © 2012-2015 Roberto Alsina and others.
# Permission is hereby granted, free of charge, to any
# person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the
# Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the
# Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice
# shall be included in all copies or substantial portions of
# the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
from __future__ import print_function, absolute_import, unicode_literals
import io
import datetime
import os
try:
from urlparse import urljoin, urlparse
import robotparser as robotparser
except ImportError:
from urllib.parse import urljoin, urlparse # NOQA
import urllib.robotparser as robotparser # NOQA
from nikola.plugin_categories import LateTask
from nikola.utils import config_changed, apply_filters
# XML fragment templates for the generated sitemap.xml / sitemapindex.xml,
# filled in via str.format() inside scan_locs().
#
# NOTE(review): the XML element markup in every one of these template strings
# appears to have been stripped (e.g. no <urlset>, <url>, <loc>, <lastmod>,
# or <sitemap> tags remain — only the {0}/{1}/{2} placeholders and
# whitespace).  As written, the plugin would emit tag-less, invalid sitemap
# files.  Compare against the sitemaps.org protocol / the upstream version of
# this plugin and restore the element markup before relying on the output.

# Opening of the <urlset> document (should carry the sitemap XML namespace).
urlset_header = """
"""
# One <url> entry: {0} = loc URL, {1} = lastmod date, {2} = joined alternates.
loc_format = """
{0}
{1}{2}
"""
# Closing tag of the <urlset> document — currently empty, see NOTE above.
urlset_footer = ""
# Opening of the <sitemapindex> document.
sitemapindex_header = """
"""
# One <sitemap> entry: {0} = sitemap URL, {1} = lastmod date.
sitemap_format = """
{0}
{1}
"""
# One xhtml:link alternate-language entry: formatted with (lang, alt_url)
# in scan_locs() — presumably an <xhtml:link rel="alternate"> element.
alternates_format = """\n """
# Closing tag of the <sitemapindex> document — currently empty, see NOTE above.
sitemapindex_footer = ""
def get_base_path(base):
    """Return the path component of a base URL, normalized to end in '/'.

    >>> get_base_path('http://some.site') == '/'
    True
    >>> get_base_path('http://some.site/') == '/'
    True
    >>> get_base_path('http://some.site/some/sub-path') == '/some/sub-path/'
    True
    >>> get_base_path('http://some.site/some/sub-path/') == '/some/sub-path/'
    True
    """
    # urlparse() yields '' for a URL with no path component; appending the
    # trailing '/' in that case produces the root path, which matches the
    # explicit empty-path branch of the original implementation.
    path = urlparse(base).path
    return path if path.endswith('/') else path + '/'
class Sitemap(LateTask):
"""Generate a sitemap."""
name = "sitemap"
def gen_tasks(self):
"""Generate a sitemap."""
kw = {
"base_url": self.site.config["BASE_URL"],
"site_url": self.site.config["SITE_URL"],
"output_folder": self.site.config["OUTPUT_FOLDER"],
"strip_indexes": self.site.config["STRIP_INDEXES"],
"index_file": self.site.config["INDEX_FILE"],
"sitemap_include_fileless_dirs": self.site.config["SITEMAP_INCLUDE_FILELESS_DIRS"],
"mapped_extensions": self.site.config.get('MAPPED_EXTENSIONS', ['.atom', '.html', '.htm', '.xml', '.rss']),
"robots_exclusions": self.site.config["ROBOTS_EXCLUSIONS"],
"filters": self.site.config["FILTERS"],
"translations": self.site.config["TRANSLATIONS"],
}
output = kw['output_folder']
base_url = kw['base_url']
mapped_exts = kw['mapped_extensions']
output_path = kw['output_folder']
sitemapindex_path = os.path.join(output_path, "sitemapindex.xml")
sitemap_path = os.path.join(output_path, "sitemap.xml")
base_path = get_base_path(kw['base_url'])
sitemapindex = {}
urlset = {}
def scan_locs():
for root, dirs, files in os.walk(output, followlinks=True):
if not dirs and not files and not kw['sitemap_include_fileless_dirs']:
continue # Totally empty, not on sitemap
path = os.path.relpath(root, output)
# ignore the current directory.
path = (path.replace(os.sep, '/') + '/').replace('./', '')
lastmod = self.get_lastmod(root)
loc = urljoin(base_url, base_path + path)
if kw['index_file'] in files and kw['strip_indexes']: # ignore folders when not stripping urls
post = self.site.post_per_file.get(path + kw['index_file'])
if post and (post.is_draft or post.is_private or post.publish_later):
continue
alternates = []
if post:
for lang in kw['translations']:
alt_url = post.permalink(lang=lang, absolute=True)
if loc == alt_url:
continue
alternates.append(alternates_format.format(lang, alt_url))
urlset[loc] = loc_format.format(loc, lastmod, ''.join(alternates))
for fname in files:
if kw['strip_indexes'] and fname == kw['index_file']:
continue # We already mapped the folder
if os.path.splitext(fname)[-1] in mapped_exts:
real_path = os.path.join(root, fname)
path = os.path.relpath(real_path, output)
if path.endswith(kw['index_file']) and kw['strip_indexes']:
# ignore index files when stripping urls
continue
if not robot_fetch(path):
continue
# read in binary mode to make ancient files work
fh = open(real_path, 'rb')
filehead = fh.read(1024)
fh.close()
if path.endswith('.html') or path.endswith('.htm'):
""" ignores "html" files without doctype """
if b'