# -*- coding: utf-8 -*-
# Copyright © 2012-2015 Roberto Alsina and others.
# Permission is hereby granted, free of charge, to any
# person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the
# Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the
# Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice
# shall be included in all copies or substantial portions of
# the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
from __future__ import print_function, absolute_import, unicode_literals
import io
import datetime
import os
try:
from urlparse import urljoin, urlparse
import robotparser as robotparser
except ImportError:
from urllib.parse import urljoin, urlparse # NOQA
import urllib.robotparser as robotparser # NOQA
from nikola.plugin_categories import LateTask
from nikola.utils import config_changed, apply_filters
# XML fragment templates for the generated sitemap.xml / sitemapindex.xml,
# filled in via str.format() inside scan_locs().
#
# NOTE(review): the XML element markup in every one of these template strings
# appears to have been stripped (e.g. no <urlset>, <url>, <loc>, <lastmod>,
# or <sitemap> tags remain — only the {0}/{1}/{2} placeholders and
# whitespace).  As written, the plugin would emit tag-less, invalid sitemap
# files.  Compare against the sitemaps.org protocol / the upstream version of
# this plugin and restore the element markup before relying on the output.

# Opening of the <urlset> document (should carry the sitemap XML namespace).
urlset_header = """
"""
# One <url> entry: {0} = loc URL, {1} = lastmod date, {2} = joined alternates.
loc_format = """
{0}
{1}{2}
"""
# Closing tag of the <urlset> document — currently empty, see NOTE above.
urlset_footer = ""
# Opening of the <sitemapindex> document.
sitemapindex_header = """
"""
# One <sitemap> entry: {0} = sitemap URL, {1} = lastmod date.
sitemap_format = """
{0}
{1}
"""
# One xhtml:link alternate-language entry: formatted with (lang, alt_url)
# in scan_locs() — presumably an <xhtml:link rel="alternate"> element.
alternates_format = """\n """
# Closing tag of the <sitemapindex> document — currently empty, see NOTE above.
sitemapindex_footer = ""
def get_base_path(base):
    """Return the path component of a base URL, normalized to end in '/'.

    >>> get_base_path('http://some.site') == '/'
    True
    >>> get_base_path('http://some.site/') == '/'
    True
    >>> get_base_path('http://some.site/some/sub-path') == '/some/sub-path/'
    True
    >>> get_base_path('http://some.site/some/sub-path/') == '/some/sub-path/'
    True
    """
    # urlparse() yields '' for a URL with no path component; appending the
    # trailing '/' in that case produces the root path, which matches the
    # explicit empty-path branch of the original implementation.
    path = urlparse(base).path
    return path if path.endswith('/') else path + '/'
class Sitemap(LateTask):
"""Generate a sitemap."""
name = "sitemap"
def gen_tasks(self):
"""Generate a sitemap."""
kw = {
"base_url": self.site.config["BASE_URL"],
"site_url": self.site.config["SITE_URL"],
"output_folder": self.site.config["OUTPUT_FOLDER"],
"strip_indexes": self.site.config["STRIP_INDEXES"],
"index_file": self.site.config["INDEX_FILE"],
"sitemap_include_fileless_dirs": self.site.config["SITEMAP_INCLUDE_FILELESS_DIRS"],
"mapped_extensions": self.site.config.get('MAPPED_EXTENSIONS', ['.atom', '.html', '.htm', '.xml', '.rss']),
"robots_exclusions": self.site.config["ROBOTS_EXCLUSIONS"],
"filters": self.site.config["FILTERS"],
"translations": self.site.config["TRANSLATIONS"],
}
output = kw['output_folder']
base_url = kw['base_url']
mapped_exts = kw['mapped_extensions']
output_path = kw['output_folder']
sitemapindex_path = os.path.join(output_path, "sitemapindex.xml")
sitemap_path = os.path.join(output_path, "sitemap.xml")
base_path = get_base_path(kw['base_url'])
sitemapindex = {}
urlset = {}
def scan_locs():
for root, dirs, files in os.walk(output, followlinks=True):
if not dirs and not files and not kw['sitemap_include_fileless_dirs']:
continue # Totally empty, not on sitemap
path = os.path.relpath(root, output)
# ignore the current directory.
path = (path.replace(os.sep, '/') + '/').replace('./', '')
lastmod = self.get_lastmod(root)
loc = urljoin(base_url, base_path + path)
if kw['index_file'] in files and kw['strip_indexes']: # ignore folders when not stripping urls
post = self.site.post_per_file.get(path + kw['index_file'])
if post and (post.is_draft or post.is_private or post.publish_later):
continue
alternates = []
if post:
for lang in kw['translations']:
alt_url = post.permalink(lang=lang, absolute=True)
if loc == alt_url:
continue
alternates.append(alternates_format.format(lang, alt_url))
urlset[loc] = loc_format.format(loc, lastmod, ''.join(alternates))
for fname in files:
if kw['strip_indexes'] and fname == kw['index_file']:
continue # We already mapped the folder
if os.path.splitext(fname)[-1] in mapped_exts:
real_path = os.path.join(root, fname)
path = os.path.relpath(real_path, output)
if path.endswith(kw['index_file']) and kw['strip_indexes']:
# ignore index files when stripping urls
continue
if not robot_fetch(path):
continue
# read in binary mode to make ancient files work
fh = open(real_path, 'rb')
filehead = fh.read(1024)
fh.close()
if path.endswith('.html') or path.endswith('.htm'):
""" ignores "html" files without doctype """
if b'