diff options
Diffstat (limited to 'nikola/plugins/task/sitemap/__init__.py')
| -rw-r--r-- | nikola/plugins/task/sitemap/__init__.py | 172 |
1 files changed, 172 insertions, 0 deletions
diff --git a/nikola/plugins/task/sitemap/__init__.py b/nikola/plugins/task/sitemap/__init__.py new file mode 100644 index 0000000..f34bc0a --- /dev/null +++ b/nikola/plugins/task/sitemap/__init__.py @@ -0,0 +1,172 @@ +# -*- coding: utf-8 -*- + +# Copyright © 2012-2013 Roberto Alsina and others. + +# Permission is hereby granted, free of charge, to any +# person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the +# Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the +# Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice +# shall be included in all copies or substantial portions of +# the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS +# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import print_function, absolute_import, unicode_literals +import codecs +import datetime +import os +try: + from urlparse import urljoin, urlparse +except ImportError: + from urllib.parse import urljoin, urlparse # NOQA + +from nikola.plugin_categories import LateTask +from nikola.utils import config_changed + + +header = """<?xml version="1.0" encoding="UTF-8"?> +<urlset + xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 + http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"> +""" + +url_format = """ <url> + <loc>{0}</loc> + <lastmod>{1}</lastmod> + </url> +""" + +get_lastmod = lambda p: datetime.datetime.fromtimestamp(os.stat(p).st_mtime).isoformat().split('T')[0] + + +def get_base_path(base): + """returns the path of a base URL if it contains one. + + >>> get_base_path('http://some.site') == '/' + True + >>> get_base_path('http://some.site/') == '/' + True + >>> get_base_path('http://some.site/some/sub-path') == '/some/sub-path/' + True + >>> get_base_path('http://some.site/some/sub-path/') == '/some/sub-path/' + True + """ + # first parse the base_url for some path + base_parsed = urlparse(base) + + if not base_parsed.path: + sub_path = '' + else: + sub_path = base_parsed.path + if sub_path.endswith('/'): + return sub_path + else: + return sub_path + '/' + + +class Sitemap(LateTask): + """Generate google sitemap.""" + + name = "sitemap" + + def gen_tasks(self): + """Generate Google sitemap.""" + kw = { + "base_url": self.site.config["BASE_URL"], + "site_url": self.site.config["SITE_URL"], + "output_folder": self.site.config["OUTPUT_FOLDER"], + "strip_indexes": self.site.config["STRIP_INDEXES"], + "index_file": self.site.config["INDEX_FILE"], + "sitemap_include_fileless_dirs": self.site.config["SITEMAP_INCLUDE_FILELESS_DIRS"], + "mapped_extensions": self.site.config.get('MAPPED_EXTENSIONS', ['.html', '.htm', '.xml']) + } + output_path = kw['output_folder'] + sitemap_path = os.path.join(output_path, "sitemap.xml") + base_path = get_base_path(kw['base_url']) + locs = {} + + output = kw['output_folder'] + base_url = kw['base_url'] + mapped_exts = kw['mapped_extensions'] + + def scan_locs(): + for root, dirs, files in os.walk(output): + if not dirs and not files and not kw['sitemap_include_fileless_dirs']: + continue # Totally empty, not on sitemap + path = os.path.relpath(root, output) + # ignore the current directory. + path = (path.replace(os.sep, '/') + '/').replace('./', '') + lastmod = get_lastmod(root) + loc = urljoin(base_url, base_path + path) + if kw['index_file'] in files and kw['strip_indexes']: # ignore folders when not stripping urls + locs[loc] = url_format.format(loc, lastmod) + for fname in files: + if kw['strip_indexes'] and fname == kw['index_file']: + continue # We already mapped the folder + if os.path.splitext(fname)[-1] in mapped_exts: + real_path = os.path.join(root, fname) + path = os.path.relpath(real_path, output) + if path.endswith(kw['index_file']) and kw['strip_indexes']: + # ignore index files when stripping urls + continue + if path.endswith('.html') or path.endswith('.htm'): + if not u'<!doctype html' in codecs.open(real_path, 'r', 'utf8').read(1024).lower(): + # ignores "html" files without doctype + # alexa-verify, google-site-verification, etc. + continue + if path.endswith('.xml'): + if not u'<rss' in codecs.open(real_path, 'r', 'utf8').read(512): + # ignores all XML files except those presumed to be RSS + continue + post = self.site.post_per_file.get(path) + if post and (post.is_draft or post.is_retired or post.publish_later): + continue + path = path.replace(os.sep, '/') + lastmod = get_lastmod(real_path) + loc = urljoin(base_url, base_path + path) + locs[loc] = url_format.format(loc, lastmod) + + def write_sitemap(): + # Have to rescan, because files may have been added between + # task dep scanning and task execution + scan_locs() + with codecs.open(sitemap_path, 'wb+', 'utf8') as outf: + outf.write(header) + for k in sorted(locs.keys()): + outf.write(locs[k]) + outf.write("</urlset>") + # Other tasks can depend on this output, instead of having + # to scan locations. + return {'locations': list(locs.keys())} + + scan_locs() + yield self.group_task() + task = { + "basename": "sitemap", + "name": sitemap_path, + "targets": [sitemap_path], + "actions": [(write_sitemap,)], + "uptodate": [config_changed({1: kw, 2: locs})], + "clean": True, + "task_dep": ["render_site"], + } + yield task + +if __name__ == '__main__': + import doctest + doctest.testmod() |
