summary | refs | log | tree | commit | diff | stats
path: root/nikola/plugins/task/sitemap/__init__.py
diff options
context:
space:
mode:
author: Agustin Henze <tin@sluc.org.ar> 2013-11-20 16:58:50 -0300
committer: Agustin Henze <tin@sluc.org.ar> 2013-11-20 16:58:50 -0300
commit: ca94afc07df55cb7fc6fe3b4f3011877b7881195 (patch)
tree: d81e1f275aa77545f33740723f307a83dde2e0b4 /nikola/plugins/task/sitemap/__init__.py
parent: f794eee787e9cde54e6b8f53e45d69c9ddc9936a (diff)
Imported Upstream version 6.2.1 (upstream/6.2.1)
Diffstat (limited to 'nikola/plugins/task/sitemap/__init__.py')
-rw-r--r-- nikola/plugins/task/sitemap/__init__.py 172
1 files changed, 172 insertions, 0 deletions
diff --git a/nikola/plugins/task/sitemap/__init__.py b/nikola/plugins/task/sitemap/__init__.py
new file mode 100644
index 0000000..f34bc0a
--- /dev/null
+++ b/nikola/plugins/task/sitemap/__init__.py
@@ -0,0 +1,172 @@
+# -*- coding: utf-8 -*-
+
+# Copyright © 2012-2013 Roberto Alsina and others.
+
+# Permission is hereby granted, free of charge, to any
+# person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the
+# Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the
+# Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice
+# shall be included in all copies or substantial portions of
+# the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
+# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
+# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+from __future__ import print_function, absolute_import, unicode_literals
+import codecs
+import datetime
+import os
+try:
+ from urlparse import urljoin, urlparse
+except ImportError:
+ from urllib.parse import urljoin, urlparse # NOQA
+
+from nikola.plugin_categories import LateTask
+from nikola.utils import config_changed
+
+
# Opening of the sitemap document: the XML declaration followed by the
# <urlset> root element bound to the sitemaps.org 0.9 schema. The matching
# closing tag is not part of this constant.
header = """<?xml version="1.0" encoding="UTF-8"?>
<urlset
 xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
"""

# Template for a single <url> entry, filled in with str.format():
# {0} is the location (URL) and {1} is the last-modification date.
url_format = """ <url>
 <loc>{0}</loc>
 <lastmod>{1}</lastmod>
 </url>
"""


def get_lastmod(p):
    """Return the last-modification date of path *p* as ``YYYY-MM-DD``.

    The date is taken from ``os.stat(p).st_mtime`` and expressed in the
    local timezone, which is the same value the date part of
    ``datetime.datetime.fromtimestamp(...).isoformat()`` would give.

    Raises ``OSError`` if *p* cannot be stat'ed.
    """
    return datetime.date.fromtimestamp(os.stat(p).st_mtime).isoformat()
+
+
def get_base_path(base):
    """Return the path component of the URL *base*, always slash-terminated.

    A URL without any path yields ``'/'``; a path that already ends in a
    slash is returned unchanged, otherwise one is appended.

    >>> get_base_path('http://some.site') == '/'
    True
    >>> get_base_path('http://some.site/') == '/'
    True
    >>> get_base_path('http://some.site/some/sub-path') == '/some/sub-path/'
    True
    >>> get_base_path('http://some.site/some/sub-path/') == '/some/sub-path/'
    True
    """
    # An absent path is normalised to the empty string before the check.
    url_path = urlparse(base).path or ''
    if not url_path.endswith('/'):
        url_path = url_path + '/'
    return url_path
+
+
class Sitemap(LateTask):
    """Generate google sitemap."""

    name = "sitemap"

    def gen_tasks(self):
        """Generate Google sitemap.

        Walks the configured OUTPUT_FOLDER, collects one ``<url>`` entry per
        mapped location and yields a task that writes them, sorted by URL,
        to ``sitemap.xml`` inside that folder.

        Configuration keys read from ``self.site.config``: BASE_URL,
        SITE_URL, OUTPUT_FOLDER, STRIP_INDEXES, INDEX_FILE,
        SITEMAP_INCLUDE_FILELESS_DIRS and (optional) MAPPED_EXTENSIONS.

        Yields the group task returned by ``self.group_task()`` followed by
        the task dictionary for the sitemap file. The task's action returns
        ``{'locations': [...]}`` with every URL placed in the sitemap.
        """
        kw = {
            "base_url": self.site.config["BASE_URL"],
            "site_url": self.site.config["SITE_URL"],
            "output_folder": self.site.config["OUTPUT_FOLDER"],
            "strip_indexes": self.site.config["STRIP_INDEXES"],
            "index_file": self.site.config["INDEX_FILE"],
            "sitemap_include_fileless_dirs": self.site.config["SITEMAP_INCLUDE_FILELESS_DIRS"],
            "mapped_extensions": self.site.config.get('MAPPED_EXTENSIONS', ['.html', '.htm', '.xml'])
        }
        output = kw['output_folder']
        base_url = kw['base_url']
        mapped_exts = kw['mapped_extensions']
        sitemap_path = os.path.join(output, "sitemap.xml")
        base_path = get_base_path(base_url)
        # Maps each location URL to its rendered <url> element.
        locs = {}

        def read_head(filename, size):
            """Return the first *size* characters of a UTF-8 text file."""
            # Context manager so the handle is closed right away instead of
            # being left for the garbage collector.
            with codecs.open(filename, 'r', 'utf8') as inf:
                return inf.read(size)

        def scan_locs():
            """Fill ``locs`` from the current contents of the output folder."""
            for root, dirs, files in os.walk(output):
                if not dirs and not files and not kw['sitemap_include_fileless_dirs']:
                    continue  # Totally empty, not on sitemap
                rel_dir = os.path.relpath(root, output)
                # The output folder itself maps to the bare base path; any
                # other directory gets a slash-terminated relative URL path.
                if rel_dir == os.curdir:
                    path = ''
                else:
                    path = rel_dir.replace(os.sep, '/') + '/'
                lastmod = get_lastmod(root)
                loc = urljoin(base_url, base_path + path)
                # Folders are only listed when index files are stripped from
                # URLs and the folder actually has an index file.
                if kw['index_file'] in files and kw['strip_indexes']:
                    locs[loc] = url_format.format(loc, lastmod)
                for fname in files:
                    # Exact-name match only: a file whose name merely ends
                    # with the index file name (e.g. "reindex.html") is a
                    # regular page and must stay on the sitemap.
                    if kw['strip_indexes'] and fname == kw['index_file']:
                        continue  # We already mapped the folder
                    if os.path.splitext(fname)[-1] not in mapped_exts:
                        continue
                    real_path = os.path.join(root, fname)
                    path = os.path.relpath(real_path, output)
                    if path.endswith(('.html', '.htm')):
                        if u'<!doctype html' not in read_head(real_path, 1024).lower():
                            # ignores "html" files without doctype
                            # alexa-verify, google-site-verification, etc.
                            continue
                    if path.endswith('.xml'):
                        if u'<rss' not in read_head(real_path, 512):
                            # ignores all XML files except those presumed to be RSS
                            continue
                    # post_per_file is looked up with the OS-native relative
                    # path, before separators are converted for the URL.
                    post = self.site.post_per_file.get(path)
                    if post and (post.is_draft or post.is_retired or post.publish_later):
                        continue
                    path = path.replace(os.sep, '/')
                    lastmod = get_lastmod(real_path)
                    loc = urljoin(base_url, base_path + path)
                    locs[loc] = url_format.format(loc, lastmod)

        def write_sitemap():
            """Rescan the output folder and write sitemap.xml."""
            # Have to rescan, because files may have been added between
            # task dep scanning and task execution
            scan_locs()
            with codecs.open(sitemap_path, 'wb+', 'utf8') as outf:
                outf.write(header)
                for k in sorted(locs):
                    outf.write(locs[k])
                outf.write("</urlset>")
            # Other tasks can depend on this output, instead of having
            # to scan locations.
            return {'locations': list(locs)}

        # Initial scan so that the locations take part in the uptodate value
        # computed by config_changed below.
        scan_locs()
        yield self.group_task()
        task = {
            "basename": "sitemap",
            "name": sitemap_path,
            "targets": [sitemap_path],
            "actions": [(write_sitemap,)],
            "uptodate": [config_changed({1: kw, 2: locs})],
            "clean": True,
            "task_dep": ["render_site"],
        }
        yield task
+
if __name__ == '__main__':
    # Executed directly: run the doctests embedded in this module.
    from doctest import testmod
    testmod()