path: root/nikola/plugins/task/sitemap/__init__.py
Diffstat (limited to 'nikola/plugins/task/sitemap/__init__.py')
-rw-r--r--  nikola/plugins/task/sitemap/__init__.py  44
1 file changed, 32 insertions(+), 12 deletions(-)
diff --git a/nikola/plugins/task/sitemap/__init__.py b/nikola/plugins/task/sitemap/__init__.py
index 92d557d..fd781d6 100644
--- a/nikola/plugins/task/sitemap/__init__.py
+++ b/nikola/plugins/task/sitemap/__init__.py
@@ -24,9 +24,12 @@
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+"""Generate a sitemap."""
+
from __future__ import print_function, absolute_import, unicode_literals
import io
import datetime
+import dateutil.tz
import os
try:
from urlparse import urljoin, urlparse
@@ -42,6 +45,7 @@ from nikola.utils import config_changed, apply_filters
urlset_header = """<?xml version="1.0" encoding="UTF-8"?>
<urlset
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
+ xmlns:xhtml="http://www.w3.org/1999/xhtml"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
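Note: the xhtml namespace added above is what lets each <url> entry carry alternate-language links (the "alternates" joined into every entry later in this diff). A hypothetical resulting entry, with invented URLs, would look like:

<url>
 <loc>https://example.com/en/posts/hello/</loc>
 <lastmod>2014-07-08T00:00:00Z</lastmod>
 <xhtml:link rel="alternate" hreflang="de" href="https://example.com/de/posts/hello/"/>
</url>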
@@ -58,6 +62,7 @@ urlset_footer = "</urlset>"
sitemapindex_header = """<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex
xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
+ xmlns:xhtml="http://www.w3.org/1999/xhtml"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
@@ -76,7 +81,7 @@ sitemapindex_footer = "</sitemapindex>"
def get_base_path(base):
- """returns the path of a base URL if it contains one.
+ """Return the path of a base URL if it contains one.
>>> get_base_path('http://some.site') == '/'
True
@@ -101,6 +106,7 @@ def get_base_path(base):
class Sitemap(LateTask):
+
"""Generate a sitemap."""
name = "sitemap"
@@ -114,10 +120,12 @@ class Sitemap(LateTask):
"strip_indexes": self.site.config["STRIP_INDEXES"],
"index_file": self.site.config["INDEX_FILE"],
"sitemap_include_fileless_dirs": self.site.config["SITEMAP_INCLUDE_FILELESS_DIRS"],
- "mapped_extensions": self.site.config.get('MAPPED_EXTENSIONS', ['.atom', '.html', '.htm', '.xml', '.rss']),
+ "mapped_extensions": self.site.config.get('MAPPED_EXTENSIONS', ['.atom', '.html', '.htm', '.php', '.xml', '.rss']),
"robots_exclusions": self.site.config["ROBOTS_EXCLUSIONS"],
"filters": self.site.config["FILTERS"],
"translations": self.site.config["TRANSLATIONS"],
+ "tzinfo": self.site.config['__tzinfo__'],
+ "sitemap_plugin_revision": 1,
}
output = kw['output_folder']
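Note: because the lookup above uses config.get() with a default, a site's conf.py can override the extension list outright. A minimal sketch, with values that simply mirror the new default:

# conf.py (sketch): extensions the sitemap plugin will consider.
MAPPED_EXTENSIONS = ['.atom', '.html', '.htm', '.php', '.xml', '.rss']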
@@ -132,6 +140,7 @@ class Sitemap(LateTask):
urlset = {}
def scan_locs():
+ """Scan site locations."""
for root, dirs, files in os.walk(output, followlinks=True):
if not dirs and not files and not kw['sitemap_include_fileless_dirs']:
continue # Totally empty, not on sitemap
@@ -169,17 +178,18 @@ class Sitemap(LateTask):
filehead = fh.read(1024)
fh.close()
- if path.endswith('.html') or path.endswith('.htm'):
+ if path.endswith('.html') or path.endswith('.htm') or path.endswith('.php'):
""" ignores "html" files without doctype """
if b'<!doctype html' not in filehead.lower():
continue
""" ignores "html" files with noindex robot directives """
- robots_directives = [b'<meta content="noindex" name="robots"',
- b'<meta content="none" name="robots"',
- b'<meta name="robots" content="noindex"',
- b'<meta name="robots" content="none"']
- if any([robot_directive in filehead.lower() for robot_directive in robots_directives]):
+ robots_directives = [b'<meta content=noindex name=robots',
+ b'<meta content=none name=robots',
+ b'<meta name=robots content=noindex',
+ b'<meta name=robots content=none']
+ lowquothead = filehead.lower().decode('utf-8', 'ignore').replace('"', '').encode('utf-8')
+ if any([robot_directive in lowquothead for robot_directive in robots_directives]):
continue
# put Atom and RSS in sitemapindex[] instead of in urlset[],
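Note: the quote-stripping normalization above is easier to follow in isolation. A minimal sketch of the same check; the helper name has_noindex_directive and the sample tags are invented for illustration:

ROBOTS_DIRECTIVES = [b'<meta content=noindex name=robots',
                     b'<meta content=none name=robots',
                     b'<meta name=robots content=noindex',
                     b'<meta name=robots content=none']

def has_noindex_directive(filehead):
    """Return True if the page head carries a noindex/none robots meta tag."""
    # Lowercase and drop double quotes, so quoted and unquoted attribute
    # styles reduce to the same byte patterns before matching.
    lowquothead = filehead.lower().decode('utf-8', 'ignore').replace('"', '').encode('utf-8')
    return any(d in lowquothead for d in ROBOTS_DIRECTIVES)

assert has_noindex_directive(b'<meta name="robots" content="noindex">')
assert has_noindex_directive(b'<META NAME=ROBOTS CONTENT=NONE>')
assert not has_noindex_directive(b'<meta name="viewport" content="width=device-width">')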
@@ -210,6 +220,7 @@ class Sitemap(LateTask):
urlset[loc] = loc_format.format(loc, lastmod, '\n'.join(alternates))
def robot_fetch(path):
+ """Check if robots can fetch a file."""
for rule in kw["robots_exclusions"]:
robot = robotparser.RobotFileParser()
robot.parse(["User-Agent: *", "Disallow: {0}".format(rule)])
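Note: each ROBOTS_EXCLUSIONS rule above is turned into a one-rule robots.txt and checked with the stdlib parser. The same mechanism in isolation (Python 3 import path; the rule and URLs are made up):

try:
    import urllib.robotparser as robotparser  # Python 3
except ImportError:
    import robotparser  # Python 2

robot = robotparser.RobotFileParser()
robot.parse(["User-Agent: *", "Disallow: /private/"])
print(robot.can_fetch("*", "https://example.com/private/page.html"))  # False
print(robot.can_fetch("*", "https://example.com/public/page.html"))   # True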
@@ -218,6 +229,7 @@ class Sitemap(LateTask):
return True
def write_sitemap():
+ """Write sitemap to file."""
# Have to rescan, because files may have been added between
# task dep scanning and task execution
with io.open(sitemap_path, 'w+', encoding='utf8') as outf:
@@ -229,16 +241,19 @@ class Sitemap(LateTask):
sitemapindex[sitemap_url] = sitemap_format.format(sitemap_url, self.get_lastmod(sitemap_path))
def write_sitemapindex():
+ """Write sitemap index."""
with io.open(sitemapindex_path, 'w+', encoding='utf8') as outf:
outf.write(sitemapindex_header)
for k in sorted(sitemapindex.keys()):
outf.write(sitemapindex[k])
outf.write(sitemapindex_footer)
- # Yield a task to calculate the dependencies of the sitemap
- # Other tasks can depend on this output, instead of having
- # to scan locations.
def scan_locs_task():
+ """Yield a task to calculate the dependencies of the sitemap.
+
+ Other tasks can depend on this output, instead of having
+ to scan locations.
+ """
scan_locs()
# Generate a list of file dependencies for the actual generation
@@ -290,10 +305,15 @@ class Sitemap(LateTask):
}, kw['filters'])
def get_lastmod(self, p):
+ """Get last modification date."""
if self.site.invariant:
return '2038-01-01'
else:
- return datetime.datetime.fromtimestamp(os.stat(p).st_mtime).isoformat().split('T')[0]
+ # RFC 3339 (the web profile of ISO 8601) represented in UTC with the
+ # Zulu zone designator, as recommended for sitemaps. Second and
+ # microsecond precision is stripped for compatibility.
+ lastmod = datetime.datetime.utcfromtimestamp(os.stat(p).st_mtime).replace(tzinfo=dateutil.tz.gettz('UTC'), second=0, microsecond=0).isoformat().replace('+00:00', 'Z')
+ return lastmod
if __name__ == '__main__':
import doctest
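Note: the new lastmod formatting above can be checked in isolation. utcfromtimestamp() yields a naive UTC datetime; attaching a UTC tzinfo makes isoformat() emit an offset, which is then rewritten to the 'Z' designator. The mtime value below is invented:

import datetime
import dateutil.tz

mtime = 1404777600  # hypothetical st_mtime: 2014-07-08 00:00:00 UTC
lastmod = (datetime.datetime.utcfromtimestamp(mtime)
           .replace(tzinfo=dateutil.tz.gettz('UTC'), second=0, microsecond=0)
           .isoformat().replace('+00:00', 'Z'))
print(lastmod)  # 2014-07-08T00:00:00Z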