about summary refs log tree commit diff stats
path: root/nikola/plugins/command/check.py
diff options
context:
space:
mode:
Diffstat (limited to 'nikola/plugins/command/check.py')
-rw-r--r-- nikola/plugins/command/check.py 134
1 files changed, 99 insertions, 35 deletions
diff --git a/nikola/plugins/command/check.py b/nikola/plugins/command/check.py
index bd254f4..a9bc44a 100644
--- a/nikola/plugins/command/check.py
+++ b/nikola/plugins/command/check.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright © 2012-2014 Roberto Alsina and others.
+# Copyright © 2012-2015 Roberto Alsina and others.
# Permission is hereby granted, free of charge, to any
# person obtaining a copy of this software and associated
@@ -25,6 +25,7 @@
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
from __future__ import print_function
+from collections import defaultdict
import os
import re
import sys
@@ -34,21 +35,36 @@ try:
except ImportError:
from urllib.parse import unquote, urlparse, urljoin, urldefrag # NOQA
+from doit.loader import generate_tasks
import lxml.html
+import requests
from nikola.plugin_categories import Command
from nikola.utils import get_logger
+def _call_nikola_list(site):
+ files = []
+ deps = defaultdict(list)
+ for task in generate_tasks('render_site', site.gen_tasks('render_site', "Task", '')):
+ files.extend(task.targets)
+ for target in task.targets:
+ deps[target].extend(task.file_dep)
+ for task in generate_tasks('post_render', site.gen_tasks('render_site', "LateTask", '')):
+ files.extend(task.targets)
+ for target in task.targets:
+ deps[target].extend(task.file_dep)
+ return files, deps
+
+
def real_scan_files(site):
task_fnames = set([])
real_fnames = set([])
output_folder = site.config['OUTPUT_FOLDER']
# First check that all targets are generated in the right places
- for task in os.popen('nikola list --all', 'r').readlines():
- task = task.strip()
- if output_folder in task and ':' in task:
- fname = task.split(':', 1)[-1]
+ for fname in _call_nikola_list(site)[0]:
+ fname = fname.strip()
+ if fname.startswith(output_folder):
task_fnames.add(fname)
# And now check that there are no non-target files
for root, dirs, files in os.walk(output_folder, followlinks=True):
@@ -68,7 +84,7 @@ def fs_relpath_from_url_path(url_path):
url_path = unquote(url_path)
# in windows relative paths don't begin with os.sep
if sys.platform == 'win32' and len(url_path):
- url_path = url_path[1:].replace('/', '\\')
+ url_path = url_path.replace('/', '\\')
return url_path
@@ -78,7 +94,7 @@ class CommandCheck(Command):
name = "check"
logger = None
- doc_usage = "-l [--find-sources] | -f"
+ doc_usage = "[-v] (-l [--find-sources] [-r] | -f [--clean-files])"
doc_purpose = "check links and files in the generated site"
cmd_options = [
{
@@ -119,11 +135,18 @@ class CommandCheck(Command):
'default': False,
'help': 'Be more verbose.',
},
+ {
+ 'name': 'remote',
+ 'long': 'remote',
+ 'short': 'r',
+ 'type': bool,
+ 'default': False,
+ 'help': 'Check that remote links work.',
+ },
]
def _execute(self, options, args):
"""Check the generated site."""
-
self.logger = get_logger('check', self.site.loghandlers)
if not options['links'] and not options['files'] and not options['clean']:
@@ -134,59 +157,103 @@ class CommandCheck(Command):
else:
self.logger.level = 4
if options['links']:
- failure = self.scan_links(options['find_sources'])
+ failure = self.scan_links(options['find_sources'], options['remote'])
if options['files']:
failure = self.scan_files()
if options['clean']:
failure = self.clean_files()
if failure:
- sys.exit(1)
+ return 1
existing_targets = set([])
+ checked_remote_targets = {}
- def analyze(self, task, find_sources=False):
+ def analyze(self, fname, find_sources=False, check_remote=False):
rv = False
self.whitelist = [re.compile(x) for x in self.site.config['LINK_CHECK_WHITELIST']]
base_url = urlparse(self.site.config['BASE_URL'])
self.existing_targets.add(self.site.config['SITE_URL'])
self.existing_targets.add(self.site.config['BASE_URL'])
url_type = self.site.config['URL_TYPE']
- if url_type == 'absolute':
- url_netloc_to_root = urlparse(self.site.config['SITE_URL']).path
+
+ deps = {}
+ if find_sources:
+ deps = _call_nikola_list(self.site)[1]
+
+ if url_type in ('absolute', 'full_path'):
+ url_netloc_to_root = urlparse(self.site.config['BASE_URL']).path
try:
- filename = task.split(":")[-1]
- d = lxml.html.fromstring(open(filename).read())
+ filename = fname
+
+ if filename.startswith(self.site.config['CACHE_FOLDER']):
+ # Do not look at links in the cache, which are not parsed by
+ # anyone and may result in false positives. Problems arise
+ # with galleries, for example. Full rationale: (Issue #1447)
+ self.logger.notice("Ignoring {0} (in cache, links may be incorrect)".format(filename))
+ return False
+
+ if not os.path.exists(fname):
+ # Quietly ignore files that don’t exist; use `nikola check -f` instead (Issue #1831)
+ return False
+
+ d = lxml.html.fromstring(open(filename, 'rb').read())
for l in d.iterlinks():
- target = l[0].attrib[l[1]]
+ target = l[2]
if target == "#":
continue
target, _ = urldefrag(target)
parsed = urlparse(target)
- # Absolute links when using only paths, skip.
- if (parsed.scheme or target.startswith('//')) and url_type in ('rel_path', 'full_path'):
- continue
+ # Warn about links from https to http (mixed-security)
+ if base_url.netloc == parsed.netloc and base_url.scheme == "https" and parsed.scheme == "http":
+ self.logger.warn("Mixed-content security for link in {0}: {1}".format(filename, target))
# Absolute links to other domains, skip
- if (parsed.scheme or target.startswith('//')) and parsed.netloc != base_url.netloc:
+ # Absolute links when using only paths, skip.
+ if ((parsed.scheme or target.startswith('//')) and parsed.netloc != base_url.netloc) or \
+ ((parsed.scheme or target.startswith('//')) and url_type in ('rel_path', 'full_path')):
+ if not check_remote or parsed.scheme not in ["http", "https"]:
+ continue
+ if parsed.netloc == base_url.netloc: # absolute URL to self.site
+ continue
+ if target in self.checked_remote_targets: # already checked this exact target
+ if self.checked_remote_targets[target] > 399:
+ self.logger.warn("Broken link in {0}: {1} [Error {2}]".format(filename, target, self.checked_remote_targets[target]))
+ continue
+ # Check the remote link works
+ req_headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0 (Nikola)'} # I’m a real boy!
+ resp = requests.head(target, headers=req_headers)
+ self.checked_remote_targets[target] = resp.status_code
+ if resp.status_code > 399: # Error
+ self.logger.warn("Broken link in {0}: {1} [Error {2}]".format(filename, target, resp.status_code))
+ continue
+ elif resp.status_code <= 399: # The address leads *somewhere* that is not an error
+ self.logger.debug("Successfully checked remote link in {0}: {1} [HTTP: {2}]".format(filename, target, resp.status_code))
+ continue
+ self.logger.warn("Could not check remote link in {0}: {1} [Unknown problem]".format(filename, target))
continue
if url_type == 'rel_path':
- target_filename = os.path.abspath(
- os.path.join(os.path.dirname(filename), unquote(target)))
+ if target.startswith('/'):
+ target_filename = os.path.abspath(
+ os.path.join(self.site.config['OUTPUT_FOLDER'], unquote(target.lstrip('/'))))
+ else: # Relative path
+ target_filename = os.path.abspath(
+ os.path.join(os.path.dirname(filename), unquote(target)))
elif url_type in ('full_path', 'absolute'):
if url_type == 'absolute':
# convert to 'full_path' case, ie url relative to root
- url_rel_path = target.path[len(url_netloc_to_root):]
+ url_rel_path = parsed.path[len(url_netloc_to_root):]
else:
- url_rel_path = target.path
+ # convert to relative to base path
+ url_rel_path = target[len(url_netloc_to_root):]
if url_rel_path == '' or url_rel_path.endswith('/'):
url_rel_path = urljoin(url_rel_path, self.site.config['INDEX_FILE'])
fs_rel_path = fs_relpath_from_url_path(url_rel_path)
target_filename = os.path.join(self.site.config['OUTPUT_FOLDER'], fs_rel_path)
- if any(re.match(x, target_filename) for x in self.whitelist):
+ if any(re.search(x, target_filename) for x in self.whitelist):
continue
elif target_filename not in self.existing_targets:
if os.path.exists(target_filename):
@@ -197,25 +264,22 @@ class CommandCheck(Command):
self.logger.warn("Broken link in {0}: {1}".format(filename, target))
if find_sources:
self.logger.warn("Possible sources:")
- self.logger.warn(os.popen('nikola list --deps ' + task, 'r').read())
+ self.logger.warn("\n".join(deps[filename]))
self.logger.warn("===============================\n")
except Exception as exc:
self.logger.error("Error with: {0} {1}".format(filename, exc))
return rv
- def scan_links(self, find_sources=False):
+ def scan_links(self, find_sources=False, check_remote=False):
self.logger.info("Checking Links:")
self.logger.info("===============\n")
self.logger.notice("{0} mode".format(self.site.config['URL_TYPE']))
failure = False
- for task in os.popen('nikola list --all', 'r').readlines():
- task = task.strip()
- if task.split(':')[0] in (
- 'render_tags', 'render_archive',
- 'render_galleries', 'render_indexes',
- 'render_pages'
- 'render_site') and '.html' in task:
- if self.analyze(task, find_sources):
+ # Maybe we should just examine all HTML files
+ output_folder = self.site.config['OUTPUT_FOLDER']
+ for fname in _call_nikola_list(self.site)[0]:
+ if fname.startswith(output_folder) and '.html' == fname[-5:]:
+ if self.analyze(fname, find_sources, check_remote):
failure = True
if not failure:
self.logger.info("All links checked.")