diff options
Diffstat (limited to 'nikola/plugins/command/check.py')
| -rw-r--r-- | nikola/plugins/command/check.py | 204 |
1 files changed, 204 insertions, 0 deletions
diff --git a/nikola/plugins/command/check.py b/nikola/plugins/command/check.py new file mode 100644 index 0000000..5c7e49a --- /dev/null +++ b/nikola/plugins/command/check.py @@ -0,0 +1,204 @@ +# -*- coding: utf-8 -*- + +# Copyright © 2012-2013 Roberto Alsina and others. + +# Permission is hereby granted, free of charge, to any +# person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the +# Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the +# Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice +# shall be included in all copies or substantial portions of +# the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS +# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from __future__ import print_function +import os +import re +import sys +try: + from urllib import unquote + from urlparse import urlparse +except ImportError: + from urllib.parse import unquote, urlparse # NOQA + +import lxml.html + +from nikola.plugin_categories import Command +from nikola.utils import get_logger + + +class CommandCheck(Command): + """Check the generated site.""" + + name = "check" + logger = None + + doc_usage = "-l [--find-sources] | -f" + doc_purpose = "check links and files in the generated site" + cmd_options = [ + { + 'name': 'links', + 'short': 'l', + 'long': 'check-links', + 'type': bool, + 'default': False, + 'help': 'Check for dangling links', + }, + { + 'name': 'files', + 'short': 'f', + 'long': 'check-files', + 'type': bool, + 'default': False, + 'help': 'Check for unknown files', + }, + { + 'name': 'clean', + 'long': 'clean-files', + 'type': bool, + 'default': False, + 'help': 'Remove all unknown files, use with caution', + }, + { + 'name': 'find_sources', + 'long': 'find-sources', + 'type': bool, + 'default': False, + 'help': 'List possible source files for files with broken links.', + }, + ] + + def _execute(self, options, args): + """Check the generated site.""" + + self.logger = get_logger('check', self.site.loghandlers) + + if not options['links'] and not options['files'] and not options['clean']: + print(self.help()) + return False + if options['links']: + failure = self.scan_links(options['find_sources']) + if options['files']: + failure = self.scan_files() + if options['clean']: + failure = self.clean_files() + if failure: + sys.exit(1) + + existing_targets = set([]) + + def analyze(self, task, find_sources=False): + rv = False + self.whitelist = [re.compile(x) for x in self.site.config['LINK_CHECK_WHITELIST']] + try: + filename = task.split(":")[-1] + d = lxml.html.fromstring(open(filename).read()) + for l in d.iterlinks(): + target = l[0].attrib[l[1]] + if target == "#": + continue + parsed = urlparse(target) + if parsed.scheme or target.startswith('//'): + continue + if parsed.fragment: + target = target.split('#')[0] + target_filename = os.path.abspath( + os.path.join(os.path.dirname(filename), unquote(target))) + if any(re.match(x, target_filename) for x in self.whitelist): + continue + elif target_filename not in self.existing_targets: + if os.path.exists(target_filename): + self.existing_targets.add(target_filename) + else: + rv = True + self.logger.warn("Broken link in {0}: ".format(filename), target) + if find_sources: + self.logger.warn("Possible sources:") + self.logger.warn(os.popen('nikola list --deps ' + task, 'r').read()) + self.logger.warn("===============================\n") + except Exception as exc: + self.logger.error("Error with:", filename, exc) + return rv + + def scan_links(self, find_sources=False): + self.logger.notice("Checking Links:") + self.logger.notice("===============") + failure = False + for task in os.popen('nikola list --all', 'r').readlines(): + task = task.strip() + if task.split(':')[0] in ( + 'render_tags', 'render_archive', + 'render_galleries', 'render_indexes', + 'render_pages' + 'render_site') and '.html' in task: + if self.analyze(task, find_sources): + failure = True + if not failure: + self.logger.notice("All links checked.") + return failure + + def scan_files(self): + failure = False + self.logger.notice("Checking Files:") + self.logger.notice("===============\n") + only_on_output, only_on_input = self.real_scan_files() + + # Ignore folders + only_on_output = [p for p in only_on_output if not os.path.isdir(p)] + only_on_input = [p for p in only_on_input if not os.path.isdir(p)] + + if only_on_output: + only_on_output.sort() + self.logger.warn("Files from unknown origins:") + for f in only_on_output: + self.logger.warn(f) + failure = True + if only_on_input: + only_on_input.sort() + self.logger.warn("Files not generated:") + for f in only_on_input: + self.logger.warn(f) + if not failure: + self.logger.notice("All files checked.") + return failure + + def clean_files(self): + only_on_output, _ = self.real_scan_files() + for f in only_on_output: + os.unlink(f) + return True + + def real_scan_files(self): + task_fnames = set([]) + real_fnames = set([]) + output_folder = self.site.config['OUTPUT_FOLDER'] + # First check that all targets are generated in the right places + for task in os.popen('nikola list --all', 'r').readlines(): + task = task.strip() + if output_folder in task and ':' in task: + fname = task.split(':', 1)[-1] + task_fnames.add(fname) + # And now check that there are no non-target files + for root, dirs, files in os.walk(output_folder): + for src_name in files: + fname = os.path.join(root, src_name) + real_fnames.add(fname) + + only_on_output = list(real_fnames - task_fnames) + + only_on_input = list(task_fnames - real_fnames) + + return (only_on_output, only_on_input) |
