Diffstat (limited to 'nikola/plugins/command/check.py')
| -rw-r--r-- | nikola/plugins/command/check.py | 104 |
1 file changed, 60 insertions, 44 deletions
diff --git a/nikola/plugins/command/check.py b/nikola/plugins/command/check.py
index 0141a6b..cac6000 100644
--- a/nikola/plugins/command/check.py
+++ b/nikola/plugins/command/check.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright © 2012-2016 Roberto Alsina and others.
+# Copyright © 2012-2020 Roberto Alsina and others.
 
 # Permission is hereby granted, free of charge, to any
 # person obtaining a copy of this software and associated
@@ -26,25 +26,19 @@
 """Check the generated site."""
 
-from __future__ import print_function
-from collections import defaultdict
+import logging
 import os
 import re
 import sys
 import time
 
-import logbook
-try:
-    from urllib import unquote
-    from urlparse import urlparse, urljoin, urldefrag
-except ImportError:
-    from urllib.parse import unquote, urlparse, urljoin, urldefrag  # NOQA
+from collections import defaultdict
+from urllib.parse import unquote, urlparse, urljoin, urldefrag
 
-from doit.loader import generate_tasks
 import lxml.html
 import requests
+from doit.loader import generate_tasks
 
 from nikola.plugin_categories import Command
-from nikola.utils import get_logger, STDERR_HANDLER
 
 
 def _call_nikola_list(site, cache=None):
@@ -104,7 +98,6 @@ class CommandCheck(Command):
     """Check the generated site."""
 
     name = "check"
-    logger = None
 
     doc_usage = "[-v] (-l [--find-sources] [-r] | -f [--clean-files])"
     doc_purpose = "check links and files in the generated site"
@@ -159,15 +152,13 @@ class CommandCheck(Command):
 
     def _execute(self, options, args):
        """Check the generated site."""
-        self.logger = get_logger('check', STDERR_HANDLER)
-
         if not options['links'] and not options['files'] and not options['clean']:
             print(self.help())
-            return False
+            return 1
         if options['verbose']:
-            self.logger.level = logbook.DEBUG
+            self.logger.level = logging.DEBUG
         else:
-            self.logger.level = logbook.NOTICE
+            self.logger.level = logging.WARNING
         failure = False
         if options['links']:
             failure |= self.scan_links(options['find_sources'], options['remote'])
@@ -191,6 +182,7 @@ class CommandCheck(Command):
         self.existing_targets.add(self.site.config['SITE_URL'])
         self.existing_targets.add(self.site.config['BASE_URL'])
         url_type = self.site.config['URL_TYPE']
+        atom_extension = self.site.config['ATOM_EXTENSION']
 
         deps = {}
         if find_sources:
@@ -205,7 +197,7 @@ class CommandCheck(Command):
                 # Do not look at links in the cache, which are not parsed by
                 # anyone and may result in false positives.  Problems arise
                 # with galleries, for example.  Full rationale: (Issue #1447)
-                self.logger.notice("Ignoring {0} (in cache, links may be incorrect)".format(filename))
+                self.logger.warning("Ignoring {0} (in cache, links may be incorrect)".format(filename))
                 return False
 
             if not os.path.exists(fname):
@@ -213,7 +205,8 @@ class CommandCheck(Command):
                 return False
 
             if '.html' == fname[-5:]:
-                d = lxml.html.fromstring(open(filename, 'rb').read())
+                with open(filename, 'rb') as inf:
+                    d = lxml.html.fromstring(inf.read())
                 extra_objs = lxml.html.fromstring('<html/>')
 
                 # Turn elements with a srcset attribute into individual img elements with src attributes
@@ -223,7 +216,7 @@ class CommandCheck(Command):
                             extra_objs.append(lxml.etree.Element('img', src=srcset_item.strip().split(' ')[0]))
                 link_elements = list(d.iterlinks()) + list(extra_objs.iterlinks())
             # Extract links from XML formats to minimal HTML, allowing those to go through the link checks
-            elif '.atom' == filename[-5:]:
+            elif atom_extension == filename[-len(atom_extension):]:
                 d = lxml.etree.parse(filename)
                 link_elements = lxml.html.fromstring('<html/>')
                 for elm in d.findall('*//{http://www.w3.org/2005/Atom}link'):
@@ -257,13 +250,13 @@ class CommandCheck(Command):
 
                 # Warn about links from https to http (mixed-security)
                 if base_url.netloc == parsed.netloc and base_url.scheme == "https" and parsed.scheme == "http":
-                    self.logger.warn("Mixed-content security for link in {0}: {1}".format(filename, target))
+                    self.logger.warning("Mixed-content security for link in {0}: {1}".format(filename, target))
 
                 # Link to an internal REDIRECTIONS page
                 if target in self.internal_redirects:
                     redir_status_code = 301
                     redir_target = [_dest for _target, _dest in self.site.config['REDIRECTIONS'] if urljoin('/', _target) == target][0]
-                    self.logger.warn("Remote link moved PERMANENTLY to \"{0}\" and should be updated in {1}: {2} [HTTP: 301]".format(redir_target, filename, target))
+                    self.logger.warning("Remote link moved PERMANENTLY to \"{0}\" and should be updated in {1}: {2} [HTTP: 301]".format(redir_target, filename, target))
 
                 # Absolute links to other domains, skip
                 # Absolute links when using only paths, skip.
@@ -273,7 +266,7 @@ class CommandCheck(Command):
                         continue
                     if target in self.checked_remote_targets:  # already checked this exact target
                         if self.checked_remote_targets[target] in [301, 308]:
-                            self.logger.warn("Remote link PERMANENTLY redirected in {0}: {1} [Error {2}]".format(filename, target, self.checked_remote_targets[target]))
+                            self.logger.warning("Remote link PERMANENTLY redirected in {0}: {1} [Error {2}]".format(filename, target, self.checked_remote_targets[target]))
                         elif self.checked_remote_targets[target] in [302, 307]:
                             self.logger.debug("Remote link temporarily redirected in {0}: {1} [HTTP: {2}]".format(filename, target, self.checked_remote_targets[target]))
                         elif self.checked_remote_targets[target] > 399:
@@ -281,7 +274,7 @@ class CommandCheck(Command):
                         continue
 
                     # Skip whitelisted targets
-                    if any(re.search(_, target) for _ in self.whitelist):
+                    if any(pattern.search(target) for pattern in self.whitelist):
                         continue
 
                     # Check the remote link works
@@ -301,7 +294,7 @@ class CommandCheck(Command):
                         resp = requests.get(target, headers=req_headers, allow_redirects=True)
                         # Permanent redirects should be updated
                         if redir_status_code in [301, 308]:
-                            self.logger.warn("Remote link moved PERMANENTLY to \"{0}\" and should be updated in {1}: {2} [HTTP: {3}]".format(resp.url, filename, target, redir_status_code))
+                            self.logger.warning("Remote link moved PERMANENTLY to \"{0}\" and should be updated in {1}: {2} [HTTP: {3}]".format(resp.url, filename, target, redir_status_code))
                         if redir_status_code in [302, 307]:
                             self.logger.debug("Remote link temporarily redirected to \"{0}\" in {1}: {2} [HTTP: {3}]".format(resp.url, filename, target, redir_status_code))
                         self.checked_remote_targets[resp.url] = resp.status_code
@@ -315,7 +308,7 @@ class CommandCheck(Command):
                     elif resp.status_code <= 399:  # The address leads *somewhere* that is not an error
                         self.logger.debug("Successfully checked remote link in {0}: {1} [HTTP: {2}]".format(filename, target, resp.status_code))
                         continue
-                    self.logger.warn("Could not check remote link in {0}: {1} [Unknown problem]".format(filename, target))
+                    self.logger.warning("Could not check remote link in {0}: {1} [Unknown problem]".format(filename, target))
                     continue
 
                 if url_type == 'rel_path':
@@ -323,23 +316,44 @@ class CommandCheck(Command):
                         target_filename = os.path.abspath(
                             os.path.join(self.site.config['OUTPUT_FOLDER'], unquote(target.lstrip('/'))))
                     else:  # Relative path
-                        unquoted_target = unquote(target).encode('utf-8') if sys.version_info.major >= 3 else unquote(target).decode('utf-8')
+                        unquoted_target = unquote(target).encode('utf-8')
                         target_filename = os.path.abspath(
                             os.path.join(os.path.dirname(filename).encode('utf-8'), unquoted_target))
-                elif url_type in ('full_path', 'absolute'):
+                else:
+                    relative = False
                     if url_type == 'absolute':
                         # convert to 'full_path' case, ie url relative to root
-                        url_rel_path = parsed.path[len(url_netloc_to_root):]
+                        if parsed.path.startswith(url_netloc_to_root):
+                            url_rel_path = parsed.path[len(url_netloc_to_root):]
+                        else:
+                            url_rel_path = parsed.path
+                            if not url_rel_path.startswith('/'):
+                                relative = True
                     else:
                         # convert to relative to base path
-                        url_rel_path = target[len(url_netloc_to_root):]
+                        if target.startswith(url_netloc_to_root):
+                            url_rel_path = target[len(url_netloc_to_root):]
+                        else:
+                            url_rel_path = target
+                            if not url_rel_path.startswith('/'):
+                                relative = True
                     if url_rel_path == '' or url_rel_path.endswith('/'):
                         url_rel_path = urljoin(url_rel_path, self.site.config['INDEX_FILE'])
 
-                    fs_rel_path = fs_relpath_from_url_path(url_rel_path)
-                    target_filename = os.path.join(self.site.config['OUTPUT_FOLDER'], fs_rel_path)
+                    if relative:
+                        unquoted_target = unquote(target).encode('utf-8')
+                        target_filename = os.path.abspath(
+                            os.path.join(os.path.dirname(filename).encode('utf-8'), unquoted_target))
+                    else:
+                        fs_rel_path = fs_relpath_from_url_path(url_rel_path)
+                        target_filename = os.path.join(self.site.config['OUTPUT_FOLDER'], fs_rel_path)
+
+                if isinstance(target_filename, str):
+                    target_filename_str = target_filename
+                else:
+                    target_filename_str = target_filename.decode("utf-8", errors="surrogateescape")
 
-                if any(re.search(x, target_filename) for x in self.whitelist):
+                if any(pattern.search(target_filename_str) for pattern in self.whitelist):
                     continue
 
                 elif target_filename not in self.existing_targets:
@@ -348,11 +362,11 @@ class CommandCheck(Command):
                         self.existing_targets.add(target_filename)
                     else:
                         rv = True
-                        self.logger.warn("Broken link in {0}: {1}".format(filename, target))
+                        self.logger.warning("Broken link in {0}: {1}".format(filename, target))
                         if find_sources:
-                            self.logger.warn("Possible sources:")
-                            self.logger.warn("\n".join(deps[filename]))
-                            self.logger.warn("===============================\n")
+                            self.logger.warning("Possible sources:")
+                            self.logger.warning("\n".join(deps[filename]))
+                            self.logger.warning("===============================\n")
         except Exception as exc:
             self.logger.error(u"Error with: {0} {1}".format(filename, exc))
         return rv
@@ -363,6 +377,7 @@ class CommandCheck(Command):
         self.logger.debug("===============\n")
         self.logger.debug("{0} mode".format(self.site.config['URL_TYPE']))
         failure = False
+        atom_extension = self.site.config['ATOM_EXTENSION']
         # Maybe we should just examine all HTML files
         output_folder = self.site.config['OUTPUT_FOLDER']
 
@@ -374,7 +389,7 @@ class CommandCheck(Command):
                 if '.html' == fname[-5:]:
                     if self.analyze(fname, find_sources, check_remote):
                         failure = True
-                if '.atom' == fname[-5:]:
+                if atom_extension == fname[-len(atom_extension):]:
                     if self.analyze(fname, find_sources, False):
                         failure = True
                 if fname.endswith('sitemap.xml') or fname.endswith('sitemapindex.xml'):
@@ -397,15 +412,15 @@ class CommandCheck(Command):
 
         if only_on_output:
             only_on_output.sort()
-            self.logger.warn("Files from unknown origins (orphans):")
+            self.logger.warning("Files from unknown origins (orphans):")
             for f in only_on_output:
-                self.logger.warn(f)
+                self.logger.warning(f)
             failure = True
         if only_on_input:
             only_on_input.sort()
-            self.logger.warn("Files not generated:")
+            self.logger.warning("Files not generated:")
             for f in only_on_input:
-                self.logger.warn(f)
+                self.logger.warning(f)
         if not failure:
             self.logger.debug("All files checked.")
         return failure
@@ -434,6 +449,7 @@ class CommandCheck(Command):
                 pass
 
         if warn_flag:
-            self.logger.warn('Some files or directories have been removed, your site may need rebuilding')
+            self.logger.warning('Some files or directories have been removed, your site may need rebuilding')
+            return True
 
-        return True
+        return False
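
Most of the churn above is the switch from logbook to the standard library's logging module: get_logger/STDERR_HANDLER and the logbook.DEBUG/logbook.NOTICE levels disappear, verbosity toggles between logging.DEBUG and logging.WARNING, and every deprecated logger.warn() call becomes logger.warning(). A minimal standalone sketch of that pattern, using an illustrative "check" logger name and made-up messages (not code from Nikola itself):

    import logging

    logger = logging.getLogger("check")  # illustrative logger name
    logging.basicConfig(format="[%(levelname)s] %(name)s: %(message)s")

    def set_verbosity(verbose):
        # Mirrors the diff: DEBUG when -v is passed, WARNING otherwise.
        logger.setLevel(logging.DEBUG if verbose else logging.WARNING)

    set_verbosity(verbose=False)
    logger.debug("only visible with -v")  # suppressed at WARNING level
    logger.warning("always visible")      # warn() is deprecated; use warning()

The diff assigns self.logger.level directly; the sketch uses the more conventional setLevel(), which has the same effect for a simple setup like this.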

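The whitelist checks also change shape: any(re.search(_, target) for _ in self.whitelist) becomes any(pattern.search(target) for pattern in self.whitelist), which implies self.whitelist now holds precompiled re.Pattern objects, presumably built with re.compile() from LINK_CHECK_WHITELIST elsewhere in the file (outside this diff). The new target_filename_str, decoded with errors="surrogateescape", ensures the patterns are always matched against str even when the target path was assembled as bytes. A small sketch of the compile-once idiom with made-up sample data:

    import re

    # Hypothetical stand-ins for LINK_CHECK_WHITELIST and the targets being checked.
    link_check_whitelist = [r"example\.com/ignored/", r"^mailto:"]
    targets = [
        "https://example.com/ignored/page.html",
        "mailto:someone@example.com",
        "/posts/broken-link.html",
    ]

    # Compile once, then reuse the compiled patterns for every target.
    whitelist = [re.compile(x) for x in link_check_whitelist]

    for target in targets:
        if any(pattern.search(target) for pattern in whitelist):
            continue  # whitelisted, skip further checks
        print("would check:", target)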