diff options
Diffstat (limited to 'nikola/plugins/command/check.py')
| -rw-r--r-- | nikola/plugins/command/check.py | 70 |
1 files changed, 62 insertions, 8 deletions
diff --git a/nikola/plugins/command/check.py b/nikola/plugins/command/check.py index a9bc44a..abf183e 100644 --- a/nikola/plugins/command/check.py +++ b/nikola/plugins/command/check.py @@ -24,11 +24,14 @@ # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +"""Check the generated site.""" + from __future__ import print_function from collections import defaultdict import os import re import sys +import time try: from urllib import unquote from urlparse import urlparse, urljoin, urldefrag @@ -40,7 +43,7 @@ import lxml.html import requests from nikola.plugin_categories import Command -from nikola.utils import get_logger +from nikola.utils import get_logger, STDERR_HANDLER def _call_nikola_list(site): @@ -58,6 +61,7 @@ def _call_nikola_list(site): def real_scan_files(site): + """Scan for files.""" task_fnames = set([]) real_fnames = set([]) output_folder = site.config['OUTPUT_FOLDER'] @@ -80,7 +84,8 @@ def real_scan_files(site): def fs_relpath_from_url_path(url_path): - """Expects as input an urlparse(s).path""" + """Create a filesystem relative path from an URL path.""" + # Expects as input an urlparse(s).path url_path = unquote(url_path) # in windows relative paths don't begin with os.sep if sys.platform == 'win32' and len(url_path): @@ -89,6 +94,7 @@ def fs_relpath_from_url_path(url_path): class CommandCheck(Command): + """Check the generated site.""" name = "check" @@ -147,7 +153,7 @@ class CommandCheck(Command): def _execute(self, options, args): """Check the generated site.""" - self.logger = get_logger('check', self.site.loghandlers) + self.logger = get_logger('check', STDERR_HANDLER) if not options['links'] and not options['files'] and not options['clean']: print(self.help()) @@ -169,6 +175,7 @@ class CommandCheck(Command): checked_remote_targets = {} def analyze(self, fname, find_sources=False, check_remote=False): + """Analyze links on a page.""" rv = False self.whitelist = [re.compile(x) for x in self.site.config['LINK_CHECK_WHITELIST']] base_url = urlparse(self.site.config['BASE_URL']) @@ -217,15 +224,45 @@ class CommandCheck(Command): if parsed.netloc == base_url.netloc: # absolute URL to self.site continue if target in self.checked_remote_targets: # already checked this exact target - if self.checked_remote_targets[target] > 399: - self.logger.warn("Broken link in {0}: {1} [Error {2}]".format(filename, target, self.checked_remote_targets[target])) + if self.checked_remote_targets[target] in [301, 307]: + self.logger.warn("Remote link PERMANENTLY redirected in {0}: {1} [Error {2}]".format(filename, target, self.checked_remote_targets[target])) + elif self.checked_remote_targets[target] in [302, 308]: + self.logger.info("Remote link temporarily redirected in {1}: {2} [HTTP: {3}]".format(filename, target, self.checked_remote_targets[target])) + elif self.checked_remote_targets[target] > 399: + self.logger.error("Broken link in {0}: {1} [Error {2}]".format(filename, target, self.checked_remote_targets[target])) continue + + # Skip whitelisted targets + if any(re.search(_, target) for _ in self.whitelist): + continue + # Check the remote link works req_headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0 (Nikola)'} # I’m a real boy! - resp = requests.head(target, headers=req_headers) - self.checked_remote_targets[target] = resp.status_code + resp = requests.head(target, headers=req_headers, allow_redirects=False) + + # Retry client errors (4xx) as GET requests because many servers are broken + if resp.status_code >= 400 and resp.status_code <= 499: + time.sleep(0.5) + resp = requests.get(target, headers=req_headers, allow_redirects=False) + + # Follow redirects and see where they lead, redirects to errors will be reported twice + if resp.status_code in [301, 302, 307, 308]: + redir_status_code = resp.status_code + time.sleep(0.5) + # Known redirects are retested using GET because IIS servers otherwise get HEADaches + resp = requests.get(target, headers=req_headers, allow_redirects=True) + # Permanent redirects should be updated + if redir_status_code in [301, 308]: + self.logger.warn("Remote link moved PERMANENTLY to \"{0}\" and should be updated in {1}: {2} [HTTP: {3}]".format(resp.url, filename, target, redir_status_code)) + if redir_status_code in [302, 307]: + self.logger.info("Remote link temporarily redirected to \"{0}\" in {1}: {2} [HTTP: {3}]".format(resp.url, filename, target, redir_status_code)) + self.checked_remote_targets[resp.url] = resp.status_code + self.checked_remote_targets[target] = redir_status_code + else: + self.checked_remote_targets[target] = resp.status_code + if resp.status_code > 399: # Error - self.logger.warn("Broken link in {0}: {1} [Error {2}]".format(filename, target, resp.status_code)) + self.logger.error("Broken link in {0}: {1} [Error {2}]".format(filename, target, resp.status_code)) continue elif resp.status_code <= 399: # The address leads *somewhere* that is not an error self.logger.debug("Successfully checked remote link in {0}: {1} [HTTP: {2}]".format(filename, target, resp.status_code)) @@ -271,6 +308,7 @@ class CommandCheck(Command): return rv def scan_links(self, find_sources=False, check_remote=False): + """Check links on the site.""" self.logger.info("Checking Links:") self.logger.info("===============\n") self.logger.notice("{0} mode".format(self.site.config['URL_TYPE'])) @@ -286,6 +324,7 @@ class CommandCheck(Command): return failure def scan_files(self): + """Check files in the site, find missing and orphaned files.""" failure = False self.logger.info("Checking Files:") self.logger.info("===============\n") @@ -311,7 +350,22 @@ class CommandCheck(Command): return failure def clean_files(self): + """Remove orphaned files.""" only_on_output, _ = real_scan_files(self.site) for f in only_on_output: + self.logger.info('removed: {0}'.format(f)) os.unlink(f) + + # Find empty directories and remove them + output_folder = self.site.config['OUTPUT_FOLDER'] + all_dirs = [] + for root, dirs, files in os.walk(output_folder, followlinks=True): + all_dirs.append(root) + all_dirs.sort(key=len, reverse=True) + for d in all_dirs: + try: + os.rmdir(d) + self.logger.info('removed: {0}/'.format(d)) + except OSError: + pass return True |
