Diffstat (limited to 'nikola/plugins/command/check.py')
| -rw-r--r-- | nikola/plugins/command/check.py | 104 |
1 file changed, 60 insertions, 44 deletions
diff --git a/nikola/plugins/command/check.py b/nikola/plugins/command/check.py
index 0141a6b..cac6000 100644
--- a/nikola/plugins/command/check.py
+++ b/nikola/plugins/command/check.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright © 2012-2016 Roberto Alsina and others.
+# Copyright © 2012-2020 Roberto Alsina and others.
 
 # Permission is hereby granted, free of charge, to any
 # person obtaining a copy of this software and associated
@@ -26,25 +26,19 @@
 """Check the generated site."""
 
-from __future__ import print_function
-from collections import defaultdict
+import logging
 import os
 import re
 import sys
 import time
 
-import logbook
-try:
-    from urllib import unquote
-    from urlparse import urlparse, urljoin, urldefrag
-except ImportError:
-    from urllib.parse import unquote, urlparse, urljoin, urldefrag  # NOQA
+from collections import defaultdict
+from urllib.parse import unquote, urlparse, urljoin, urldefrag
 
-from doit.loader import generate_tasks
 import lxml.html
 import requests
+from doit.loader import generate_tasks
 
 from nikola.plugin_categories import Command
-from nikola.utils import get_logger, STDERR_HANDLER
 
 
 def _call_nikola_list(site, cache=None):
@@ -104,7 +98,6 @@ class CommandCheck(Command):
     """Check the generated site."""
 
     name = "check"
-    logger = None
 
     doc_usage = "[-v] (-l [--find-sources] [-r] | -f [--clean-files])"
     doc_purpose = "check links and files in the generated site"
@@ -159,15 +152,13 @@ class CommandCheck(Command):
 
     def _execute(self, options, args):
        """Check the generated site."""
-        self.logger = get_logger('check', STDERR_HANDLER)
-
         if not options['links'] and not options['files'] and not options['clean']:
             print(self.help())
-            return False
+            return 1
         if options['verbose']:
-            self.logger.level = logbook.DEBUG
+            self.logger.level = logging.DEBUG
         else:
-            self.logger.level = logbook.NOTICE
+            self.logger.level = logging.WARNING
         failure = False
         if options['links']:
             failure |= self.scan_links(options['find_sources'], options['remote'])
@@ -191,6 +182,7 @@ class CommandCheck(Command):
         self.existing_targets.add(self.site.config['SITE_URL'])
         self.existing_targets.add(self.site.config['BASE_URL'])
         url_type = self.site.config['URL_TYPE']
+        atom_extension = self.site.config['ATOM_EXTENSION']
 
         deps = {}
         if find_sources:
@@ -205,7 +197,7 @@ class CommandCheck(Command):
                 # Do not look at links in the cache, which are not parsed by
                 # anyone and may result in false positives.  Problems arise
                 # with galleries, for example.  Full rationale: (Issue #1447)
-                self.logger.notice("Ignoring {0} (in cache, links may be incorrect)".format(filename))
+                self.logger.warning("Ignoring {0} (in cache, links may be incorrect)".format(filename))
                 return False
 
             if not os.path.exists(fname):
@@ -213,7 +205,8 @@ class CommandCheck(Command):
                 return False
 
             if '.html' == fname[-5:]:
-                d = lxml.html.fromstring(open(filename, 'rb').read())
+                with open(filename, 'rb') as inf:
+                    d = lxml.html.fromstring(inf.read())
                 extra_objs = lxml.html.fromstring('<html/>')
 
                 # Turn elements with a srcset attribute into individual img elements with src attributes
@@ -223,7 +216,7 @@ class CommandCheck(Command):
                             extra_objs.append(lxml.etree.Element('img', src=srcset_item.strip().split(' ')[0]))
                 link_elements = list(d.iterlinks()) + list(extra_objs.iterlinks())
             # Extract links from XML formats to minimal HTML, allowing those to go through the link checks
-            elif '.atom' == filename[-5:]:
+            elif atom_extension == filename[-len(atom_extension):]:
                 d = lxml.etree.parse(filename)
                 link_elements = lxml.html.fromstring('<html/>')
                 for elm in d.findall('*//{http://www.w3.org/2005/Atom}link'):
@@ -257,13 +250,13 @@ class CommandCheck(Command):
 
                 # Warn about links from https to http (mixed-security)
                 if base_url.netloc == parsed.netloc and base_url.scheme == "https" and parsed.scheme == "http":
-                    self.logger.warn("Mixed-content security for link in {0}: {1}".format(filename, target))
+                    self.logger.warning("Mixed-content security for link in {0}: {1}".format(filename, target))
 
                 # Link to an internal REDIRECTIONS page
                 if target in self.internal_redirects:
                     redir_status_code = 301
                     redir_target = [_dest for _target, _dest in self.site.config['REDIRECTIONS'] if urljoin('/', _target) == target][0]
-                    self.logger.warn("Remote link moved PERMANENTLY to \"{0}\" and should be updated in {1}: {2} [HTTP: 301]".format(redir_target, filename, target))
+                    self.logger.warning("Remote link moved PERMANENTLY to \"{0}\" and should be updated in {1}: {2} [HTTP: 301]".format(redir_target, filename, target))
 
                 # Absolute links to other domains, skip
                 # Absolute links when using only paths, skip.
@@ -273,7 +266,7 @@ class CommandCheck(Command):
                         continue
                     if target in self.checked_remote_targets:  # already checked this exact target
                         if self.checked_remote_targets[target] in [301, 308]:
-                            self.logger.warn("Remote link PERMANENTLY redirected in {0}: {1} [Error {2}]".format(filename, target, self.checked_remote_targets[target]))
+                            self.logger.warning("Remote link PERMANENTLY redirected in {0}: {1} [Error {2}]".format(filename, target, self.checked_remote_targets[target]))
                         elif self.checked_remote_targets[target] in [302, 307]:
                             self.logger.debug("Remote link temporarily redirected in {0}: {1} [HTTP: {2}]".format(filename, target, self.checked_remote_targets[target]))
                         elif self.checked_remote_targets[target] > 399:
@@ -281,7 +274,7 @@ class CommandCheck(Command):
                         continue
 
                     # Skip whitelisted targets
-                    if any(re.search(_, target) for _ in self.whitelist):
+                    if any(pattern.search(target) for pattern in self.whitelist):
                         continue
 
                     # Check the remote link works
@@ -301,7 +294,7 @@ class CommandCheck(Command):
                         resp = requests.get(target, headers=req_headers, allow_redirects=True)
                         # Permanent redirects should be updated
                         if redir_status_code in [301, 308]:
-                            self.logger.warn("Remote link moved PERMANENTLY to \"{0}\" and should be updated in {1}: {2} [HTTP: {3}]".format(resp.url, filename, target, redir_status_code))
+                            self.logger.warning("Remote link moved PERMANENTLY to \"{0}\" and should be updated in {1}: {2} [HTTP: {3}]".format(resp.url, filename, target, redir_status_code))
                         if redir_status_code in [302, 307]:
                             self.logger.debug("Remote link temporarily redirected to \"{0}\" in {1}: {2} [HTTP: {3}]".format(resp.url, filename, target, redir_status_code))
                         self.checked_remote_targets[resp.url] = resp.status_code
@@ -315,7 +308,7 @@ class CommandCheck(Command):
                     elif resp.status_code <= 399:  # The address leads *somewhere* that is not an error
                         self.logger.debug("Successfully checked remote link in {0}: {1} [HTTP: {2}]".format(filename, target, resp.status_code))
                         continue
-                    self.logger.warn("Could not check remote link in {0}: {1} [Unknown problem]".format(filename, target))
+                    self.logger.warning("Could not check remote link in {0}: {1} [Unknown problem]".format(filename, target))
                     continue
 
                 if url_type == 'rel_path':
@@ -323,23 +316,44 @@ class CommandCheck(Command):
                         target_filename = os.path.abspath(
                             os.path.join(self.site.config['OUTPUT_FOLDER'], unquote(target.lstrip('/'))))
                     else:  # Relative path
-                        unquoted_target = unquote(target).encode('utf-8') if sys.version_info.major >= 3 else unquote(target).decode('utf-8')
+                        unquoted_target = unquote(target).encode('utf-8')
                         target_filename = os.path.abspath(
                             os.path.join(os.path.dirname(filename).encode('utf-8'), unquoted_target))
-                elif url_type in ('full_path', 'absolute'):
+                else:
+                    relative = False
                     if url_type == 'absolute':
                         # convert to 'full_path' case, ie url relative to root
-                        url_rel_path = parsed.path[len(url_netloc_to_root):]
+                        if parsed.path.startswith(url_netloc_to_root):
+                            url_rel_path = parsed.path[len(url_netloc_to_root):]
+                        else:
+                            url_rel_path = parsed.path
+                            if not url_rel_path.startswith('/'):
+                                relative = True
                     else:
                         # convert to relative to base path
-                        url_rel_path = target[len(url_netloc_to_root):]
+                        if target.startswith(url_netloc_to_root):
+                            url_rel_path = target[len(url_netloc_to_root):]
+                        else:
+                            url_rel_path = target
+                            if not url_rel_path.startswith('/'):
+                                relative = True
                     if url_rel_path == '' or url_rel_path.endswith('/'):
                         url_rel_path = urljoin(url_rel_path, self.site.config['INDEX_FILE'])
 
-                    fs_rel_path = fs_relpath_from_url_path(url_rel_path)
-                    target_filename = os.path.join(self.site.config['OUTPUT_FOLDER'], fs_rel_path)
+                    if relative:
+                        unquoted_target = unquote(target).encode('utf-8')
+                        target_filename = os.path.abspath(
+                            os.path.join(os.path.dirname(filename).encode('utf-8'), unquoted_target))
+                    else:
+                        fs_rel_path = fs_relpath_from_url_path(url_rel_path)
+                        target_filename = os.path.join(self.site.config['OUTPUT_FOLDER'], fs_rel_path)
+
+                if isinstance(target_filename, str):
+                    target_filename_str = target_filename
+                else:
+                    target_filename_str = target_filename.decode("utf-8", errors="surrogateescape")
 
-                if any(re.search(x, target_filename) for x in self.whitelist):
+                if any(pattern.search(target_filename_str) for pattern in self.whitelist):
                     continue
 
                 elif target_filename not in self.existing_targets:
@@ -348,11 +362,11 @@ class CommandCheck(Command):
                         self.existing_targets.add(target_filename)
                     else:
                         rv = True
-                        self.logger.warn("Broken link in {0}: {1}".format(filename, target))
+                        self.logger.warning("Broken link in {0}: {1}".format(filename, target))
                         if find_sources:
-                            self.logger.warn("Possible sources:")
-                            self.logger.warn("\n".join(deps[filename]))
-                            self.logger.warn("===============================\n")
+                            self.logger.warning("Possible sources:")
+                            self.logger.warning("\n".join(deps[filename]))
+                            self.logger.warning("===============================\n")
         except Exception as exc:
             self.logger.error(u"Error with: {0} {1}".format(filename, exc))
         return rv
@@ -363,6 +377,7 @@ class CommandCheck(Command):
         self.logger.debug("===============\n")
         self.logger.debug("{0} mode".format(self.site.config['URL_TYPE']))
         failure = False
+        atom_extension = self.site.config['ATOM_EXTENSION']
         # Maybe we should just examine all HTML files
         output_folder = self.site.config['OUTPUT_FOLDER']
 
@@ -374,7 +389,7 @@ class CommandCheck(Command):
                 if '.html' == fname[-5:]:
                     if self.analyze(fname, find_sources, check_remote):
                         failure = True
-                if '.atom' == fname[-5:]:
+                if atom_extension == fname[-len(atom_extension):]:
                     if self.analyze(fname, find_sources, False):
                         failure = True
                 if fname.endswith('sitemap.xml') or fname.endswith('sitemapindex.xml'):
@@ -397,15 +412,15 @@ class CommandCheck(Command):
 
         if only_on_output:
             only_on_output.sort()
-            self.logger.warn("Files from unknown origins (orphans):")
+            self.logger.warning("Files from unknown origins (orphans):")
             for f in only_on_output:
-                self.logger.warn(f)
+                self.logger.warning(f)
             failure = True
         if only_on_input:
             only_on_input.sort()
-            self.logger.warn("Files not generated:")
+            self.logger.warning("Files not generated:")
             for f in only_on_input:
-                self.logger.warn(f)
+                self.logger.warning(f)
         if not failure:
             self.logger.debug("All files checked.")
         return failure
@@ -434,6 +449,7 @@ class CommandCheck(Command):
                 pass
 
         if warn_flag:
-            self.logger.warn('Some files or directories have been removed, your site may need rebuilding')
+            self.logger.warning('Some files or directories have been removed, your site may need rebuilding')
+            return True
 
-        return True
+        return False
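
Most of the churn above is the switch from logbook to the standard library's logging module: get_logger/STDERR_HANDLER and the logbook.DEBUG/logbook.NOTICE levels disappear, verbosity toggles between logging.DEBUG and logging.WARNING, and every deprecated logger.warn() call becomes logger.warning(). A minimal standalone sketch of that pattern, using an illustrative "check" logger name and made-up messages (not code from Nikola itself):

    import logging

    logger = logging.getLogger("check")  # illustrative logger name
    logging.basicConfig(format="[%(levelname)s] %(name)s: %(message)s")

    def set_verbosity(verbose):
        # Mirrors the diff: DEBUG when -v is passed, WARNING otherwise.
        logger.setLevel(logging.DEBUG if verbose else logging.WARNING)

    set_verbosity(verbose=False)
    logger.debug("only visible with -v")  # suppressed at WARNING level
    logger.warning("always visible")      # warn() is deprecated; use warning()

The diff assigns self.logger.level directly; the sketch uses the more conventional setLevel(), which has the same effect for a simple setup like this.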

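The whitelist checks also change shape: any(re.search(_, target) for _ in self.whitelist) becomes any(pattern.search(target) for pattern in self.whitelist), which implies self.whitelist now holds precompiled re.Pattern objects, presumably built with re.compile() from LINK_CHECK_WHITELIST elsewhere in the file (outside this diff). The new target_filename_str, decoded with errors="surrogateescape", ensures the patterns are always matched against str even when the target path was assembled as bytes. A small sketch of the compile-once idiom with made-up sample data:

    import re

    # Hypothetical stand-ins for LINK_CHECK_WHITELIST and the targets being checked.
    link_check_whitelist = [r"example\.com/ignored/", r"^mailto:"]
    targets = [
        "https://example.com/ignored/page.html",
        "mailto:someone@example.com",
        "/posts/broken-link.html",
    ]

    # Compile once, then reuse the compiled patterns for every target.
    whitelist = [re.compile(x) for x in link_check_whitelist]

    for target in targets:
        if any(pattern.search(target) for pattern in whitelist):
            continue  # whitelisted, skip further checks
        print("would check:", target)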