about summary refs log tree commit diff stats
path: root/nikola/plugins/command/check.py
diff options
context:
space:
mode:
Diffstat (limited to 'nikola/plugins/command/check.py')
-rw-r--r-- nikola/plugins/command/check.py 42
1 files changed, 32 insertions, 10 deletions
diff --git a/nikola/plugins/command/check.py b/nikola/plugins/command/check.py
index f9b701b..5bcbced 100644
--- a/nikola/plugins/command/check.py
+++ b/nikola/plugins/command/check.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright © 2012-2022 Roberto Alsina and others.
+# Copyright © 2012-2024 Roberto Alsina and others.
# Permission is hereby granted, free of charge, to any
# person obtaining a copy of this software and associated
@@ -148,6 +148,22 @@ class CommandCheck(Command):
'default': False,
'help': 'Check that remote links work.',
},
+ {
+ 'name': 'timeout',
+ 'long': 'timeout',
+ 'short': 't',
+ 'type': int,
+ 'default': 30,
+ 'help': 'Timeout (in seconds) for HTTP requests in remote checks.',
+ },
+ {
+ 'name': 'ignore_query_strings',
+ 'long': 'ignore-query-strings',
+ 'short': 'q',
+ 'type': bool,
+ 'default': False,
+ 'help': 'Ignore query strings for internal links.',
+ }
]
def _execute(self, options, args):
@@ -160,8 +176,9 @@ class CommandCheck(Command):
else:
self.logger.level = logging.WARNING
failure = False
+ self.timeout = options['timeout']
if options['links']:
- failure |= self.scan_links(options['find_sources'], options['remote'])
+ failure |= self.scan_links(options['find_sources'], options['remote'], options['ignore_query_strings'])
if options['files']:
failure |= self.scan_files()
if options['clean']:
@@ -171,9 +188,10 @@ class CommandCheck(Command):
existing_targets = set([])
checked_remote_targets = {}
+ timeout = None
cache = {}
- def analyze(self, fname, find_sources=False, check_remote=False):
+ def analyze(self, fname, find_sources=False, check_remote=False, ignore_query_strings=False):
"""Analyze links on a page."""
rv = False
self.whitelist = [re.compile(x) for x in self.site.config['LINK_CHECK_WHITELIST']]
@@ -279,19 +297,19 @@ class CommandCheck(Command):
# Check the remote link works
req_headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0 (Nikola)'} # I’m a real boy!
- resp = requests.head(target, headers=req_headers, allow_redirects=False)
+ resp = requests.head(target, headers=req_headers, allow_redirects=False, timeout=self.timeout)
# Retry client errors (4xx) as GET requests because many servers are broken
if resp.status_code >= 400 and resp.status_code <= 499:
time.sleep(0.5)
- resp = requests.get(target, headers=req_headers, allow_redirects=False)
+ resp = requests.get(target, headers=req_headers, allow_redirects=False, timeout=self.timeout)
# Follow redirects and see where they lead, redirects to errors will be reported twice
if resp.status_code in [301, 302, 307, 308]:
redir_status_code = resp.status_code
time.sleep(0.5)
# Known redirects are retested using GET because IIS servers otherwise get HEADaches
- resp = requests.get(target, headers=req_headers, allow_redirects=True)
+ resp = requests.get(target, headers=req_headers, allow_redirects=True, timeout=self.timeout)
# Permanent redirects should be updated
if redir_status_code in [301, 308]:
self.logger.warning("Remote link moved PERMANENTLY to \"{0}\" and should be updated in {1}: {2} [HTTP: {3}]".format(resp.url, filename, target, redir_status_code))
@@ -353,6 +371,10 @@ class CommandCheck(Command):
else:
target_filename_str = target_filename.decode("utf-8", errors="surrogateescape")
+ if ignore_query_strings and "?" in target_filename_str:
+ target_filename, _, _ = target_filename.rpartition("?")
+ target_filename_str, _, _ = target_filename_str.rpartition("?")
+
if any(pattern.search(target_filename_str) for pattern in self.whitelist):
continue
@@ -371,7 +393,7 @@ class CommandCheck(Command):
self.logger.error(u"Error with: {0} {1}".format(filename, exc))
return rv
- def scan_links(self, find_sources=False, check_remote=False):
+ def scan_links(self, find_sources=False, check_remote=False, ignore_query_strings=False):
"""Check links on the site."""
self.logger.debug("Checking Links:")
self.logger.debug("===============\n")
@@ -387,13 +409,13 @@ class CommandCheck(Command):
for fname in _call_nikola_list(self.site, self.cache)[0]:
if fname.startswith(output_folder):
if '.html' == fname[-5:]:
- if self.analyze(fname, find_sources, check_remote):
+ if self.analyze(fname, find_sources, check_remote, ignore_query_strings):
failure = True
if atom_extension == fname[-len(atom_extension):]:
- if self.analyze(fname, find_sources, False):
+ if self.analyze(fname, find_sources, False, ignore_query_strings):
failure = True
if fname.endswith('sitemap.xml') or fname.endswith('sitemapindex.xml'):
- if self.analyze(fname, find_sources, False):
+ if self.analyze(fname, find_sources, False, ignore_query_strings):
failure = True
if not failure:
self.logger.debug("All links checked.")