diff options
| author | 2024-04-23 00:37:58 -0400 | |
|---|---|---|
| committer | 2024-04-23 00:37:58 -0400 | |
| commit | 9b0e86a8e74768c4fe848fb5ce8d754292db4e3e (patch) | |
| tree | cfd424be8ecb68357e6e572033f08bc534bf724f /nikola/plugins/command/check.py | |
| parent | 393aa58f2c5afd51f92fd9bd4b6dfd0dc90cea41 (diff) | |
New upstream version 8.3.0. (refs: upstream/8.3.0, upstream)
Diffstat (limited to 'nikola/plugins/command/check.py')
| -rw-r--r-- | nikola/plugins/command/check.py | 42 |
1 file changed, 32 insertions, 10 deletions
diff --git a/nikola/plugins/command/check.py b/nikola/plugins/command/check.py index f9b701b..5bcbced 100644 --- a/nikola/plugins/command/check.py +++ b/nikola/plugins/command/check.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright © 2012-2022 Roberto Alsina and others. +# Copyright © 2012-2024 Roberto Alsina and others. # Permission is hereby granted, free of charge, to any # person obtaining a copy of this software and associated @@ -148,6 +148,22 @@ class CommandCheck(Command): 'default': False, 'help': 'Check that remote links work.', }, + { + 'name': 'timeout', + 'long': 'timeout', + 'short': 't', + 'type': int, + 'default': 30, + 'help': 'Timeout (in seconds) for HTTP requests in remote checks.', + }, + { + 'name': 'ignore_query_strings', + 'long': 'ignore-query-strings', + 'short': 'q', + 'type': bool, + 'default': False, + 'help': 'Ignore query strings for internal links.', + } ] def _execute(self, options, args): @@ -160,8 +176,9 @@ class CommandCheck(Command): else: self.logger.level = logging.WARNING failure = False + self.timeout = options['timeout'] if options['links']: - failure |= self.scan_links(options['find_sources'], options['remote']) + failure |= self.scan_links(options['find_sources'], options['remote'], options['ignore_query_strings']) if options['files']: failure |= self.scan_files() if options['clean']: @@ -171,9 +188,10 @@ class CommandCheck(Command): existing_targets = set([]) checked_remote_targets = {} + timeout = None cache = {} - def analyze(self, fname, find_sources=False, check_remote=False): + def analyze(self, fname, find_sources=False, check_remote=False, ignore_query_strings=False): """Analyze links on a page.""" rv = False self.whitelist = [re.compile(x) for x in self.site.config['LINK_CHECK_WHITELIST']] @@ -279,19 +297,19 @@ class CommandCheck(Command): # Check the remote link works req_headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0 (Nikola)'} # I’m a real boy! 
- resp = requests.head(target, headers=req_headers, allow_redirects=False) + resp = requests.head(target, headers=req_headers, allow_redirects=False, timeout=self.timeout) # Retry client errors (4xx) as GET requests because many servers are broken if resp.status_code >= 400 and resp.status_code <= 499: time.sleep(0.5) - resp = requests.get(target, headers=req_headers, allow_redirects=False) + resp = requests.get(target, headers=req_headers, allow_redirects=False, timeout=self.timeout) # Follow redirects and see where they lead, redirects to errors will be reported twice if resp.status_code in [301, 302, 307, 308]: redir_status_code = resp.status_code time.sleep(0.5) # Known redirects are retested using GET because IIS servers otherwise get HEADaches - resp = requests.get(target, headers=req_headers, allow_redirects=True) + resp = requests.get(target, headers=req_headers, allow_redirects=True, timeout=self.timeout) # Permanent redirects should be updated if redir_status_code in [301, 308]: self.logger.warning("Remote link moved PERMANENTLY to \"{0}\" and should be updated in {1}: {2} [HTTP: {3}]".format(resp.url, filename, target, redir_status_code)) @@ -353,6 +371,10 @@ class CommandCheck(Command): else: target_filename_str = target_filename.decode("utf-8", errors="surrogateescape") + if ignore_query_strings and "?" 
in target_filename_str: + target_filename, _, _ = target_filename.rpartition("?") + target_filename_str, _, _ = target_filename_str.rpartition("?") + if any(pattern.search(target_filename_str) for pattern in self.whitelist): continue @@ -371,7 +393,7 @@ class CommandCheck(Command): self.logger.error(u"Error with: {0} {1}".format(filename, exc)) return rv - def scan_links(self, find_sources=False, check_remote=False): + def scan_links(self, find_sources=False, check_remote=False, ignore_query_strings=False): """Check links on the site.""" self.logger.debug("Checking Links:") self.logger.debug("===============\n") @@ -387,13 +409,13 @@ class CommandCheck(Command): for fname in _call_nikola_list(self.site, self.cache)[0]: if fname.startswith(output_folder): if '.html' == fname[-5:]: - if self.analyze(fname, find_sources, check_remote): + if self.analyze(fname, find_sources, check_remote, ignore_query_strings): failure = True if atom_extension == fname[-len(atom_extension):]: - if self.analyze(fname, find_sources, False): + if self.analyze(fname, find_sources, False, ignore_query_strings): failure = True if fname.endswith('sitemap.xml') or fname.endswith('sitemapindex.xml'): - if self.analyze(fname, find_sources, False): + if self.analyze(fname, find_sources, False, ignore_query_strings): failure = True if not failure: self.logger.debug("All links checked.") |
