diff options
Diffstat (limited to 'sphinx/builders/linkcheck.py')
-rw-r--r-- | sphinx/builders/linkcheck.py | 113 |
1 files changed, 86 insertions, 27 deletions
diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index f250958..9178458 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -7,6 +7,7 @@ import json import re import socket import time +import warnings from html.parser import HTMLParser from os import path from queue import PriorityQueue, Queue @@ -16,29 +17,26 @@ from urllib.parse import unquote, urlparse, urlsplit, urlunparse from docutils import nodes from requests.exceptions import ConnectionError, HTTPError, SSLError, TooManyRedirects +from requests.exceptions import Timeout as RequestTimeout from sphinx.builders.dummy import DummyBuilder +from sphinx.deprecation import RemovedInSphinx80Warning from sphinx.locale import __ from sphinx.transforms.post_transforms import SphinxPostTransform from sphinx.util import encode_uri, logging, requests -from sphinx.util.console import ( # type: ignore[attr-defined] - darkgray, - darkgreen, - purple, - red, - turquoise, -) +from sphinx.util.console import darkgray, darkgreen, purple, red, turquoise from sphinx.util.http_date import rfc1123_to_epoch from sphinx.util.nodes import get_node_line if TYPE_CHECKING: - from collections.abc import Generator, Iterator + from collections.abc import Iterator from typing import Any, Callable from requests import Response from sphinx.application import Sphinx from sphinx.config import Config + from sphinx.util.typing import ExtensionMetadata logger = logging.getLogger(__name__) @@ -56,16 +54,37 @@ class CheckExternalLinksBuilder(DummyBuilder): """ Checks for broken external links. """ + name = 'linkcheck' epilog = __('Look for any errors in the above output or in ' '%(outdir)s/output.txt') def init(self) -> None: self.broken_hyperlinks = 0 + self.timed_out_hyperlinks = 0 self.hyperlinks: dict[str, Hyperlink] = {} # set a timeout for non-responding servers socket.setdefaulttimeout(5.0) + if not self.config.linkcheck_allow_unauthorized: + deprecation_msg = ( + "The default value for 'linkcheck_allow_unauthorized' will change " + "from `True` in Sphinx 7.3+ to `False`, meaning that HTTP 401 " + "unauthorized responses will be reported as broken by default. " + "See https://github.com/sphinx-doc/sphinx/issues/11433 for details." + ) + warnings.warn(deprecation_msg, RemovedInSphinx80Warning, stacklevel=1) + + if self.config.linkcheck_report_timeouts_as_broken: + deprecation_msg = ( + "The default value for 'linkcheck_report_timeouts_as_broken' will change " + 'to False in Sphinx 8, meaning that request timeouts ' + "will be reported with a new 'timeout' status, instead of as 'broken'. " + 'This is intended to provide more detail as to the failure mode. ' + 'See https://github.com/sphinx-doc/sphinx/issues/11868 for details.' + ) + warnings.warn(deprecation_msg, RemovedInSphinx80Warning, stacklevel=1) + def finish(self) -> None: checker = HyperlinkAvailabilityChecker(self.config) logger.info('') @@ -77,7 +96,7 @@ class CheckExternalLinksBuilder(DummyBuilder): for result in checker.check(self.hyperlinks): self.process_result(result) - if self.broken_hyperlinks: + if self.broken_hyperlinks or self.timed_out_hyperlinks: self.app.statuscode = 1 def process_result(self, result: CheckResult) -> None: @@ -104,6 +123,15 @@ class CheckExternalLinksBuilder(DummyBuilder): self.write_entry('local', result.docname, filename, result.lineno, result.uri) elif result.status == 'working': logger.info(darkgreen('ok ') + result.uri + result.message) + elif result.status == 'timeout': + if self.app.quiet or self.app.warningiserror: + logger.warning('timeout ' + result.uri + result.message, + location=(result.docname, result.lineno)) + else: + logger.info(red('timeout ') + result.uri + red(' - ' + result.message)) + self.write_entry('timeout', result.docname, filename, result.lineno, + result.uri + ': ' + result.message) + self.timed_out_hyperlinks += 1 elif result.status == 'broken': if self.app.quiet or self.app.warningiserror: logger.warning(__('broken link: %s (%s)'), result.uri, result.message, @@ -206,7 +234,7 @@ class HyperlinkAvailabilityChecker: self.to_ignore: list[re.Pattern[str]] = list(map(re.compile, self.config.linkcheck_ignore)) - def check(self, hyperlinks: dict[str, Hyperlink]) -> Generator[CheckResult, None, None]: + def check(self, hyperlinks: dict[str, Hyperlink]) -> Iterator[CheckResult]: self.invoke_threads() total_links = 0 @@ -283,6 +311,11 @@ class HyperlinkAvailabilityCheckWorker(Thread): self.allowed_redirects = config.linkcheck_allowed_redirects self.retries: int = config.linkcheck_retries self.rate_limit_timeout = config.linkcheck_rate_limit_timeout + self._allow_unauthorized = config.linkcheck_allow_unauthorized + if config.linkcheck_report_timeouts_as_broken: + self._timeout_status = 'broken' + else: + self._timeout_status = 'timeout' self.user_agent = config.user_agent self.tls_verify = config.tls_verify @@ -384,7 +417,7 @@ class HyperlinkAvailabilityCheckWorker(Thread): req_url = encode_uri(req_url) # Get auth info, if any - for pattern, auth_info in self.auth: # noqa: B007 (false positive) + for pattern, auth_info in self.auth: # NoQA: B007 (false positive) if pattern.match(uri): break else: @@ -424,6 +457,9 @@ class HyperlinkAvailabilityCheckWorker(Thread): del response break + except RequestTimeout as err: + return self._timeout_status, str(err), 0 + except SSLError as err: # SSL failure; report that the link is broken. return 'broken', str(err), 0 @@ -437,9 +473,31 @@ class HyperlinkAvailabilityCheckWorker(Thread): except HTTPError as err: error_message = str(err) - # Unauthorised: the reference probably exists + # Unauthorized: the client did not provide required credentials if status_code == 401: - return 'working', 'unauthorized', 0 + if self._allow_unauthorized: + deprecation_msg = ( + "\n---\n" + "The linkcheck builder encountered an HTTP 401 " + "(unauthorized) response, and will report it as " + "'working' in this version of Sphinx to maintain " + "backwards-compatibility." + "\n" + "This logic will change in Sphinx 8.0 which will " + "report the hyperlink as 'broken'." + "\n" + "To explicitly continue treating unauthorized " + "hyperlink responses as 'working', set the " + "'linkcheck_allow_unauthorized' config option to " + "``True``." + "\n" + "See https://github.com/sphinx-doc/sphinx/issues/11433 " + "for details." + "\n---" + ) + warnings.warn(deprecation_msg, RemovedInSphinx80Warning, stacklevel=1) + status = 'working' if self._allow_unauthorized else 'broken' + return status, 'unauthorized', 0 # Rate limiting; back-off if allowed, or report failure otherwise if status_code == 429: @@ -534,7 +592,6 @@ def _get_request_headers( def contains_anchor(response: Response, anchor: str) -> bool: """Determine if an anchor is contained within an HTTP response.""" - parser = AnchorCheckParser(unquote(anchor)) # Read file in chunks. If we find a matching anchor, we break # the loop early in hopes not to have to download the whole thing. @@ -607,24 +664,26 @@ def compile_linkcheck_allowed_redirects(app: Sphinx, config: Config) -> None: app.config.linkcheck_allowed_redirects.pop(url) -def setup(app: Sphinx) -> dict[str, Any]: +def setup(app: Sphinx) -> ExtensionMetadata: app.add_builder(CheckExternalLinksBuilder) app.add_post_transform(HyperlinkCollector) - app.add_config_value('linkcheck_ignore', [], False) - app.add_config_value('linkcheck_exclude_documents', [], False) - app.add_config_value('linkcheck_allowed_redirects', {}, False) - app.add_config_value('linkcheck_auth', [], False) - app.add_config_value('linkcheck_request_headers', {}, False) - app.add_config_value('linkcheck_retries', 1, False) - app.add_config_value('linkcheck_timeout', None, False, [int, float]) - app.add_config_value('linkcheck_workers', 5, False) - app.add_config_value('linkcheck_anchors', True, False) + app.add_config_value('linkcheck_ignore', [], '') + app.add_config_value('linkcheck_exclude_documents', [], '') + app.add_config_value('linkcheck_allowed_redirects', {}, '') + app.add_config_value('linkcheck_auth', [], '') + app.add_config_value('linkcheck_request_headers', {}, '') + app.add_config_value('linkcheck_retries', 1, '') + app.add_config_value('linkcheck_timeout', 30, '', (int, float)) + app.add_config_value('linkcheck_workers', 5, '') + app.add_config_value('linkcheck_anchors', True, '') # Anchors starting with ! are ignored since they are # commonly used for dynamic pages - app.add_config_value('linkcheck_anchors_ignore', ['^!'], False) - app.add_config_value('linkcheck_anchors_ignore_for_url', (), False, (tuple, list)) - app.add_config_value('linkcheck_rate_limit_timeout', 300.0, False) + app.add_config_value('linkcheck_anchors_ignore', ['^!'], '') + app.add_config_value('linkcheck_anchors_ignore_for_url', (), '', (tuple, list)) + app.add_config_value('linkcheck_rate_limit_timeout', 300.0, '') + app.add_config_value('linkcheck_allow_unauthorized', True, '') + app.add_config_value('linkcheck_report_timeouts_as_broken', True, '', bool) app.add_event('linkcheck-process-uri') |