Diffstat (limited to 'yt_dlp/networking/_requests.py')
-rw-r--r-- yt_dlp/networking/_requests.py | 408
1 file changed, 408 insertions, 0 deletions
diff --git a/yt_dlp/networking/_requests.py b/yt_dlp/networking/_requests.py
new file mode 100644
index 0000000..6545028
--- /dev/null
+++ b/yt_dlp/networking/_requests.py
@@ -0,0 +1,408 @@
+import contextlib
+import functools
+import http.client
+import logging
+import re
+import socket
+import warnings
+
+from ..dependencies import brotli, requests, urllib3
+from ..utils import bug_reports_message, int_or_none, variadic
+from ..utils.networking import normalize_url
+
+if requests is None:
+    raise ImportError('requests module is not installed')
+
+if urllib3 is None:
+    raise ImportError('urllib3 module is not installed')
+
+urllib3_version = tuple(int_or_none(x, default=0) for x in urllib3.__version__.split('.'))
+
+if urllib3_version < (1, 26, 17):
+    raise ImportError('Only urllib3 >= 1.26.17 is supported')
+
+if requests.__build__ < 0x023100:
+    raise ImportError('Only requests >= 2.31.0 is supported')
+
+import requests.adapters
+import requests.utils
+import urllib3.connection
+import urllib3.exceptions
+
+from ._helper import (
+    InstanceStoreMixin,
+    add_accept_encoding_header,
+    create_connection,
+    create_socks_proxy_socket,
+    get_redirect_method,
+    make_socks_proxy_opts,
+    select_proxy,
+)
+from .common import (
+    Features,
+    RequestHandler,
+    Response,
+    register_preference,
+    register_rh,
+)
+from .exceptions import (
+    CertificateVerifyError,
+    HTTPError,
+    IncompleteRead,
+    ProxyError,
+    RequestError,
+    SSLError,
+    TransportError,
+)
+from ..socks import ProxyError as SocksProxyError
+
+SUPPORTED_ENCODINGS = [
+    'gzip', 'deflate',
+]
+
+if brotli is not None:
+    SUPPORTED_ENCODINGS.append('br')
+
+"""
+Override urllib3's behavior to not convert lower-case percent-encoded characters
+to upper-case during the URL normalization process.
+
+RFC 3986 defines lowercase and uppercase percent-encoded hexadecimal characters
+as equivalent, and normalizers should convert them to uppercase for consistency [1].
+
+However, some sites have an incorrect implementation where they provide
+a percent-encoded URL that is then compared case-sensitively. [2]
+
+While this is a very rare case, since urllib does not do this normalization step, it
+is best to avoid it in requests too, for compatibility reasons.
+
+1: https://tools.ietf.org/html/rfc3986#section-2.1
+2: https://github.com/streamlink/streamlink/pull/4003
+"""
+
+
+class Urllib3PercentREOverride:
+    def __init__(self, r: re.Pattern):
+        self.re = r
+
+    # pass through all other attribute calls to the original re
+    def __getattr__(self, item):
+        return self.re.__getattribute__(item)
+
+    def subn(self, repl, string, *args, **kwargs):
+        return string, self.re.subn(repl, string, *args, **kwargs)[1]
+
+
+# urllib3 >= 1.25.8 uses subn:
+# https://github.com/urllib3/urllib3/commit/a2697e7c6b275f05879b60f593c5854a816489f0
+import urllib3.util.url # noqa: E305
+
+if hasattr(urllib3.util.url, 'PERCENT_RE'):
+    urllib3.util.url.PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url.PERCENT_RE)
+elif hasattr(urllib3.util.url, '_PERCENT_RE'):  # urllib3 >= 2.0.0
+    urllib3.util.url._PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url._PERCENT_RE)
+else:
+    warnings.warn('Failed to patch PERCENT_RE in urllib3 (does the attribute exist?)' + bug_reports_message())
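+
+# Illustrative sketch of the override (hypothetical values, not executed here):
+# urllib3's URL normalizer calls PERCENT_RE.subn() and keeps both the returned
+# string and the match count, so returning the input unchanged while still
+# reporting matches disables the case-folding:
+#
+#   pattern = re.compile(r'%[a-fA-F0-9]{2}')
+#   wrapped = Urllib3PercentREOverride(pattern)
+#   wrapped.subn(lambda m: m.group(0).upper(), 'a%2fb')  # -> ('a%2fb', 1)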
+
+"""
+Workaround for an issue in urllib3.util.ssl_: ssl_wrap_socket does not pass
+server_hostname to SSLContext.wrap_socket if server_hostname is an IP,
+which is a problem because we set check_hostname to True in our SSLContext.
+
+Monkey-patching IS_SECURETRANSPORT forces ssl_wrap_socket to pass server_hostname regardless.
+
+This has been fixed in urllib3 2.0+.
+See: https://github.com/urllib3/urllib3/issues/517
+"""
+
+if urllib3_version < (2, 0, 0):
+    with contextlib.suppress(Exception):
+        urllib3.util.IS_SECURETRANSPORT = urllib3.util.ssl_.IS_SECURETRANSPORT = True
+
+
+# Requests will not automatically handle no_proxy by default
+# due to its buggy no_proxy handling with proxy dicts [1].
+# 1. https://github.com/psf/requests/issues/5000
+requests.adapters.select_proxy = select_proxy
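+# e.g. with proxies={'all': 'http://proxy:8080', 'no': 'localhost'} (hypothetical
+# values), select_proxy('http://localhost/page', proxies) returns None, so the
+# request bypasses the proxy entirely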
+
+
+class RequestsResponseAdapter(Response):
+    def __init__(self, res: requests.models.Response):
+        super().__init__(
+            fp=res.raw, headers=res.headers, url=res.url,
+            status=res.status_code, reason=res.reason)
+
+        self._requests_response = res
+
+    def read(self, amt: int = None):
+        try:
+            # Interact with urllib3 response directly.
+            return self.fp.read(amt, decode_content=True)
+
+        # See urllib3.response.HTTPResponse.read() for exceptions raised on read
+        except urllib3.exceptions.SSLError as e:
+            raise SSLError(cause=e) from e
+
+        except urllib3.exceptions.ProtocolError as e:
+            # IncompleteRead is always contained within ProtocolError
+            # See urllib3.response.HTTPResponse._error_catcher()
+            ir_err = next(
+                (err for err in (e.__context__, e.__cause__, *variadic(e.args))
+                 if isinstance(err, http.client.IncompleteRead)), None)
+            if ir_err is not None:
+                # `urllib3.exceptions.IncompleteRead` is a subclass of `http.client.IncompleteRead`
+                # but uses an `int` for its `partial` property.
+                partial = ir_err.partial if isinstance(ir_err.partial, int) else len(ir_err.partial)
+                raise IncompleteRead(partial=partial, expected=ir_err.expected) from e
+            raise TransportError(cause=e) from e
+
+        except urllib3.exceptions.HTTPError as e:
+            # catch-all for any other urllib3 response exceptions
+            raise TransportError(cause=e) from e
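+
+# e.g. a connection dropped mid-body surfaces as a ProtocolError wrapping
+# http.client.IncompleteRead(partial=b'...', expected=n), which read() above
+# re-raises as IncompleteRead(partial=<number of bytes received>, expected=n)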
+
+
+class RequestsHTTPAdapter(requests.adapters.HTTPAdapter):
+    def __init__(self, ssl_context=None, proxy_ssl_context=None, source_address=None, **kwargs):
+        self._pm_args = {}
+        if ssl_context:
+            self._pm_args['ssl_context'] = ssl_context
+        if source_address:
+            self._pm_args['source_address'] = (source_address, 0)
+        self._proxy_ssl_context = proxy_ssl_context or ssl_context
+        super().__init__(**kwargs)
+
+    def init_poolmanager(self, *args, **kwargs):
+        return super().init_poolmanager(*args, **kwargs, **self._pm_args)
+
+    def proxy_manager_for(self, proxy, **proxy_kwargs):
+        extra_kwargs = {}
+        if not proxy.lower().startswith('socks') and self._proxy_ssl_context:
+            extra_kwargs['proxy_ssl_context'] = self._proxy_ssl_context
+        return super().proxy_manager_for(proxy, **proxy_kwargs, **self._pm_args, **extra_kwargs)
+
+    def cert_verify(*args, **kwargs):
+        # Lean on our SSLContext for cert verification; this deliberately
+        # overrides HTTPAdapter.cert_verify as a no-op (self is swallowed by *args)
+        pass
+
+
+class RequestsSession(requests.sessions.Session):
+ """
+ Ensure unified redirect method handling with our urllib redirect handler.
+ """
+
+ def rebuild_method(self, prepared_request, response):
+ new_method = get_redirect_method(prepared_request.method, response.status_code)
+
+ # HACK: requests removes headers/body on redirect unless code was a 307/308.
+ if new_method == prepared_request.method:
+ response._real_status_code = response.status_code
+ response.status_code = 308
+
+ prepared_request.method = new_method
+
+ # Requests fails to resolve dot segments on absolute redirect locations
+ # See: https://github.com/yt-dlp/yt-dlp/issues/9020
+ prepared_request.url = normalize_url(prepared_request.url)
+
+ def rebuild_auth(self, prepared_request, response):
+ # HACK: undo status code change from rebuild_method, if applicable.
+ # rebuild_auth runs after requests would remove headers/body based on status code
+ if hasattr(response, '_real_status_code'):
+ response.status_code = response._real_status_code
+ del response._real_status_code
+ return super().rebuild_auth(prepared_request, response)
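+
+# Example of the spoof above (hypothetical): a GET carrying a body that is
+# redirected with a 301 keeps its method, but requests strips the body and
+# related headers for any status other than 307/308; temporarily reporting 308
+# preserves them, and rebuild_auth then restores the real status code.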
+
+
+class Urllib3LoggingFilter(logging.Filter):
+
+    def filter(self, record):
+        # Ignore HTTP request messages since HTTPConnection prints those
+        if record.msg == '%s://%s:%s "%s %s %s" %s %s':
+            return False
+        return True
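+
+# The format string above is the per-request debug line emitted by urllib3's
+# connectionpool, e.g.: https://example.com:443 "GET / HTTP/1.1" 200 1234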
+
+
+class Urllib3LoggingHandler(logging.Handler):
+ """Redirect urllib3 logs to our logger"""
+
+    def __init__(self, logger, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._logger = logger
+
+    def emit(self, record):
+        try:
+            msg = self.format(record)
+            if record.levelno >= logging.ERROR:
+                self._logger.error(msg)
+            else:
+                self._logger.stdout(msg)
+
+        except Exception:
+            self.handleError(record)
+
+
+@register_rh
+class RequestsRH(RequestHandler, InstanceStoreMixin):
+
+ """Requests RequestHandler
+ https://github.com/psf/requests
+ """
+ _SUPPORTED_URL_SCHEMES = ('http', 'https')
+ _SUPPORTED_ENCODINGS = tuple(SUPPORTED_ENCODINGS)
+ _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h')
+ _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
+ RH_NAME = 'requests'
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # Forward urllib3 debug messages to our logger
+        logger = logging.getLogger('urllib3')
+        self.__logging_handler = Urllib3LoggingHandler(logger=self._logger)
+        self.__logging_handler.setFormatter(logging.Formatter('requests: %(message)s'))
+        self.__logging_handler.addFilter(Urllib3LoggingFilter())
+        logger.addHandler(self.__logging_handler)
+        # TODO: Use a logger filter to suppress pool reuse warning instead
+        logger.setLevel(logging.ERROR)
+
+        if self.verbose:
+            # Setting this globally is not ideal, but is easier than hacking with urllib3.
+            # It could technically be problematic for scripts embedding yt-dlp.
+            # However, it is unlikely debug traffic is used in that context in a way this will cause problems.
+            urllib3.connection.HTTPConnection.debuglevel = 1
+            logger.setLevel(logging.DEBUG)
+        # this is expected if we are using --no-check-certificate
+        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+    def close(self):
+        self._clear_instances()
+        # Remove the logging handler that contains a reference to our logger
+        # See: https://github.com/yt-dlp/yt-dlp/issues/8922
+        logging.getLogger('urllib3').removeHandler(self.__logging_handler)
+
+    def _check_extensions(self, extensions):
+        super()._check_extensions(extensions)
+        extensions.pop('cookiejar', None)
+        extensions.pop('timeout', None)
+
+    def _create_instance(self, cookiejar):
+        session = RequestsSession()
+        http_adapter = RequestsHTTPAdapter(
+            ssl_context=self._make_sslcontext(),
+            source_address=self.source_address,
+            # Retry(False) disables urllib3's internal retries so errors surface immediately
+            max_retries=urllib3.util.retry.Retry(False),
+        )
+        session.adapters.clear()
+        session.headers = requests.models.CaseInsensitiveDict({'Connection': 'keep-alive'})
+        session.mount('https://', http_adapter)
+        session.mount('http://', http_adapter)
+        session.cookies = cookiejar
+        session.trust_env = False  # no need, we already load proxies from env
+        return session
+
+    def _send(self, request):
+
+        headers = self._merge_headers(request.headers)
+        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
+
+        max_redirects_exceeded = False
+
+        session = self._get_instance(
+            cookiejar=request.extensions.get('cookiejar') or self.cookiejar)
+
+        try:
+            requests_res = session.request(
+                method=request.method,
+                url=request.url,
+                data=request.data,
+                headers=headers,
+                timeout=float(request.extensions.get('timeout') or self.timeout),
+                proxies=request.proxies or self.proxies,
+                allow_redirects=True,
+                stream=True,
+            )
+
+        except requests.exceptions.TooManyRedirects as e:
+            max_redirects_exceeded = True
+            requests_res = e.response
+
+        except requests.exceptions.SSLError as e:
+            if 'CERTIFICATE_VERIFY_FAILED' in str(e):
+                raise CertificateVerifyError(cause=e) from e
+            raise SSLError(cause=e) from e
+
+        except requests.exceptions.ProxyError as e:
+            raise ProxyError(cause=e) from e
+
+        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
+            raise TransportError(cause=e) from e
+
+        except urllib3.exceptions.HTTPError as e:
+            # Catch any urllib3 exceptions that may leak through
+            raise TransportError(cause=e) from e
+
+        except requests.exceptions.RequestException as e:
+            # Miscellaneous Requests exceptions. These may not necessarily be network-related, e.g. InvalidURL
+            raise RequestError(cause=e) from e
+
+        res = RequestsResponseAdapter(requests_res)
+
+        if not 200 <= res.status < 300:
+            raise HTTPError(res, redirect_loop=max_redirects_exceeded)
+
+        return res
+
+
+@register_preference(RequestsRH)
+def requests_preference(rh, request):
+    return 100
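+
+# Hedged usage sketch: in yt-dlp a RequestDirector normally constructs and
+# dispatches to this handler; shown standalone for illustration. `my_logger`
+# is a hypothetical logger exposing error()/stdout() methods:
+#
+#   from yt_dlp.networking.common import Request
+#   with RequestsRH(logger=my_logger) as rh:
+#       response = rh.send(Request('https://example.com'))
+#       data = response.read()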
+
+
+# Use our socks proxy implementation with requests to avoid an extra dependency.
+class SocksHTTPConnection(urllib3.connection.HTTPConnection):
+    def __init__(self, _socks_options, *args, **kwargs):  # must use _socks_options to pass PoolKey checks
+        self._proxy_args = _socks_options
+        super().__init__(*args, **kwargs)
+
+    def _new_conn(self):
+        try:
+            return create_connection(
+                address=(self._proxy_args['addr'], self._proxy_args['port']),
+                timeout=self.timeout,
+                source_address=self.source_address,
+                _create_socket_func=functools.partial(
+                    create_socks_proxy_socket, (self.host, self.port), self._proxy_args))
+        except (socket.timeout, TimeoutError) as e:
+            raise urllib3.exceptions.ConnectTimeoutError(
+                self, f'Connection to {self.host} timed out. (connect timeout={self.timeout})') from e
+        except SocksProxyError as e:
+            raise urllib3.exceptions.ProxyError(str(e), e) from e
+        except OSError as e:
+            raise urllib3.exceptions.NewConnectionError(
+                self, f'Failed to establish a new connection: {e}') from e
+
+
+class SocksHTTPSConnection(SocksHTTPConnection, urllib3.connection.HTTPSConnection):
+    pass
+
+
+class SocksHTTPConnectionPool(urllib3.HTTPConnectionPool):
+    ConnectionCls = SocksHTTPConnection
+
+
+class SocksHTTPSConnectionPool(urllib3.HTTPSConnectionPool):
+    ConnectionCls = SocksHTTPSConnection
+
+
+class SocksProxyManager(urllib3.PoolManager):
+
+    def __init__(self, socks_proxy, username=None, password=None, num_pools=10, headers=None, **connection_pool_kw):
+        connection_pool_kw['_socks_options'] = make_socks_proxy_opts(socks_proxy)
+        super().__init__(num_pools, headers, **connection_pool_kw)
+        self.pool_classes_by_scheme = {
+            'http': SocksHTTPConnectionPool,
+            'https': SocksHTTPSConnectionPool,
+        }
+
+
+requests.adapters.SOCKSProxyManager = SocksProxyManager
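+
+# With SOCKSProxyManager patched in, requests routes any proxy URL whose scheme
+# starts with 'socks' (e.g. the hypothetical proxies={'all': 'socks5://127.0.0.1:1080'})
+# through yt-dlp's own SOCKS implementation instead of requiring PySocks.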