From 2415e66f889f38503b73e8ebc5f43ca342390e5c Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Mon, 15 Apr 2024 18:49:24 +0200
Subject: Adding upstream version 2024.03.10.

Signed-off-by: Daniel Baumann
---
 yt_dlp/extractor/common.py | 3943 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 3943 insertions(+)
 create mode 100644 yt_dlp/extractor/common.py

diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
new file mode 100644
index 0000000..e776cca
--- /dev/null
+++ b/yt_dlp/extractor/common.py
@@ -0,0 +1,3943 @@
+import base64
+import collections
+import getpass
+import hashlib
+import http.client
+import http.cookiejar
+import http.cookies
+import inspect
+import itertools
+import json
+import math
+import netrc
+import os
+import random
+import re
+import subprocess
+import sys
+import time
+import types
+import urllib.parse
+import urllib.request
+import xml.etree.ElementTree
+
+from ..compat import functools  # isort: split
+from ..compat import (
+    compat_etree_fromstring,
+    compat_expanduser,
+    compat_os_name,
+    urllib_req_to_req,
+)
+from ..cookies import LenientSimpleCookie
+from ..downloader.f4m import get_base_url, remove_encrypted_media
+from ..downloader.hls import HlsFD
+from ..networking import HEADRequest, Request
+from ..networking.exceptions import (
+    HTTPError,
+    IncompleteRead,
+    network_exceptions,
+)
+from ..utils import (
+    IDENTITY,
+    JSON_LD_RE,
+    NO_DEFAULT,
+    ExtractorError,
+    FormatSorter,
+    GeoRestrictedError,
+    GeoUtils,
+    LenientJSONDecoder,
+    Popen,
+    RegexNotFoundError,
+    RetryManager,
+    UnsupportedError,
+    age_restricted,
+    base_url,
+    bug_reports_message,
+    classproperty,
+    clean_html,
+    deprecation_warning,
+    determine_ext,
+    dict_get,
+    encode_data_uri,
+    error_to_compat_str,
+    extract_attributes,
+    filter_dict,
+    fix_xml_ampersands,
+    float_or_none,
+    format_field,
+    int_or_none,
+    join_nonempty,
+    js_to_json,
+    mimetype2ext,
+    netrc_from_content,
+    orderedSet,
+    parse_bitrate,
+    parse_codecs,
+    parse_duration,
+    parse_iso8601,
+    parse_m3u8_attributes,
+    parse_resolution,
+    sanitize_filename,
+    sanitize_url,
+    smuggle_url,
+    str_or_none,
+    str_to_int,
+    strip_or_none,
+    traverse_obj,
+    truncate_string,
+    try_call,
+    try_get,
+    unescapeHTML,
+    unified_strdate,
+    unified_timestamp,
+    url_basename,
+    url_or_none,
+    urlhandle_detect_ext,
+    urljoin,
+    variadic,
+    xpath_element,
+    xpath_text,
+    xpath_with_ns,
+)
+
+
+class InfoExtractor:
+    """Information Extractor class.
+
+    Information extractors are the classes that, given a URL, extract
+    information about the video (or videos) the URL refers to. This
+    information includes the real video URL, the video title, author and
+    others. The information is stored in a dictionary which is then
+    passed to the YoutubeDL instance, which processes this information,
+    possibly downloading the video to the file system, among other
+    possible outcomes.
+
+    The type field determines the type of the result.
+    By far the most common value (and the default if _type is missing) is
+    "video", which indicates a single video.
+
+    For a video, the dictionaries must include the following fields:
+
+    id:             Video identifier.
+    title:          Video title, unescaped. Set to an empty string if the video
+                    has no title, as opposed to "None", which signifies that the
+                    extractor failed to obtain a title
+
+    Additionally, it must contain either a formats entry or a url entry:
+
+    formats:        A list of dictionaries for each format available, ordered
+                    from worst to best quality.
+
+                    Potential fields:
+                    * url        The mandatory URL representing the media:
+                                   for plain file media - HTTP URL of this file,
+                                   for RTMP - RTMP URL,
+                                   for HLS - URL of the M3U8 media playlist,
+                                   for HDS - URL of the F4M manifest,
+                                   for DASH
+                                     - HTTP URL to plain file media (in case of
+                                       unfragmented media)
+                                     - URL of the MPD manifest or base URL
+                                       representing the media if MPD manifest
+                                       is parsed from a string (in case of
+                                       fragmented media)
+                                   for MSS - URL of the ISM manifest.
+                    * request_data  Data to send in POST request to the URL
+                    * manifest_url
+                                 The URL of the manifest file in case of
+                                 fragmented media:
+                                   for HLS - URL of the M3U8 master playlist,
+                                   for HDS - URL of the F4M manifest,
+                                   for DASH - URL of the MPD manifest,
+                                   for MSS - URL of the ISM manifest.
+                    * manifest_stream_number  (For internal use only)
+                                 The index of the stream in the manifest file
+                    * ext        Will be calculated from URL if missing
+                    * format     A human-readable description of the format
+                                 ("mp4 container with h264/opus").
+                                 Calculated from the format_id, width, height,
+                                 and format_note fields if missing.
+                    * format_id  A short description of the format
+                                 ("mp4_h264_opus" or "19").
+                                 Technically optional, but strongly recommended.
+                    * format_note  Additional info about the format
+                                 ("3D" or "DASH video")
+                    * width      Width of the video, if known
+                    * height     Height of the video, if known
+                    * aspect_ratio  Aspect ratio of the video, if known.
+                                 Automatically calculated from width and height
+                    * resolution Textual description of width and height.
+                                 Automatically calculated from width and height
+                    * dynamic_range  The dynamic range of the video. One of:
+                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
+                    * tbr        Average bitrate of audio and video in KBit/s
+                    * abr        Average audio bitrate in KBit/s
+                    * acodec     Name of the audio codec in use
+                    * asr        Audio sampling rate in Hertz
+                    * audio_channels  Number of audio channels
+                    * vbr        Average video bitrate in KBit/s
+                    * fps        Frame rate
+                    * vcodec     Name of the video codec in use
+                    * container  Name of the container format
+                    * filesize   The number of bytes, if known in advance
+                    * filesize_approx  An estimate for the number of bytes
+                    * player_url SWF Player URL (used for rtmpdump).
+                    * protocol   The protocol that will be used for the actual
+                                 download, lower-case. One of "http", "https" or
+                                 one of the protocols defined in downloader.PROTOCOL_MAP
+                    * fragment_base_url
+                                 Base URL for fragments. Each fragment's path
+                                 value (if present) will be relative to
+                                 this URL.
+                    * fragments  A list of fragments of a fragmented media.
+                                 Each fragment entry must contain either a url
+                                 or a path. If a url is present, it should be
+                                 used by the client. Otherwise, both path and
+                                 fragment_base_url must be present. Here is
+                                 the list of all potential fields:
+                                 * "url" - fragment's URL
+                                 * "path" - fragment's path relative to
+                                            fragment_base_url
+                                 * "duration" (optional, int or float)
+                                 * "filesize" (optional, int)
+                    * is_from_start  Is a live format that can be downloaded
+                                 from the start. Boolean
+                    * preference Order number of this format. If this field is
+                                 present and not None, the formats get sorted
+                                 by this field, regardless of all other values.
+                                 -1 for default (order by other properties),
+                                 -2 or smaller for less than default.
+                                 < -1000 to hide the format (if there is
+                                 another one which is strictly better)
+                    * language   Language code, e.g. "de" or "en-US".
+                    * language_preference  Is this in the language mentioned in
+                                 the URL?
+                                 10 if it's what the URL is about,
+                                 -1 for default (don't know),
+                                 -10 otherwise, other values reserved for now.
+                    * quality    Order number of the video quality of this
+                                 format, irrespective of the file format.
+                                 -1 for default (order by other properties),
+                                 -2 or smaller for less than default.
+                    * source_preference  Order number for this video source
+                                 (quality takes higher priority)
+                                 -1 for default (order by other properties),
+                                 -2 or smaller for less than default.
+                    * http_headers  A dictionary of additional HTTP headers
+                                 to add to the request.
+                    * stretched_ratio  If given and not 1, indicates that the
+                                 video's pixels are not square.
+                                 width : height ratio as float.
+                    * no_resume  The server does not support resuming the
+                                 (HTTP or RTMP) download. Boolean.
+                    * has_drm    True if the format has DRM and cannot be downloaded.
+                                 'maybe' if the format may have DRM and has to be
+                                 tested before download.
+                    * extra_param_to_segment_url  A query string to append to each
+                                 fragment's URL, or to update each existing query
+                                 string with. Only applied by the native HLS/DASH
+                                 downloaders.
+                    * hls_aes    A dictionary of HLS AES-128 decryption information
+                                 used by the native HLS downloader to override the
+                                 values in the media playlist when an '#EXT-X-KEY'
+                                 tag is present in the playlist:
+                                 * uri  The URI from which the key will be downloaded
+                                 * key  The key (as hex) used to decrypt fragments.
+                                        If `key` is given, any key URI will be ignored
+                                 * iv   The IV (as hex) used to decrypt fragments
+                    * downloader_options  A dictionary of downloader options
+                                 (For internal use only)
+                                 * http_chunk_size  Chunk size for HTTP downloads
+                                 * ffmpeg_args      Extra arguments for ffmpeg downloader
+                    * is_dash_periods  Whether the format is a result of merging
+                                 multiple DASH periods.
+                    RTMP formats can also have the additional fields: page_url,
+                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
+                    rtmp_protocol, rtmp_real_time
+
+    url:            Final video URL.
+    ext:            Video filename extension.
+    format:         The video format, defaults to ext (used for --get-format)
+    player_url:     SWF Player URL (used for rtmpdump).
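+
+    For illustration only (the URLs and values below are made up), the
+    info_dict for a simple video with two progressive formats, ordered from
+    worst to best quality, might look like:
+
+        {
+            'id': '4234987',
+            'title': 'Dancing naked mole rats',
+            'formats': [{
+                'url': 'https://example.com/video_360p.mp4',
+                'format_id': '360p',
+                'ext': 'mp4',
+                'width': 640,
+                'height': 360,
+            }, {
+                'url': 'https://example.com/video_720p.mp4',
+                'format_id': '720p',
+                'ext': 'mp4',
+                'width': 1280,
+                'height': 720,
+            }],
+        }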
+
+    The following fields are optional:
+
+    direct:         True if a direct video file was given (must only be set by GenericIE)
+    alt_title:      A secondary title of the video.
+    display_id:     An alternative identifier for the video, not necessarily
+                    unique, but available before title. Typically, id is
+                    something like "4234987", title "Dancing naked mole rats",
+                    and display_id "dancing-naked-mole-rats"
+    thumbnails:     A list of dictionaries, with the following entries:
+                        * "id" (optional, string) - Thumbnail format ID
+                        * "url"
+                        * "preference" (optional, int) - quality of the image
+                        * "width" (optional, int)
+                        * "height" (optional, int)
+                        * "resolution" (optional, string "{width}x{height}",
+                                        deprecated)
+                        * "filesize" (optional, int)
+                        * "http_headers" (dict) - HTTP headers for the request
+    thumbnail:      Full URL to a video thumbnail image.
+    description:    Full video description.
+    uploader:       Full name of the video uploader.
+    license:        License name the video is licensed under.
+    creators:       List of creators of the video.
+    timestamp:      UNIX timestamp of the moment the video was uploaded
+    upload_date:    Video upload date in UTC (YYYYMMDD).
+                    If not explicitly set, calculated from timestamp
+    release_timestamp: UNIX timestamp of the moment the video was released.
+                    If it is not clear whether to use timestamp or this, use the former
+    release_date:   The date (YYYYMMDD) when the video was released in UTC.
+                    If not explicitly set, calculated from release_timestamp
+    release_year:   Year (YYYY) as integer when the video or album was released.
+                    To be used if no exact release date is known.
+                    If not explicitly set, calculated from release_date.
+    modified_timestamp: UNIX timestamp of the moment the video was last modified.
+    modified_date:  The date (YYYYMMDD) when the video was last modified in UTC.
+                    If not explicitly set, calculated from modified_timestamp
+    uploader_id:    Nickname or id of the video uploader.
+    uploader_url:   Full URL to a personal webpage of the video uploader.
+    channel:        Full name of the channel the video is uploaded on.
+                    Note that channel fields may or may not repeat uploader
+                    fields. This depends on a particular extractor.
+    channel_id:     Id of the channel.
+    channel_url:    Full URL to a channel webpage.
+    channel_follower_count: Number of followers of the channel.
+    channel_is_verified: Whether the channel is verified on the platform.
+    location:       Physical location where the video was filmed.
+    subtitles:      The available subtitles as a dictionary in the format
+                    {tag: subformats}. "tag" is usually a language code, and
+                    "subformats" is a list sorted from lower to higher
+                    preference, each element is a dictionary with the "ext"
+                    entry and one of:
+                        * "data": The subtitles file contents
+                        * "url": A URL pointing to the subtitles file
+                    It can optionally also have:
+                        * "name": Name or description of the subtitles
+                        * "http_headers": A dictionary of additional HTTP headers
+                          to add to the request.
+                    "ext" will be calculated from URL if missing
+    automatic_captions: Like 'subtitles'; contains automatically generated
+                    captions instead of normal subtitles
+    duration:       Length of the video in seconds, as an integer or float.
+    view_count:     How many users have watched the video on the platform.
+    concurrent_view_count: How many users are currently watching the video on the platform.
+    like_count:     Number of positive ratings of the video
+    dislike_count:  Number of negative ratings of the video
+    repost_count:   Number of reposts of the video
+    average_rating: Average rating given by users, the scale used depends on the webpage
+    comment_count:  Number of comments on the video
+    comments:       A list of comments, each with one or more of the following
+                    properties (at least one of "text" or "html" must be
+                    present; all other properties are optional):
+                        * "author" - human-readable name of the comment author
+                        * "author_id" - user ID of the comment author
+                        * "author_thumbnail" - The thumbnail of the comment author
+                        * "author_url" - The url to the comment author's page
+                        * "author_is_verified" - Whether the author is verified
+                          on the platform
+                        * "author_is_uploader" - Whether the comment is made by
+                          the video uploader
+                        * "id" - Comment ID
+                        * "html" - Comment as HTML
+                        * "text" - Plain text of the comment
+                        * "timestamp" - UNIX timestamp of comment
+                        * "parent" - ID of the comment this one is replying to.
+                          Set to "root" to indicate that this is a
+                          comment to the original video.
+                        * "like_count" - Number of positive ratings of the comment
+                        * "dislike_count" - Number of negative ratings of the comment
+                        * "is_favorited" - Whether the comment is marked as
+                          favorite by the video uploader
+                        * "is_pinned" - Whether the comment is pinned to
+                          the top of the comments
+    age_limit:      Age restriction for the video, as an integer (years)
+    webpage_url:    The URL to the video webpage, if given to yt-dlp it
+                    should allow getting the same result again. (It will be set
+                    by YoutubeDL if it's missing)
+    categories:     A list of categories that the video falls in, for example
+                    ["Sports", "Berlin"]
+    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
+    cast:           A list of the video cast
+    is_live:        True, False, or None (=unknown). Whether this video is a
+                    live stream that goes on instead of a fixed-length video.
+    was_live:       True, False, or None (=unknown). Whether this video was
+                    originally a live stream.
+    live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
+                    or 'post_live' (was live, but VOD is not yet processed)
+                    If absent, automatically set from is_live, was_live
+    start_time:     Time in seconds where the reproduction should start, as
+                    specified in the URL.
+    end_time:       Time in seconds where the reproduction should end, as
+                    specified in the URL.
+    chapters:       A list of dictionaries, with the following entries:
+                        * "start_time" - The start time of the chapter in seconds
+                        * "end_time" - The end time of the chapter in seconds
+                        * "title" (optional, string)
+    heatmap:        A list of dictionaries, with the following entries:
+                        * "start_time" - The start time of the data point in seconds
+                        * "end_time" - The end time of the data point in seconds
+                        * "value" - The normalized value of the data point
+                          (float between 0 and 1)
+    playable_in_embed: Whether this video is allowed to play in embedded
+                    players on other sites. Can be True (=always allowed),
+                    False (=never allowed), None (=unknown), or a string
+                    specifying the criteria for embedability; e.g. 'whitelist'
+    availability:   Under what condition the video is available. One of
+                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
+                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
+                    to set it
+    media_type:     The type of media as classified by the site, e.g. "episode", "clip", "trailer"
+    _old_archive_ids: A list of old archive ids needed for backward compatibility
+    _format_sort_fields: A list of fields to use for sorting formats
+    __post_extractor: A function to be called just before the metadata is
+                    written to either disk, logger or console. The function
+                    must return a dict which will be added to the info_dict.
+                    This is useful for additional information that is
+                    time-consuming to extract. Note that the fields thus
+                    extracted will not be available to the output template and
+                    match_filter. So, only "comments" and "comment_count" are
+                    currently allowed to be extracted via this method.
+
+    The following fields should only be used when the video belongs to some logical
+    chapter or section:
+
+    chapter:        Name or title of the chapter the video belongs to.
+    chapter_number: Number of the chapter the video belongs to, as an integer.
+    chapter_id:     Id of the chapter the video belongs to, as a unicode string.
+
+    The following fields should only be used when the video is an episode of some
+    series, programme or podcast:
+
+    series:         Title of the series or programme the video episode belongs to.
+    series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
+    season:         Title of the season the video episode belongs to.
+    season_number:  Number of the season the video episode belongs to, as an integer.
+    season_id:      Id of the season the video episode belongs to, as a unicode string.
+    episode:        Title of the video episode. Unlike the mandatory video title field,
+                    this field should denote the exact title of the video episode
+                    without any kind of decoration.
+    episode_number: Number of the video episode within a season, as an integer.
+    episode_id:     Id of the video episode, as a unicode string.
+
+    The following fields should only be used when the media is a track or a part of
+    a music album:
+
+    track:          Title of the track.
+    track_number:   Number of the track within an album or a disc, as an integer.
+    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
+                    as a unicode string.
+    artists:        List of artists of the track.
+    composers:      List of composers of the piece.
+    genres:         List of genres of the track.
+    album:          Title of the album the track belongs to.
+    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation").
+    album_artists:  List of all artists who appeared on the album.
+                    E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"].
+                    Useful for splits and compilations.
+    disc_number:    Number of the disc or other physical medium the track belongs to,
+                    as an integer.
+
+    The following fields should only be set for clips that should be cut from the original video:
+
+    section_start:  Start time of the section in seconds
+    section_end:    End time of the section in seconds
+
+    The following fields should only be set for storyboards:
+
+    rows:           Number of rows in each storyboard fragment, as an integer
+    columns:        Number of columns in each storyboard fragment, as an integer
+
+    The following fields are deprecated and should not be set by new code:
+
+    composer:       Use "composers" instead.
+                    Composer(s) of the piece, comma-separated.
+    artist:         Use "artists" instead.
+                    Artist(s) of the track, comma-separated.
+    genre:          Use "genres" instead.
+                    Genre(s) of the track, comma-separated.
+    album_artist:   Use "album_artists" instead.
+                    All artists who appeared on the album, comma-separated.
+    creator:        Use "creators" instead.
+                    The creator of the video.
+
+    Unless mentioned otherwise, the fields should be Unicode strings.
+
+    Unless mentioned otherwise, None is equivalent to absence of information.
+
+
+    _type "playlist" indicates multiple videos.
+    There must be a key "entries", which is a list, an iterable, or a PagedList
+    object, each element of which is a valid dictionary by this specification.
+
+    Additionally, playlists can have "id", "title", and any other relevant
+    attributes with the same semantics as videos (see above).
+
+    It can also have the following optional fields:
+
+    playlist_count: The total number of videos in a playlist. If not given,
+                    YoutubeDL tries to calculate it from "entries"
+
+
+    _type "multi_video" indicates that there are multiple videos that
+    form a single show, for example multiple acts of an opera or TV episode.
+    It must have an entries key like a playlist and contain all the keys
+    required for a video at the same time.
+
+
+    _type "url" indicates that the video must be extracted from another
+    location, possibly by a different extractor. Its only required key is:
+    "url" - the next URL to extract.
+    The key "ie_key" can be set to the class name (minus the trailing "IE",
+    e.g. "Youtube") if the extractor class is known in advance.
+    Additionally, the dictionary may have any properties of the resolved entity
+    known in advance, for example "title" if the title of the referred video is
+    known ahead of time.
+
+
+    _type "url_transparent" entities have the same specification as "url", but
+    indicate that the given additional information is more precise than the one
+    associated with the resolved URL.
+    This is useful when a site employs a video service that hosts the video and
+    its technical metadata, but that video service does not embed a useful
+    title, description etc.
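+
+    For illustration (the URL and extractor name below are made up), a
+    "url_transparent" result that overrides the title of the resolved video
+    might look like:
+
+        {
+            '_type': 'url_transparent',
+            'url': 'https://videoservice.example.com/embed/abc123',
+            'ie_key': 'VideoService',
+            'title': 'Title taken from the embedding page',
+        }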
+
+    Subclasses of this should also be added to the list of extractors and
+    should define _VALID_URL as a regexp or a Sequence of regexps, and
+    re-define the _real_extract() and (optionally) _real_initialize() methods.
+
+    Subclasses may also override suitable() if necessary, but ensure the function
+    signature is preserved and that this function imports everything it needs
+    (except other extractors), so that lazy_extractors works correctly.
+
+    Subclasses can define a list of _EMBED_REGEX, which will be searched for in
+    the HTML of Generic webpages. It may also override _extract_embed_urls
+    or _extract_from_webpage as necessary. While these are normally classmethods,
+    _extract_from_webpage is allowed to be an instance method.
+
+    _extract_from_webpage may raise self.StopExtraction() to stop further
+    processing of the webpage and obtain exclusive rights to it. This is useful
+    when the extractor cannot reliably be matched using just the URL,
+    e.g. invidious/peertube instances.
+
+    Embed-only extractors can be defined by setting _VALID_URL = False.
+
+    To support username + password (or netrc) login, the extractor must define a
+    _NETRC_MACHINE and re-define _perform_login(username, password) and
+    (optionally) _initialize_pre_login() methods. The _perform_login method will
+    be called between _initialize_pre_login and _real_initialize if credentials
+    are passed by the user. In cases where it is necessary to have the login
+    process as part of the extraction rather than initialization, _perform_login
+    can be left undefined.
+
+    The _GEO_BYPASS attribute may be set to False in order to disable
+    geo restriction bypass mechanisms for a particular extractor, though this
+    won't disable explicit geo restriction bypass based on the country code
+    provided with geo_bypass_country.
+
+    The _GEO_COUNTRIES attribute may contain a list of presumably geo
+    unrestricted countries for this extractor. One of these countries will be
+    used by the geo restriction bypass mechanism right away, provided the
+    mechanism is not disabled.
+
+    The _GEO_IP_BLOCKS attribute may contain a list of presumably geo
+    unrestricted IP blocks in CIDR notation for this extractor. One of these
+    IP blocks will be used by the geo restriction bypass mechanism similarly
+    to _GEO_COUNTRIES.
+
+    The _ENABLED attribute should be set to False for IEs that
+    are disabled by default and must be explicitly enabled.
+
+    The _WORKING attribute should be set to False for broken IEs
+    in order to warn the users and skip the tests.
+    """
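+
+    # For illustration only -- a minimal subclass might look like the sketch
+    # below (the site, URL pattern and media URL are made up; real extractors
+    # live in their own modules and are registered in the extractor list, and
+    # _html_extract_title is a helper defined further down in this class):
+    #
+    #     class ExampleIE(InfoExtractor):
+    #         _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'
+    #
+    #         def _real_extract(self, url):
+    #             video_id = self._match_id(url)
+    #             webpage = self._download_webpage(url, video_id)
+    #             return {
+    #                 'id': video_id,
+    #                 'title': self._html_extract_title(webpage),
+    #                 'url': f'https://example.com/media/{video_id}.mp4',
+    #             }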
+
+    _ready = False
+    _downloader = None
+    _x_forwarded_for_ip = None
+    _GEO_BYPASS = True
+    _GEO_COUNTRIES = None
+    _GEO_IP_BLOCKS = None
+    _WORKING = True
+    _ENABLED = True
+    _NETRC_MACHINE = None
+    IE_DESC = None
+    SEARCH_KEY = None
+    _VALID_URL = None
+    _EMBED_REGEX = []
+
+    def _login_hint(self, method=NO_DEFAULT, netrc=None):
+        password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
+        return {
+            None: '',
+            'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
+            'password': f'Use {password_hint}',
+            'cookies': (
+                'Use --cookies-from-browser or --cookies for the authentication. '
+                'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'),
+        }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
+
+    def __init__(self, downloader=None):
+        """Constructor. Receives an optional downloader (a YoutubeDL instance).
+        If a downloader is not passed during initialization,
+        it must be set using "set_downloader()" before "extract()" is called"""
+        self._ready = False
+        self._x_forwarded_for_ip = None
+        self._printed_messages = set()
+        self.set_downloader(downloader)
+
+    @classmethod
+    def _match_valid_url(cls, url):
+        if cls._VALID_URL is False:
+            return None
+        # This does not use has/getattr intentionally - we want to know whether
+        # we have cached the regexp for *this* class, whereas getattr would also
+        # match the superclass
+        if '_VALID_URL_RE' not in cls.__dict__:
+            cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
+        return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)
+
+    @classmethod
+    def suitable(cls, url):
+        """Receives a URL and returns True if suitable for this IE."""
+        # This function must import everything it needs (except other extractors),
+        # so that lazy_extractors works correctly
+        return cls._match_valid_url(url) is not None
+
+    @classmethod
+    def _match_id(cls, url):
+        return cls._match_valid_url(url).group('id')
+
+    @classmethod
+    def get_temp_id(cls, url):
+        try:
+            return cls._match_id(url)
+        except (IndexError, AttributeError):
+            return None
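+
+    # Illustrative note (the pattern below is made up): the matching helpers
+    # above rely on _VALID_URL declaring a named group called "id", e.g.:
+    #
+    #     _VALID_URL = r'https?://(?:www\.)?example\.com/v/(?P<id>[0-9a-z]+)'
+    #     # cls._match_id('https://example.com/v/abc123') == 'abc123'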
+
+    @classmethod
+    def working(cls):
+        """Getter method for _WORKING."""
+        return cls._WORKING
+
+    @classmethod
+    def supports_login(cls):
+        return bool(cls._NETRC_MACHINE)
+
+    def initialize(self):
+        """Initializes an instance (authentication, etc)."""
+        self._printed_messages = set()
+        self._initialize_geo_bypass({
+            'countries': self._GEO_COUNTRIES,
+            'ip_blocks': self._GEO_IP_BLOCKS,
+        })
+        if not self._ready:
+            self._initialize_pre_login()
+            if self.supports_login():
+                username, password = self._get_login_info()
+                if username:
+                    self._perform_login(username, password)
+            elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
+                self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
+            self._real_initialize()
+            self._ready = True
+
+    def _initialize_geo_bypass(self, geo_bypass_context):
+        """
+        Initialize the geo restriction bypass mechanism.
+
+        This method is used to initialize the geo bypass mechanism based on
+        faking the X-Forwarded-For HTTP header. A random country from the
+        provided country list is selected and a random IP belonging to this
+        country is generated. This IP will be passed as the X-Forwarded-For
+        HTTP header in all subsequent HTTP requests.
+
+        This method will be used for the initial geo bypass mechanism
+        initialization during instance initialization with _GEO_COUNTRIES
+        and _GEO_IP_BLOCKS.
+
+        You may also manually call it from the extractor's code if geo bypass
+        information is not available beforehand (e.g. obtained during
+        extraction) or due to some other reason. In this case, you should pass
+        this information in the geo bypass context passed as the first
+        argument. It may contain the following fields:
+
+        countries:  List of geo unrestricted countries (similar
+                    to _GEO_COUNTRIES)
+        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
+                    (similar to _GEO_IP_BLOCKS)
+
+        """
+        if not self._x_forwarded_for_ip:
+
+            # Geo bypass mechanism is explicitly disabled by user
+            if not self.get_param('geo_bypass', True):
+                return
+
+            if not geo_bypass_context:
+                geo_bypass_context = {}
+
+            # Backward compatibility: previously _initialize_geo_bypass
+            # expected a list of countries, some 3rd party code may still use
+            # it this way
+            if isinstance(geo_bypass_context, (list, tuple)):
+                geo_bypass_context = {
+                    'countries': geo_bypass_context,
+                }
+
+            # The whole point of geo bypass mechanism is to fake IP
+            # as X-Forwarded-For HTTP header based on some IP block or
+            # country code.
+
+            # Path 1: bypassing based on IP block in CIDR notation
+
+            # Explicit IP block specified by user, use it right away
+            # regardless of whether extractor is geo bypassable or not
+            ip_block = self.get_param('geo_bypass_ip_block', None)
+
+            # Otherwise use random IP block from geo bypass context but only
+            # if extractor is known as geo bypassable
+            if not ip_block:
+                ip_blocks = geo_bypass_context.get('ip_blocks')
+                if self._GEO_BYPASS and ip_blocks:
+                    ip_block = random.choice(ip_blocks)
+
+            if ip_block:
+                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
+                self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
+                return
+
+            # Path 2: bypassing based on country code
+
+            # Explicit country code specified by user, use it right away
+            # regardless of whether extractor is geo bypassable or not
+            country = self.get_param('geo_bypass_country', None)
+
+            # Otherwise use random country code from geo bypass context but
+            # only if extractor is known as geo bypassable
+            if not country:
+                countries = geo_bypass_context.get('countries')
+                if self._GEO_BYPASS and countries:
+                    country = random.choice(countries)
+
+            if country:
+                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
+                self._downloader.write_debug(
+                    f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
+
+    def extract(self, url):
+        """Extracts URL information and returns it in list of dicts."""
+        try:
+            for _ in range(2):
+                try:
+                    self.initialize()
+                    self.to_screen('Extracting URL: %s' % (
+                        url if self.get_param('verbose') else truncate_string(url, 100, 20)))
+                    ie_result = self._real_extract(url)
+                    if ie_result is None:
+                        return None
+                    if self._x_forwarded_for_ip:
+                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
+                    subtitles = ie_result.get('subtitles') or {}
+                    if 'no-live-chat' in self.get_param('compat_opts'):
+                        for lang in ('live_chat', 'comments', 'danmaku'):
+                            subtitles.pop(lang, None)
+                    return ie_result
+                except GeoRestrictedError as e:
+                    if self.__maybe_fake_ip_and_retry(e.countries):
+                        continue
+                    raise
+        except UnsupportedError:
+            raise
+        except ExtractorError as e:
+            e.video_id = e.video_id or self.get_temp_id(url)
+            e.ie = e.ie or self.IE_NAME
+            e.traceback = e.traceback or sys.exc_info()[2]
+            raise
+        except IncompleteRead as e:
+            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
+        except (KeyError, StopIteration) as e:
+            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
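+
+    # Illustrative note (country code made up): an extractor that detects geo
+    # restriction during extraction can raise GeoRestrictedError with the
+    # affected countries, so that extract() above can retry with a faked
+    # X-Forwarded-For IP via __maybe_fake_ip_and_retry() below, e.g.:
+    #
+    #     raise GeoRestrictedError(
+    #         'This video is only available in Germany', countries=['DE'])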
+
+    def __maybe_fake_ip_and_retry(self, countries):
+        if (not self.get_param('geo_bypass_country', None)
+                and self._GEO_BYPASS
+                and self.get_param('geo_bypass', True)
+                and not self._x_forwarded_for_ip
+                and countries):
+            country_code = random.choice(countries)
+            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
+            if self._x_forwarded_for_ip:
+                self.report_warning(
+                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
+                    % (self._x_forwarded_for_ip, country_code.upper()))
+                return True
+        return False
+
+    def set_downloader(self, downloader):
+        """Sets a YoutubeDL instance as the downloader for this IE."""
+        self._downloader = downloader
+
+    @property
+    def cache(self):
+        return self._downloader.cache
+
+    @property
+    def cookiejar(self):
+        return self._downloader.cookiejar
+
+    def _initialize_pre_login(self):
+        """ Initialization before login. Redefine in subclasses."""
+        pass
+
+    def _perform_login(self, username, password):
+        """ Login with username and password. Redefine in subclasses."""
+        pass
+
+    def _real_initialize(self):
+        """Real initialization process. Redefine in subclasses."""
+        pass
+
+    def _real_extract(self, url):
+        """Real extraction process. Redefine in subclasses."""
+        raise NotImplementedError('This method must be implemented by subclasses')
+
+    @classmethod
+    def ie_key(cls):
+        """A string for getting the InfoExtractor with get_info_extractor"""
+        return cls.__name__[:-2]
+
+    @classproperty
+    def IE_NAME(cls):
+        return cls.__name__[:-2]
+
+    @staticmethod
+    def __can_accept_status_code(err, expected_status):
+        assert isinstance(err, HTTPError)
+        if expected_status is None:
+            return False
+        elif callable(expected_status):
+            return expected_status(err.status) is True
+        else:
+            return err.status in variadic(expected_status)
+
+    def _create_request(self, url_or_request, data=None, headers=None, query=None):
+        if isinstance(url_or_request, urllib.request.Request):
+            self._downloader.deprecation_warning(
+                'Passing a urllib.request.Request to _create_request() is deprecated. '
+                'Use yt_dlp.networking.common.Request instead.')
+            url_or_request = urllib_req_to_req(url_or_request)
+        elif not isinstance(url_or_request, Request):
+            url_or_request = Request(url_or_request)
+
+        url_or_request.update(data=data, headers=headers, query=query)
+        return url_or_request
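+
+    # Illustrative sketch (URL and values made up): _create_request() accepts
+    # either a plain URL string or a Request object and merges the extra
+    # data/headers/query into it, e.g.:
+    #
+    #     req = self._create_request(
+    #         'https://example.com/api/video', query={'id': '4234987'},
+    #         headers={'Referer': 'https://example.com/'})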
+
+    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
+        """
+        Return the response handle.
+
+        See _download_webpage docstring for arguments specification.
+        """
+        if not self._downloader._first_webpage_request:
+            sleep_interval = self.get_param('sleep_interval_requests') or 0
+            if sleep_interval > 0:
+                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
+                time.sleep(sleep_interval)
+        else:
+            self._downloader._first_webpage_request = False
+
+        if note is None:
+            self.report_download_webpage(video_id)
+        elif note is not False:
+            if video_id is None:
+                self.to_screen(str(note))
+            else:
+                self.to_screen(f'{video_id}: {note}')
+
+        # Some sites check the X-Forwarded-For HTTP header in order to figure
+        # out the origin of the client behind a proxy. This allows bypassing
+        # geo restriction by faking this header's value to an IP that belongs
+        # to some geo unrestricted country. We will do so once we encounter
+        # any geo restriction error.
+        if self._x_forwarded_for_ip:
+            headers = (headers or {}).copy()
+            headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
+
+        try:
+            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
+        except network_exceptions as err:
+            if isinstance(err, HTTPError):
+                if self.__can_accept_status_code(err, expected_status):
+                    return err.response
+
+            if errnote is False:
+                return False
+            if errnote is None:
+                errnote = 'Unable to download webpage'
+
+            errmsg = f'{errnote}: {error_to_compat_str(err)}'
+            if fatal:
+                raise ExtractorError(errmsg, cause=err)
+            else:
+                self.report_warning(errmsg)
+                return False
+
+    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
+                                 encoding=None, data=None, headers={}, query={}, expected_status=None):
+        """
+        Return a tuple (page content as string, URL handle).
+
+        Arguments:
+        url_or_request -- plain text URL as a string or
+            a urllib.request.Request object
+        video_id -- Video/playlist/item identifier (string)
+
+        Keyword arguments:
+        note -- note printed before downloading (string)
+        errnote -- note printed in case of an error (string)
+        fatal -- flag denoting whether error should be considered fatal,
+            i.e. whether it should cause ExtractorError to be raised,
+            otherwise a warning will be reported and extraction continued
+        encoding -- encoding for a page content decoding, guessed automatically
+            when not explicitly specified
+        data -- POST data (bytes)
+        headers -- HTTP headers (dict)
+        query -- URL query (dict)
+        expected_status -- allows accepting failed HTTP requests (non-2xx
+            status code) by explicitly specifying a set of accepted status
+            codes. Can be any of the following entities:
+                - an integer type specifying an exact failed status code to
+                  accept
+                - a list or a tuple of integer types specifying a list of
+                  failed status codes to accept
+                - a callable accepting an actual failed status code and
+                  returning True if it should be accepted
+            Note that this argument does not affect success status codes (2xx)
+            which are always accepted.
+        """
+
+        # Strip hashes from the URL (#1038)
+        if isinstance(url_or_request, str):
+            url_or_request = url_or_request.partition('#')[0]
+
+        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
+        if urlh is False:
+            assert not fatal
+            return False
+        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
+        return (content, urlh)
+
+    @staticmethod
+    def _guess_encoding_from_content(content_type, webpage_bytes):
+        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
+        if m:
+            encoding = m.group(1)
+        else:
+            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
+                          webpage_bytes[:1024])
+            if m:
+                encoding = m.group(1).decode('ascii')
+            elif webpage_bytes.startswith(b'\xff\xfe'):
+                encoding = 'utf-16'
+            else:
+                encoding = 'utf-8'
+
+        return encoding
+
+    def __check_blocked(self, content):
+        first_block = content[:512]
+        if ('Access to this site is blocked' in content
+                and 'Websense' in first_block):
+            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
+            blocked_iframe = self._html_search_regex(
+                r'