summaryrefslogtreecommitdiffstats
path: root/yt_dlp
diff options
context:
space:
mode:
Diffstat (limited to 'yt_dlp')
-rw-r--r--yt_dlp/YoutubeDL.py70
-rw-r--r--yt_dlp/__init__.py42
-rw-r--r--yt_dlp/__pyinstaller/hook-yt_dlp.py6
-rw-r--r--yt_dlp/compat/__init__.py9
-rw-r--r--yt_dlp/cookies.py10
-rw-r--r--yt_dlp/dependencies/__init__.py4
-rw-r--r--yt_dlp/downloader/common.py11
-rw-r--r--yt_dlp/downloader/external.py4
-rw-r--r--yt_dlp/extractor/_extractors.py9
-rw-r--r--yt_dlp/extractor/afreecatv.py404
-rw-r--r--yt_dlp/extractor/ard.py4
-rw-r--r--yt_dlp/extractor/asobistage.py154
-rw-r--r--yt_dlp/extractor/atvat.py8
-rw-r--r--yt_dlp/extractor/aws.py4
-rw-r--r--yt_dlp/extractor/bibeltv.py4
-rw-r--r--yt_dlp/extractor/box.py37
-rw-r--r--yt_dlp/extractor/bundestag.py8
-rw-r--r--yt_dlp/extractor/cbc.py86
-rw-r--r--yt_dlp/extractor/cda.py6
-rw-r--r--yt_dlp/extractor/common.py69
-rw-r--r--yt_dlp/extractor/crunchyroll.py143
-rw-r--r--yt_dlp/extractor/dropbox.py4
-rw-r--r--yt_dlp/extractor/dtube.py4
-rw-r--r--yt_dlp/extractor/fathom.py54
-rw-r--r--yt_dlp/extractor/generic.py16
-rw-r--r--yt_dlp/extractor/gofile.py13
-rw-r--r--yt_dlp/extractor/goplay.py4
-rw-r--r--yt_dlp/extractor/imgur.py18
-rw-r--r--yt_dlp/extractor/jiosaavn.py179
-rw-r--r--yt_dlp/extractor/joqrag.py12
-rw-r--r--yt_dlp/extractor/kick.py32
-rw-r--r--yt_dlp/extractor/leeco.py4
-rw-r--r--yt_dlp/extractor/linkedin.py4
-rw-r--r--yt_dlp/extractor/loom.py461
-rw-r--r--yt_dlp/extractor/masters.py1
-rw-r--r--yt_dlp/extractor/medici.py182
-rw-r--r--yt_dlp/extractor/microsoftstream.py4
-rw-r--r--yt_dlp/extractor/mixch.py64
-rw-r--r--yt_dlp/extractor/motherless.py4
-rw-r--r--yt_dlp/extractor/naver.py4
-rw-r--r--yt_dlp/extractor/neteasemusic.py8
-rw-r--r--yt_dlp/extractor/nhk.py202
-rw-r--r--yt_dlp/extractor/niconico.py11
-rw-r--r--yt_dlp/extractor/panopto.py10
-rw-r--r--yt_dlp/extractor/patreon.py44
-rw-r--r--yt_dlp/extractor/polsatgo.py4
-rw-r--r--yt_dlp/extractor/pr0gramm.py6
-rw-r--r--yt_dlp/extractor/prosiebensat1.py10
-rw-r--r--yt_dlp/extractor/radiokapital.py14
-rw-r--r--yt_dlp/extractor/rokfin.py4
-rw-r--r--yt_dlp/extractor/sejmpl.py14
-rw-r--r--yt_dlp/extractor/sharepoint.py112
-rw-r--r--yt_dlp/extractor/sonyliv.py64
-rw-r--r--yt_dlp/extractor/soundcloud.py109
-rw-r--r--yt_dlp/extractor/telewebion.py11
-rw-r--r--yt_dlp/extractor/tenplay.py4
-rw-r--r--yt_dlp/extractor/thisoldhouse.py52
-rw-r--r--yt_dlp/extractor/tiktok.py172
-rw-r--r--yt_dlp/extractor/twitch.py10
-rw-r--r--yt_dlp/extractor/vk.py15
-rw-r--r--yt_dlp/extractor/vrt.py62
-rw-r--r--yt_dlp/extractor/wistia.py4
-rw-r--r--yt_dlp/extractor/xvideos.py34
-rw-r--r--yt_dlp/extractor/youtube.py32
-rw-r--r--yt_dlp/extractor/zattoo.py4
-rw-r--r--yt_dlp/networking/__init__.py7
-rw-r--r--yt_dlp/networking/_curlcffi.py221
-rw-r--r--yt_dlp/networking/_helper.py4
-rw-r--r--yt_dlp/networking/_requests.py7
-rw-r--r--yt_dlp/networking/_urllib.py6
-rw-r--r--yt_dlp/networking/_websockets.py38
-rw-r--r--yt_dlp/networking/common.py14
-rw-r--r--yt_dlp/networking/impersonate.py141
-rw-r--r--yt_dlp/options.py20
-rw-r--r--yt_dlp/update.py2
-rw-r--r--yt_dlp/utils/_utils.py129
-rw-r--r--yt_dlp/utils/traversal.py37
-rw-r--r--yt_dlp/version.py6
78 files changed, 2862 insertions, 938 deletions
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
index c34d97b..9f730d0 100644
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -1,7 +1,7 @@
import collections
import contextlib
import copy
-import datetime
+import datetime as dt
import errno
import fileinput
import http.cookiejar
@@ -25,7 +25,7 @@ import unicodedata
from .cache import Cache
from .compat import functools, urllib # isort: split
-from .compat import compat_os_name, compat_shlex_quote, urllib_req_to_req
+from .compat import compat_os_name, urllib_req_to_req
from .cookies import LenientSimpleCookie, load_cookies
from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name
from .downloader.rtmp import rtmpdump_version
@@ -42,6 +42,7 @@ from .networking.exceptions import (
SSLError,
network_exceptions,
)
+from .networking.impersonate import ImpersonateRequestHandler
from .plugins import directories as plugin_directories
from .postprocessor import _PLUGIN_CLASSES as plugin_pps
from .postprocessor import (
@@ -99,8 +100,8 @@ from .utils import (
SameFileError,
UnavailableVideoError,
UserNotLive,
+ YoutubeDLError,
age_restricted,
- args_to_str,
bug_reports_message,
date_from_str,
deprecation_warning,
@@ -139,11 +140,13 @@ from .utils import (
sanitize_filename,
sanitize_path,
sanitize_url,
+ shell_quote,
str_or_none,
strftime_or_none,
subtitles_filename,
supports_terminal_sequences,
system_identifier,
+ filesize_from_tbr,
timetuple_from_msec,
to_high_limit_path,
traverse_obj,
@@ -402,6 +405,8 @@ class YoutubeDL:
- "detect_or_warn": check whether we can do anything
about it, warn otherwise (default)
source_address: Client-side IP address to bind to.
+ impersonate: Client to impersonate for requests.
+ An ImpersonateTarget (from yt_dlp.networking.impersonate)
sleep_interval_requests: Number of seconds to sleep between requests
during extraction
sleep_interval: Number of seconds to sleep before each download when
@@ -476,7 +481,7 @@ class YoutubeDL:
nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize,
max_filesize, test, noresizebuffer, retries, file_access_retries, fragment_retries,
continuedl, xattr_set_filesize, hls_use_mpegts, http_chunk_size,
- external_downloader_args, concurrent_fragment_downloads.
+ external_downloader_args, concurrent_fragment_downloads, progress_delta.
The following options are used by the post processors:
ffmpeg_location: Location of the ffmpeg/avconv binary; either the path
@@ -713,6 +718,13 @@ class YoutubeDL:
for msg in self.params.get('_deprecation_warnings', []):
self.deprecated_feature(msg)
+ if impersonate_target := self.params.get('impersonate'):
+ if not self._impersonate_target_available(impersonate_target):
+ raise YoutubeDLError(
+ f'Impersonate target "{impersonate_target}" is not available. '
+ f'Use --list-impersonate-targets to see available targets. '
+ f'You may be missing dependencies required to support this target.')
+
if 'list-formats' in self.params['compat_opts']:
self.params['listformats_table'] = False
@@ -811,7 +823,7 @@ class YoutubeDL:
self.report_warning(
'Long argument string detected. '
'Use -- to separate parameters and URLs, like this:\n%s' %
- args_to_str(correct_argv))
+ shell_quote(correct_argv))
def add_info_extractor(self, ie):
"""Add an InfoExtractor object to the end of the list."""
@@ -1343,7 +1355,7 @@ class YoutubeDL:
value, fmt = escapeHTML(str(value)), str_fmt
elif fmt[-1] == 'q': # quoted
value = map(str, variadic(value) if '#' in flags else [value])
- value, fmt = ' '.join(map(compat_shlex_quote, value)), str_fmt
+ value, fmt = shell_quote(value, shell=True), str_fmt
elif fmt[-1] == 'B': # bytes
value = f'%{str_fmt}'.encode() % str(value).encode()
value, fmt = value.decode('utf-8', 'ignore'), 's'
@@ -2617,7 +2629,7 @@ class YoutubeDL:
# Working around out-of-range timestamp values (e.g. negative ones on Windows,
# see http://bugs.python.org/issue1646728)
with contextlib.suppress(ValueError, OverflowError, OSError):
- upload_date = datetime.datetime.fromtimestamp(info_dict[ts_key], datetime.timezone.utc)
+ upload_date = dt.datetime.fromtimestamp(info_dict[ts_key], dt.timezone.utc)
info_dict[date_key] = upload_date.strftime('%Y%m%d')
if not info_dict.get('release_year'):
@@ -2771,7 +2783,7 @@ class YoutubeDL:
get_from_start = not info_dict.get('is_live') or bool(self.params.get('live_from_start'))
if not get_from_start:
- info_dict['title'] += ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
+ info_dict['title'] += ' ' + dt.datetime.now().strftime('%Y-%m-%d %H:%M')
if info_dict.get('is_live') and formats:
formats = [f for f in formats if bool(f.get('is_from_start')) == get_from_start]
if get_from_start and not formats:
@@ -2802,6 +2814,9 @@ class YoutubeDL:
format['url'] = sanitize_url(format['url'])
if format.get('ext') is None:
format['ext'] = determine_ext(format['url']).lower()
+ if format['ext'] in ('aac', 'opus', 'mp3', 'flac', 'vorbis'):
+ if format.get('acodec') is None:
+ format['acodec'] = format['ext']
if format.get('protocol') is None:
format['protocol'] = determine_protocol(format)
if format.get('resolution') is None:
@@ -2812,9 +2827,8 @@ class YoutubeDL:
format['aspect_ratio'] = try_call(lambda: round(format['width'] / format['height'], 2))
# For fragmented formats, "tbr" is often max bitrate and not average
if (('manifest-filesize-approx' in self.params['compat_opts'] or not format.get('manifest_url'))
- and info_dict.get('duration') and format.get('tbr')
and not format.get('filesize') and not format.get('filesize_approx')):
- format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8))
+ format['filesize_approx'] = filesize_from_tbr(format.get('tbr'), info_dict.get('duration'))
format['http_headers'] = self._calc_headers(collections.ChainMap(format, info_dict), load_cookies=True)
# Safeguard against old/insecure infojson when using --load-info-json
@@ -3864,8 +3878,8 @@ class YoutubeDL:
delim, (
format_field(f, 'filesize', ' \t%s', func=format_bytes)
or format_field(f, 'filesize_approx', '≈\t%s', func=format_bytes)
- or format_field(try_call(lambda: format_bytes(int(info_dict['duration'] * f['tbr'] * (1024 / 8)))),
- None, self._format_out('~\t%s', self.Styles.SUPPRESS))),
+ or format_field(filesize_from_tbr(f.get('tbr'), info_dict.get('duration')), None,
+ self._format_out('~\t%s', self.Styles.SUPPRESS), func=format_bytes)),
format_field(f, 'tbr', '\t%dk', func=round),
shorten_protocol_name(f.get('protocol', '')),
delim,
@@ -4077,6 +4091,22 @@ class YoutubeDL:
handler = self._request_director.handlers['Urllib']
return handler._get_instance(cookiejar=self.cookiejar, proxies=self.proxies)
+ def _get_available_impersonate_targets(self):
+ # todo(future): make available as public API
+ return [
+ (target, rh.RH_NAME)
+ for rh in self._request_director.handlers.values()
+ if isinstance(rh, ImpersonateRequestHandler)
+ for target in rh.supported_targets
+ ]
+
+ def _impersonate_target_available(self, target):
+ # todo(future): make available as public API
+ return any(
+ rh.is_supported_target(target)
+ for rh in self._request_director.handlers.values()
+ if isinstance(rh, ImpersonateRequestHandler))
+
def urlopen(self, req):
""" Start an HTTP download """
if isinstance(req, str):
@@ -4108,9 +4138,13 @@ class YoutubeDL:
raise RequestError(
'file:// URLs are disabled by default in yt-dlp for security reasons. '
'Use --enable-file-urls to enable at your own risk.', cause=ue) from ue
- if 'unsupported proxy type: "https"' in ue.msg.lower():
+ if (
+ 'unsupported proxy type: "https"' in ue.msg.lower()
+ and 'requests' not in self._request_director.handlers
+ and 'curl_cffi' not in self._request_director.handlers
+ ):
raise RequestError(
- 'To use an HTTPS proxy for this request, one of the following dependencies needs to be installed: requests')
+ 'To use an HTTPS proxy for this request, one of the following dependencies needs to be installed: requests, curl_cffi')
elif (
re.match(r'unsupported url scheme: "wss?"', ue.msg.lower())
@@ -4120,6 +4154,13 @@ class YoutubeDL:
'This request requires WebSocket support. '
'Ensure one of the following dependencies are installed: websockets',
cause=ue) from ue
+
+ elif re.match(r'unsupported (?:extensions: impersonate|impersonate target)', ue.msg.lower()):
+ raise RequestError(
+ f'Impersonate target "{req.extensions["impersonate"]}" is not available.'
+ f' See --list-impersonate-targets for available targets.'
+ f' This request requires browser impersonation, however you may be missing dependencies'
+ f' required to support this target.')
raise
except SSLError as e:
if 'UNSAFE_LEGACY_RENEGOTIATION_DISABLED' in str(e):
@@ -4152,6 +4193,7 @@ class YoutubeDL:
'timeout': 'socket_timeout',
'legacy_ssl_support': 'legacyserverconnect',
'enable_file_urls': 'enable_file_urls',
+ 'impersonate': 'impersonate',
'client_cert': {
'client_certificate': 'client_certificate',
'client_certificate_key': 'client_certificate_key',
diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py
index aeea262..3d606bc 100644
--- a/yt_dlp/__init__.py
+++ b/yt_dlp/__init__.py
@@ -19,6 +19,7 @@ from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS
from .downloader.external import get_external_downloader
from .extractor import list_extractor_classes
from .extractor.adobepass import MSO_INFO
+from .networking.impersonate import ImpersonateTarget
from .options import parseOpts
from .postprocessor import (
FFmpegExtractAudioPP,
@@ -48,6 +49,7 @@ from .utils import (
float_or_none,
format_field,
int_or_none,
+ join_nonempty,
match_filter_func,
parse_bytes,
parse_duration,
@@ -388,6 +390,9 @@ def validate_options(opts):
f'Supported keyrings are: {", ".join(sorted(SUPPORTED_KEYRINGS))}')
opts.cookiesfrombrowser = (browser_name, profile, keyring, container)
+ if opts.impersonate is not None:
+ opts.impersonate = ImpersonateTarget.from_str(opts.impersonate.lower())
+
# MetadataParser
def metadataparser_actions(f):
if isinstance(f, str):
@@ -831,6 +836,7 @@ def parse_options(argv=None):
'noprogress': opts.quiet if opts.noprogress is None else opts.noprogress,
'progress_with_newline': opts.progress_with_newline,
'progress_template': opts.progress_template,
+ 'progress_delta': opts.progress_delta,
'playliststart': opts.playliststart,
'playlistend': opts.playlistend,
'playlistreverse': opts.playlist_reverse,
@@ -911,6 +917,7 @@ def parse_options(argv=None):
'postprocessors': postprocessors,
'fixup': opts.fixup,
'source_address': opts.source_address,
+ 'impersonate': opts.impersonate,
'call_home': opts.call_home,
'sleep_interval_requests': opts.sleep_interval_requests,
'sleep_interval': opts.sleep_interval,
@@ -980,6 +987,41 @@ def _real_main(argv=None):
traceback.print_exc()
ydl._download_retcode = 100
+ if opts.list_impersonate_targets:
+
+ known_targets = [
+ # List of simplified targets we know are supported,
+ # to help users know what dependencies may be required.
+ (ImpersonateTarget('chrome'), 'curl_cffi'),
+ (ImpersonateTarget('edge'), 'curl_cffi'),
+ (ImpersonateTarget('safari'), 'curl_cffi'),
+ ]
+
+ available_targets = ydl._get_available_impersonate_targets()
+
+ def make_row(target, handler):
+ return [
+ join_nonempty(target.client.title(), target.version, delim='-') or '-',
+ join_nonempty((target.os or "").title(), target.os_version, delim='-') or '-',
+ handler,
+ ]
+
+ rows = [make_row(target, handler) for target, handler in available_targets]
+
+ for known_target, known_handler in known_targets:
+ if not any(
+ known_target in target and handler == known_handler
+ for target, handler in available_targets
+ ):
+ rows.append([
+ ydl._format_out(text, ydl.Styles.SUPPRESS)
+ for text in make_row(known_target, f'{known_handler} (not available)')
+ ])
+
+ ydl.to_screen('[info] Available impersonate targets')
+ ydl.to_stdout(render_table(['Client', 'OS', 'Source'], rows, extra_gap=2, delim='-'))
+ return
+
if not actual_use:
if pre_process:
return ydl._download_retcode
diff --git a/yt_dlp/__pyinstaller/hook-yt_dlp.py b/yt_dlp/__pyinstaller/hook-yt_dlp.py
index 7c3dbfb..8e7f42f 100644
--- a/yt_dlp/__pyinstaller/hook-yt_dlp.py
+++ b/yt_dlp/__pyinstaller/hook-yt_dlp.py
@@ -1,6 +1,6 @@
import sys
-from PyInstaller.utils.hooks import collect_submodules
+from PyInstaller.utils.hooks import collect_submodules, collect_data_files
def pycryptodome_module():
@@ -25,10 +25,12 @@ def get_hidden_imports():
for module in ('websockets', 'requests', 'urllib3'):
yield from collect_submodules(module)
# These are auto-detected, but explicitly add them just in case
- yield from ('mutagen', 'brotli', 'certifi', 'secretstorage')
+ yield from ('mutagen', 'brotli', 'certifi', 'secretstorage', 'curl_cffi')
hiddenimports = list(get_hidden_imports())
print(f'Adding imports: {hiddenimports}')
excludedimports = ['youtube_dl', 'youtube_dlc', 'test', 'ytdlp_plugins', 'devscripts', 'bundle']
+
+datas = collect_data_files('curl_cffi', includes=['cacert.pem'])
diff --git a/yt_dlp/compat/__init__.py b/yt_dlp/compat/__init__.py
index 5ad5c70..d820ada 100644
--- a/yt_dlp/compat/__init__.py
+++ b/yt_dlp/compat/__init__.py
@@ -27,12 +27,9 @@ def compat_etree_fromstring(text):
compat_os_name = os._name if os.name == 'java' else os.name
-if compat_os_name == 'nt':
- def compat_shlex_quote(s):
- import re
- return s if re.match(r'^[-_\w./]+$', s) else s.replace('"', '""').join('""')
-else:
- from shlex import quote as compat_shlex_quote # noqa: F401
+def compat_shlex_quote(s):
+ from ..utils import shell_quote
+ return shell_quote(s)
def compat_ord(c):
diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py
index 28d174a..7b8d215 100644
--- a/yt_dlp/cookies.py
+++ b/yt_dlp/cookies.py
@@ -1,6 +1,7 @@
import base64
import collections
import contextlib
+import datetime as dt
import glob
import http.cookiejar
import http.cookies
@@ -15,7 +16,6 @@ import sys
import tempfile
import time
import urllib.request
-from datetime import datetime, timedelta, timezone
from enum import Enum, auto
from hashlib import pbkdf2_hmac
@@ -194,7 +194,11 @@ def _firefox_browser_dirs():
yield os.path.expanduser('~/Library/Application Support/Firefox/Profiles')
else:
- yield from map(os.path.expanduser, ('~/.mozilla/firefox', '~/snap/firefox/common/.mozilla/firefox'))
+ yield from map(os.path.expanduser, (
+ '~/.mozilla/firefox',
+ '~/snap/firefox/common/.mozilla/firefox',
+ '~/.var/app/org.mozilla.firefox/.mozilla/firefox',
+ ))
def _firefox_cookie_dbs(roots):
@@ -594,7 +598,7 @@ class DataParser:
def _mac_absolute_time_to_posix(timestamp):
- return int((datetime(2001, 1, 1, 0, 0, tzinfo=timezone.utc) + timedelta(seconds=timestamp)).timestamp())
+ return int((dt.datetime(2001, 1, 1, 0, 0, tzinfo=dt.timezone.utc) + dt.timedelta(seconds=timestamp)).timestamp())
def _parse_safari_cookies_header(data, logger):
diff --git a/yt_dlp/dependencies/__init__.py b/yt_dlp/dependencies/__init__.py
index 9e3f907..0d58da2 100644
--- a/yt_dlp/dependencies/__init__.py
+++ b/yt_dlp/dependencies/__init__.py
@@ -74,6 +74,10 @@ else:
if hasattr(xattr, 'set'): # pyxattr
xattr._yt_dlp__identifier = 'pyxattr'
+try:
+ import curl_cffi
+except ImportError:
+ curl_cffi = None
from . import Cryptodome
diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py
index b71d7ee..65a0d6f 100644
--- a/yt_dlp/downloader/common.py
+++ b/yt_dlp/downloader/common.py
@@ -4,6 +4,7 @@ import functools
import os
import random
import re
+import threading
import time
from ..minicurses import (
@@ -63,6 +64,7 @@ class FileDownloader:
min_filesize: Skip files smaller than this size
max_filesize: Skip files larger than this size
xattr_set_filesize: Set ytdl.filesize user xattribute with expected size.
+ progress_delta: The minimum time between progress output, in seconds
external_downloader_args: A dictionary of downloader keys (in lower case)
and a list of additional command-line arguments for the
executable. Use 'default' as the name for arguments to be
@@ -88,6 +90,9 @@ class FileDownloader:
self.params = params
self._prepare_multiline_status()
self.add_progress_hook(self.report_progress)
+ if self.params.get('progress_delta'):
+ self._progress_delta_lock = threading.Lock()
+ self._progress_delta_time = time.monotonic()
def _set_ydl(self, ydl):
self.ydl = ydl
@@ -366,6 +371,12 @@ class FileDownloader:
if s['status'] != 'downloading':
return
+ if update_delta := self.params.get('progress_delta'):
+ with self._progress_delta_lock:
+ if time.monotonic() < self._progress_delta_time:
+ return
+ self._progress_delta_time += update_delta
+
s.update({
'_eta_str': self.format_eta(s.get('eta')).strip(),
'_speed_str': self.format_speed(s.get('speed')),
diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py
index ce5eeb0..8b0b94e 100644
--- a/yt_dlp/downloader/external.py
+++ b/yt_dlp/downloader/external.py
@@ -491,7 +491,7 @@ class FFmpegFD(ExternalFD):
if not self.params.get('verbose'):
args += ['-hide_banner']
- args += traverse_obj(info_dict, ('downloader_options', 'ffmpeg_args'), default=[])
+ args += traverse_obj(info_dict, ('downloader_options', 'ffmpeg_args', ...))
# These exists only for compatibility. Extractors should use
# info_dict['downloader_options']['ffmpeg_args'] instead
@@ -615,6 +615,8 @@ class FFmpegFD(ExternalFD):
else:
args += ['-f', EXT_TO_OUT_FORMATS.get(ext, ext)]
+ args += traverse_obj(info_dict, ('downloader_options', 'ffmpeg_args_out', ...))
+
args += self._configuration_args(('_o1', '_o', ''))
args = [encodeArgument(opt) for opt in args]
diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index c753655..4203427 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -150,6 +150,7 @@ from .arte import (
)
from .arnes import ArnesIE
from .asobichannel import AsobiChannelIE, AsobiChannelTagURLIE
+from .asobistage import AsobiStageIE
from .atresplayer import AtresPlayerIE
from .atscaleconf import AtScaleConfEventIE
from .atvat import ATVAtIE
@@ -590,6 +591,7 @@ from .facebook import (
FacebookReelIE,
FacebookAdsIE,
)
+from .fathom import FathomIE
from .fancode import (
FancodeVodIE,
FancodeLiveIE
@@ -874,6 +876,7 @@ from .jeuxvideo import JeuxVideoIE
from .jiosaavn import (
JioSaavnSongIE,
JioSaavnAlbumIE,
+ JioSaavnPlaylistIE,
)
from .jove import JoveIE
from .joj import JojIE
@@ -989,6 +992,10 @@ from .lnkgo import (
LnkGoIE,
LnkIE,
)
+from .loom import (
+ LoomIE,
+ LoomFolderIE,
+)
from .lovehomeporn import LoveHomePornIE
from .lrt import (
LRTVODIE,
@@ -1750,6 +1757,7 @@ from .shahid import (
ShahidIE,
ShahidShowIE,
)
+from .sharepoint import SharePointIE
from .sharevideos import ShareVideosEmbedIE
from .sibnet import SibnetEmbedIE
from .shemaroome import ShemarooMeIE
@@ -2283,6 +2291,7 @@ from .vrt import (
VrtNUIE,
KetnetIE,
DagelijkseKostIE,
+ Radio1BeIE,
)
from .vtm import VTMIE
from .medialaan import MedialaanIE
diff --git a/yt_dlp/extractor/afreecatv.py b/yt_dlp/extractor/afreecatv.py
index 3d26d9c..2c33c90 100644
--- a/yt_dlp/extractor/afreecatv.py
+++ b/yt_dlp/extractor/afreecatv.py
@@ -1,25 +1,63 @@
import functools
-import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
OnDemandPagedList,
- date_from_str,
+ UserNotLive,
determine_ext,
+ filter_dict,
int_or_none,
- qualities,
- traverse_obj,
- unified_strdate,
unified_timestamp,
- update_url_query,
url_or_none,
urlencode_postdata,
- xpath_text,
)
+from ..utils.traversal import traverse_obj
-class AfreecaTVIE(InfoExtractor):
+class AfreecaTVBaseIE(InfoExtractor):
+ _NETRC_MACHINE = 'afreecatv'
+
+ def _perform_login(self, username, password):
+ login_form = {
+ 'szWork': 'login',
+ 'szType': 'json',
+ 'szUid': username,
+ 'szPassword': password,
+ 'isSaveId': 'false',
+ 'szScriptVar': 'oLoginRet',
+ 'szAction': '',
+ }
+
+ response = self._download_json(
+ 'https://login.afreecatv.com/app/LoginAction.php', None,
+ 'Logging in', data=urlencode_postdata(login_form))
+
+ _ERRORS = {
+ -4: 'Your account has been suspended due to a violation of our terms and policies.',
+ -5: 'https://member.afreecatv.com/app/user_delete_progress.php',
+ -6: 'https://login.afreecatv.com/membership/changeMember.php',
+ -8: "Hello! AfreecaTV here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.",
+ -9: 'https://member.afreecatv.com/app/pop_login_block.php',
+ -11: 'https://login.afreecatv.com/afreeca/second_login.php',
+ -12: 'https://member.afreecatv.com/app/user_security.php',
+ 0: 'The username does not exist or you have entered the wrong password.',
+ -1: 'The username does not exist or you have entered the wrong password.',
+ -3: 'You have entered your username/password incorrectly.',
+ -7: 'You cannot use your Global AfreecaTV account to access Korean AfreecaTV.',
+ -10: 'Sorry for the inconvenience. \nYour account has been blocked due to an unauthorized access. \nPlease contact our Help Center for assistance.',
+ -32008: 'You have failed to log in. Please contact our Help Center.',
+ }
+
+ result = int_or_none(response.get('RESULT'))
+ if result != 1:
+ error = _ERRORS.get(result, 'You have failed to log in.')
+ raise ExtractorError(
+ 'Unable to login: %s said: %s' % (self.IE_NAME, error),
+ expected=True)
+
+
+class AfreecaTVIE(AfreecaTVBaseIE):
IE_NAME = 'afreecatv'
IE_DESC = 'afreecatv.com'
_VALID_URL = r'''(?x)
@@ -34,7 +72,6 @@ class AfreecaTVIE(InfoExtractor):
)
(?P<id>\d+)
'''
- _NETRC_MACHINE = 'afreecatv'
_TESTS = [{
'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=',
'md5': 'f72c89fe7ecc14c1b5ce506c4996046e',
@@ -87,6 +124,7 @@ class AfreecaTVIE(InfoExtractor):
'uploader': '♥이슬이',
'uploader_id': 'dasl8121',
'upload_date': '20170411',
+ 'timestamp': 1491929865,
'duration': 213,
},
'params': {
@@ -120,219 +158,102 @@ class AfreecaTVIE(InfoExtractor):
'uploader_id': 'rlantnghks',
'uploader': '페이즈으',
'duration': 10840,
- 'thumbnail': 'http://videoimg.afreecatv.com/php/SnapshotLoad.php?rowKey=20230108_9FF5BEE1_244432674_1_r',
+ 'thumbnail': r're:https?://videoimg\.afreecatv\.com/.+',
'upload_date': '20230108',
+ 'timestamp': 1673218805,
'title': '젠지 페이즈',
},
'params': {
'skip_download': True,
},
+ }, {
+ # adult content
+ 'url': 'https://vod.afreecatv.com/player/70395877',
+ 'only_matching': True,
+ }, {
+ # subscribers only
+ 'url': 'https://vod.afreecatv.com/player/104647403',
+ 'only_matching': True,
+ }, {
+ # private
+ 'url': 'https://vod.afreecatv.com/player/81669846',
+ 'only_matching': True,
}]
- @staticmethod
- def parse_video_key(key):
- video_key = {}
- m = re.match(r'^(?P<upload_date>\d{8})_\w+_(?P<part>\d+)$', key)
- if m:
- video_key['upload_date'] = m.group('upload_date')
- video_key['part'] = int(m.group('part'))
- return video_key
-
- def _perform_login(self, username, password):
- login_form = {
- 'szWork': 'login',
- 'szType': 'json',
- 'szUid': username,
- 'szPassword': password,
- 'isSaveId': 'false',
- 'szScriptVar': 'oLoginRet',
- 'szAction': '',
- }
-
- response = self._download_json(
- 'https://login.afreecatv.com/app/LoginAction.php', None,
- 'Logging in', data=urlencode_postdata(login_form))
-
- _ERRORS = {
- -4: 'Your account has been suspended due to a violation of our terms and policies.',
- -5: 'https://member.afreecatv.com/app/user_delete_progress.php',
- -6: 'https://login.afreecatv.com/membership/changeMember.php',
- -8: "Hello! AfreecaTV here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.",
- -9: 'https://member.afreecatv.com/app/pop_login_block.php',
- -11: 'https://login.afreecatv.com/afreeca/second_login.php',
- -12: 'https://member.afreecatv.com/app/user_security.php',
- 0: 'The username does not exist or you have entered the wrong password.',
- -1: 'The username does not exist or you have entered the wrong password.',
- -3: 'You have entered your username/password incorrectly.',
- -7: 'You cannot use your Global AfreecaTV account to access Korean AfreecaTV.',
- -10: 'Sorry for the inconvenience. \nYour account has been blocked due to an unauthorized access. \nPlease contact our Help Center for assistance.',
- -32008: 'You have failed to log in. Please contact our Help Center.',
- }
-
- result = int_or_none(response.get('RESULT'))
- if result != 1:
- error = _ERRORS.get(result, 'You have failed to log in.')
- raise ExtractorError(
- 'Unable to login: %s said: %s' % (self.IE_NAME, error),
- expected=True)
-
def _real_extract(self, url):
video_id = self._match_id(url)
-
- partial_view = False
- adult_view = False
- for _ in range(2):
- data = self._download_json(
- 'https://api.m.afreecatv.com/station/video/a/view',
- video_id, headers={'Referer': url}, data=urlencode_postdata({
- 'nTitleNo': video_id,
- 'nApiLevel': 10,
- }))['data']
- if traverse_obj(data, ('code', {int})) == -6221:
- raise ExtractorError('The VOD does not exist', expected=True)
- query = {
+ data = self._download_json(
+ 'https://api.m.afreecatv.com/station/video/a/view', video_id,
+ headers={'Referer': url}, data=urlencode_postdata({
'nTitleNo': video_id,
- 'nStationNo': data['station_no'],
- 'nBbsNo': data['bbs_no'],
- }
- if partial_view:
- query['partialView'] = 'SKIP_ADULT'
- if adult_view:
- query['adultView'] = 'ADULT_VIEW'
- video_xml = self._download_xml(
- 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php',
- video_id, 'Downloading video info XML%s'
- % (' (skipping adult)' if partial_view else ''),
- video_id, headers={
- 'Referer': url,
- }, query=query)
-
- flag = xpath_text(video_xml, './track/flag', 'flag', default=None)
- if flag and flag == 'SUCCEED':
- break
- if flag == 'PARTIAL_ADULT':
- self.report_warning(
- 'In accordance with local laws and regulations, underage users are restricted from watching adult content. '
- 'Only content suitable for all ages will be downloaded. '
- 'Provide account credentials if you wish to download restricted content.')
- partial_view = True
- continue
- elif flag == 'ADULT':
- if not adult_view:
- adult_view = True
- continue
- error = 'Only users older than 19 are able to watch this video. Provide account credentials to download this content.'
- else:
- error = flag
- raise ExtractorError(
- '%s said: %s' % (self.IE_NAME, error), expected=True)
- else:
- raise ExtractorError('Unable to download video info')
-
- video_element = video_xml.findall('./track/video')[-1]
- if video_element is None or video_element.text is None:
- raise ExtractorError(
- 'Video %s does not exist' % video_id, expected=True)
-
- video_url = video_element.text.strip()
-
- title = xpath_text(video_xml, './track/title', 'title', fatal=True)
-
- uploader = xpath_text(video_xml, './track/nickname', 'uploader')
- uploader_id = xpath_text(video_xml, './track/bj_id', 'uploader id')
- duration = int_or_none(xpath_text(
- video_xml, './track/duration', 'duration'))
- thumbnail = xpath_text(video_xml, './track/titleImage', 'thumbnail')
-
- common_entry = {
- 'uploader': uploader,
- 'uploader_id': uploader_id,
- 'thumbnail': thumbnail,
- }
-
- info = common_entry.copy()
- info.update({
- 'id': video_id,
- 'title': title,
- 'duration': duration,
+ 'nApiLevel': 10,
+ }))['data']
+
+ error_code = traverse_obj(data, ('code', {int}))
+ if error_code == -6221:
+ raise ExtractorError('The VOD does not exist', expected=True)
+ elif error_code == -6205:
+ raise ExtractorError('This VOD is private', expected=True)
+
+ common_info = traverse_obj(data, {
+ 'title': ('title', {str}),
+ 'uploader': ('writer_nick', {str}),
+ 'uploader_id': ('bj_id', {str}),
+ 'duration': ('total_file_duration', {functools.partial(int_or_none, scale=1000)}),
+ 'thumbnail': ('thumb', {url_or_none}),
})
- if not video_url:
- entries = []
- file_elements = video_element.findall('./file')
- one = len(file_elements) == 1
- for file_num, file_element in enumerate(file_elements, start=1):
- file_url = url_or_none(file_element.text)
- if not file_url:
- continue
- key = file_element.get('key', '')
- upload_date = unified_strdate(self._search_regex(
- r'^(\d{8})_', key, 'upload date', default=None))
- if upload_date is not None:
- # sometimes the upload date isn't included in the file name
- # instead, another random ID is, which may parse as a valid
- # date but be wildly out of a reasonable range
- parsed_date = date_from_str(upload_date)
- if parsed_date.year < 2000 or parsed_date.year >= 2100:
- upload_date = None
- file_duration = int_or_none(file_element.get('duration'))
- format_id = key if key else '%s_%s' % (video_id, file_num)
- if determine_ext(file_url) == 'm3u8':
- formats = self._extract_m3u8_formats(
- file_url, video_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls',
- note='Downloading part %d m3u8 information' % file_num)
- else:
- formats = [{
- 'url': file_url,
- 'format_id': 'http',
- }]
- if not formats and not self.get_param('ignore_no_formats'):
- continue
- file_info = common_entry.copy()
- file_info.update({
- 'id': format_id,
- 'title': title if one else '%s (part %d)' % (title, file_num),
- 'upload_date': upload_date,
- 'duration': file_duration,
- 'formats': formats,
+ entries = []
+ for file_num, file_element in enumerate(
+ traverse_obj(data, ('files', lambda _, v: url_or_none(v['file']))), start=1):
+ file_url = file_element['file']
+ if determine_ext(file_url) == 'm3u8':
+ formats = self._extract_m3u8_formats(
+ file_url, video_id, 'mp4', m3u8_id='hls',
+ note=f'Downloading part {file_num} m3u8 information')
+ else:
+ formats = [{
+ 'url': file_url,
+ 'format_id': 'http',
+ }]
+
+ entries.append({
+ **common_info,
+ 'id': file_element.get('file_info_key') or f'{video_id}_{file_num}',
+ 'title': f'{common_info.get("title") or "Untitled"} (part {file_num})',
+ 'formats': formats,
+ **traverse_obj(file_element, {
+ 'duration': ('duration', {functools.partial(int_or_none, scale=1000)}),
+ 'timestamp': ('file_start', {unified_timestamp}),
})
- entries.append(file_info)
- entries_info = info.copy()
- entries_info.update({
- '_type': 'multi_video',
- 'entries': entries,
})
- return entries_info
-
- info = {
- 'id': video_id,
- 'title': title,
- 'uploader': uploader,
- 'uploader_id': uploader_id,
- 'duration': duration,
- 'thumbnail': thumbnail,
- }
- if determine_ext(video_url) == 'm3u8':
- info['formats'] = self._extract_m3u8_formats(
- video_url, video_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls')
- else:
- app, playpath = video_url.split('mp4:')
- info.update({
- 'url': app,
- 'ext': 'flv',
- 'play_path': 'mp4:' + playpath,
- 'rtmp_live': True, # downloading won't end without this
- })
+ if traverse_obj(data, ('adult_status', {str})) == 'notLogin':
+ if not entries:
+ self.raise_login_required(
+ 'Only users older than 19 are able to watch this video', method='password')
+ self.report_warning(
+ 'In accordance with local laws and regulations, underage users are '
+ 'restricted from watching adult content. Only content suitable for all '
+ f'ages will be downloaded. {self._login_hint("password")}')
+
+ if not entries and traverse_obj(data, ('sub_upload_type', {str})):
+ self.raise_login_required('This VOD is for subscribers only', method='password')
+
+ if len(entries) == 1:
+ return {
+ **entries[0],
+ 'title': common_info.get('title'),
+ }
- return info
+ common_info['timestamp'] = traverse_obj(entries, (..., 'timestamp'), get_all=False)
+ return self.playlist_result(entries, video_id, multi_video=True, **common_info)
-class AfreecaTVLiveIE(AfreecaTVIE): # XXX: Do not subclass from concrete IE
+class AfreecaTVLiveIE(AfreecaTVBaseIE):
IE_NAME = 'afreecatv:live'
+ IE_DESC = 'afreecatv.com livestreams'
_VALID_URL = r'https?://play\.afreeca(?:tv)?\.com/(?P<id>[^/]+)(?:/(?P<bno>\d+))?'
_TESTS = [{
'url': 'https://play.afreecatv.com/pyh3646/237852185',
@@ -347,77 +268,57 @@ class AfreecaTVLiveIE(AfreecaTVIE): # XXX: Do not subclass from concrete IE
},
'skip': 'Livestream has ended',
}, {
- 'url': 'http://play.afreeca.com/pyh3646/237852185',
+ 'url': 'https://play.afreecatv.com/pyh3646/237852185',
'only_matching': True,
}, {
- 'url': 'http://play.afreeca.com/pyh3646',
+ 'url': 'https://play.afreecatv.com/pyh3646',
'only_matching': True,
}]
_LIVE_API_URL = 'https://live.afreecatv.com/afreeca/player_live_api.php'
- _QUALITIES = ('sd', 'hd', 'hd2k', 'original')
-
def _real_extract(self, url):
broadcaster_id, broadcast_no = self._match_valid_url(url).group('id', 'bno')
- password = self.get_param('videopassword')
+ channel_info = traverse_obj(self._download_json(
+ self._LIVE_API_URL, broadcaster_id, data=urlencode_postdata({'bid': broadcaster_id})),
+ ('CHANNEL', {dict})) or {}
- info = self._download_json(self._LIVE_API_URL, broadcaster_id, fatal=False,
- data=urlencode_postdata({'bid': broadcaster_id})) or {}
- channel_info = info.get('CHANNEL') or {}
broadcaster_id = channel_info.get('BJID') or broadcaster_id
broadcast_no = channel_info.get('BNO') or broadcast_no
- password_protected = channel_info.get('BPWD')
if not broadcast_no:
- raise ExtractorError(f'Unable to extract broadcast number ({broadcaster_id} may not be live)', expected=True)
- if password_protected == 'Y' and password is None:
+ raise UserNotLive(video_id=broadcaster_id)
+
+ password = self.get_param('videopassword')
+ if channel_info.get('BPWD') == 'Y' and password is None:
raise ExtractorError(
'This livestream is protected by a password, use the --video-password option',
expected=True)
- formats = []
- quality_key = qualities(self._QUALITIES)
- for quality_str in self._QUALITIES:
- params = {
+ aid = self._download_json(
+ self._LIVE_API_URL, broadcast_no, 'Downloading access token for stream',
+ 'Unable to download access token for stream', data=urlencode_postdata(filter_dict({
'bno': broadcast_no,
'stream_type': 'common',
'type': 'aid',
- 'quality': quality_str,
- }
- if password is not None:
- params['pwd'] = password
- aid_response = self._download_json(
- self._LIVE_API_URL, broadcast_no, fatal=False,
- data=urlencode_postdata(params),
- note=f'Downloading access token for {quality_str} stream',
- errnote=f'Unable to download access token for {quality_str} stream')
- aid = traverse_obj(aid_response, ('CHANNEL', 'AID'))
- if not aid:
- continue
-
- stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.afreecatv.com'
- stream_info = self._download_json(
- f'{stream_base_url}/broad_stream_assign.html', broadcast_no, fatal=False,
- query={
- 'return_type': channel_info.get('CDN', 'gcp_cdn'),
- 'broad_key': f'{broadcast_no}-common-{quality_str}-hls',
- },
- note=f'Downloading metadata for {quality_str} stream',
- errnote=f'Unable to download metadata for {quality_str} stream') or {}
-
- if stream_info.get('view_url'):
- formats.append({
- 'format_id': quality_str,
- 'url': update_url_query(stream_info['view_url'], {'aid': aid}),
- 'ext': 'mp4',
- 'protocol': 'm3u8',
- 'quality': quality_key(quality_str),
- })
-
- station_info = self._download_json(
+ 'quality': 'master',
+ 'pwd': password,
+ })))['CHANNEL']['AID']
+
+ stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.afreecatv.com'
+ stream_info = self._download_json(f'{stream_base_url}/broad_stream_assign.html', broadcast_no, query={
+ # works: gs_cdn_pc_app, gs_cdn_mobile_web, gs_cdn_pc_web
+ 'return_type': 'gs_cdn_pc_app',
+ 'broad_key': f'{broadcast_no}-common-master-hls',
+ }, note='Downloading metadata for stream', errnote='Unable to download metadata for stream')
+
+ formats = self._extract_m3u8_formats(
+ stream_info['view_url'], broadcast_no, 'mp4', m3u8_id='hls',
+ query={'aid': aid}, headers={'Referer': url})
+
+ station_info = traverse_obj(self._download_json(
'https://st.afreecatv.com/api/get_station_status.php', broadcast_no,
- query={'szBjId': broadcaster_id}, fatal=False,
- note='Downloading channel metadata', errnote='Unable to download channel metadata') or {}
+ 'Downloading channel metadata', 'Unable to download channel metadata',
+ query={'szBjId': broadcaster_id}, fatal=False), {dict}) or {}
return {
'id': broadcast_no,
@@ -427,6 +328,7 @@ class AfreecaTVLiveIE(AfreecaTVIE): # XXX: Do not subclass from concrete IE
'timestamp': unified_timestamp(station_info.get('broad_start')),
'formats': formats,
'is_live': True,
+ 'http_headers': {'Referer': url},
}
diff --git a/yt_dlp/extractor/ard.py b/yt_dlp/extractor/ard.py
index 46e68d6..3db59c5 100644
--- a/yt_dlp/extractor/ard.py
+++ b/yt_dlp/extractor/ard.py
@@ -1,5 +1,5 @@
+import functools
import re
-from functools import partial
from .common import InfoExtractor
from ..utils import (
@@ -349,7 +349,7 @@ class ARDBetaMediathekIE(InfoExtractor):
r'(?P<title>.*)',
]
- return traverse_obj(patterns, (..., {partial(re.match, string=title)}, {
+ return traverse_obj(patterns, (..., {functools.partial(re.match, string=title)}, {
'season_number': ('season_number', {int_or_none}),
'episode_number': ('episode_number', {int_or_none}),
'episode': ((
diff --git a/yt_dlp/extractor/asobistage.py b/yt_dlp/extractor/asobistage.py
new file mode 100644
index 0000000..b088a1b
--- /dev/null
+++ b/yt_dlp/extractor/asobistage.py
@@ -0,0 +1,154 @@
+import functools
+
+from .common import InfoExtractor
+from ..utils import str_or_none, url_or_none
+from ..utils.traversal import traverse_obj
+
+
+class AsobiStageIE(InfoExtractor):
+ IE_DESC = 'ASOBISTAGE (アソビステージ)'
+ _VALID_URL = r'https?://asobistage\.asobistore\.jp/event/(?P<id>(?P<event>\w+)/(?P<type>archive|player)/(?P<slug>\w+))(?:[?#]|$)'
+ _TESTS = [{
+ 'url': 'https://asobistage.asobistore.jp/event/315passionhour_2022summer/archive/frame',
+ 'info_dict': {
+ 'id': '315passionhour_2022summer/archive/frame',
+ 'title': '315プロダクションプレゼンツ 315パッションアワー!!!',
+ 'thumbnail': r're:^https?://[\w.-]+/\w+/\w+',
+ },
+ 'playlist_count': 1,
+ 'playlist': [{
+ 'info_dict': {
+ 'id': 'edff52f2',
+ 'ext': 'mp4',
+ 'title': '315passion_FRAME_only',
+ 'thumbnail': r're:^https?://[\w.-]+/\w+/\w+',
+ },
+ }],
+ }, {
+ 'url': 'https://asobistage.asobistore.jp/event/idolmaster_idolworld2023_goods/archive/live',
+ 'info_dict': {
+ 'id': 'idolmaster_idolworld2023_goods/archive/live',
+ 'title': 'md5:378510b6e830129d505885908bd6c576',
+ 'thumbnail': r're:^https?://[\w.-]+/\w+/\w+',
+ },
+ 'playlist_count': 1,
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '3aef7110',
+ 'ext': 'mp4',
+ 'title': 'asobistore_station_1020_serverREC',
+ 'thumbnail': r're:^https?://[\w.-]+/\w+/\w+',
+ },
+ }],
+ }, {
+ 'url': 'https://asobistage.asobistore.jp/event/sidem_fclive_bpct/archive/premium_hc',
+ 'playlist_count': 4,
+ 'info_dict': {
+ 'id': 'sidem_fclive_bpct/archive/premium_hc',
+ 'title': '315 Production presents F@NTASTIC COMBINATION LIVE ~BRAINPOWER!!~/~CONNECTIME!!!!~',
+ 'thumbnail': r're:^https?://[\w.-]+/\w+/\w+',
+ },
+ }, {
+ 'url': 'https://asobistage.asobistore.jp/event/ijigenfes_utagassen/player/day1',
+ 'only_matching': True,
+ }]
+
+ _API_HOST = 'https://asobistage-api.asobistore.jp'
+ _HEADERS = {}
+ _is_logged_in = False
+
+ @functools.cached_property
+ def _owned_tickets(self):
+ owned_tickets = set()
+ if not self._is_logged_in:
+ return owned_tickets
+
+ for path, name in [
+ ('api/v1/purchase_history/list', 'ticket purchase history'),
+ ('api/v1/serialcode/list', 'redemption history'),
+ ]:
+ response = self._download_json(
+ f'{self._API_HOST}/{path}', None, f'Downloading {name}',
+ f'Unable to download {name}', expected_status=400)
+ if traverse_obj(response, ('payload', 'error_message'), 'error') == 'notlogin':
+ self._is_logged_in = False
+ break
+ owned_tickets.update(
+ traverse_obj(response, ('payload', 'value', ..., 'digital_product_id', {str_or_none})))
+
+ return owned_tickets
+
+ def _get_available_channel_id(self, channel):
+ channel_id = traverse_obj(channel, ('chennel_vspf_id', {str}))
+ if not channel_id:
+ return None
+ # if rights_type_id == 6, then 'No conditions (no login required - non-members are OK)'
+ if traverse_obj(channel, ('viewrights', lambda _, v: v['rights_type_id'] == 6)):
+ return channel_id
+ available_tickets = traverse_obj(channel, (
+ 'viewrights', ..., ('tickets', 'serialcodes'), ..., 'digital_product_id', {str_or_none}))
+ if not self._owned_tickets.intersection(available_tickets):
+ self.report_warning(
+ f'You are not a ticketholder for "{channel.get("channel_name") or channel_id}"')
+ return None
+ return channel_id
+
+ def _real_initialize(self):
+ if self._get_cookies(self._API_HOST):
+ self._is_logged_in = True
+ token = self._download_json(
+ f'{self._API_HOST}/api/v1/vspf/token', None, 'Getting token', 'Unable to get token')
+ self._HEADERS['Authorization'] = f'Bearer {token}'
+
+ def _real_extract(self, url):
+ video_id, event, type_, slug = self._match_valid_url(url).group('id', 'event', 'type', 'slug')
+ video_type = {'archive': 'archives', 'player': 'broadcasts'}[type_]
+ webpage = self._download_webpage(url, video_id)
+ event_data = traverse_obj(
+ self._search_nextjs_data(webpage, video_id, default='{}'),
+ ('props', 'pageProps', 'eventCMSData', {
+ 'title': ('event_name', {str}),
+ 'thumbnail': ('event_thumbnail_image', {url_or_none}),
+ }))
+
+ available_channels = traverse_obj(self._download_json(
+ f'https://asobistage.asobistore.jp/cdn/v101/events/{event}/{video_type}.json',
+ video_id, 'Getting channel list', 'Unable to get channel list'), (
+ video_type, lambda _, v: v['broadcast_slug'] == slug,
+ 'channels', lambda _, v: v['chennel_vspf_id'] != '00000'))
+
+ entries = []
+ for channel_id in traverse_obj(available_channels, (..., {self._get_available_channel_id})):
+ if video_type == 'archives':
+ channel_json = self._download_json(
+ f'https://survapi.channel.or.jp/proxy/v1/contents/{channel_id}/get_by_cuid', channel_id,
+ 'Getting archive channel info', 'Unable to get archive channel info', fatal=False,
+ headers=self._HEADERS)
+ channel_data = traverse_obj(channel_json, ('ex_content', {
+ 'm3u8_url': 'streaming_url',
+ 'title': 'title',
+ 'thumbnail': ('thumbnail', 'url'),
+ }))
+ else: # video_type == 'broadcasts'
+ channel_json = self._download_json(
+ f'https://survapi.channel.or.jp/ex/events/{channel_id}', channel_id,
+ 'Getting live channel info', 'Unable to get live channel info', fatal=False,
+ headers=self._HEADERS, query={'embed': 'channel'})
+ channel_data = traverse_obj(channel_json, ('data', {
+ 'm3u8_url': ('Channel', 'Custom_live_url'),
+ 'title': 'Name',
+ 'thumbnail': 'Poster_url',
+ }))
+
+ entries.append({
+ 'id': channel_id,
+ 'title': channel_data.get('title'),
+ 'formats': self._extract_m3u8_formats(channel_data.get('m3u8_url'), channel_id, fatal=False),
+ 'is_live': video_type == 'broadcasts',
+ 'thumbnail': url_or_none(channel_data.get('thumbnail')),
+ })
+
+ if not self._is_logged_in and not entries:
+ self.raise_login_required()
+
+ return self.playlist_result(entries, video_id, **event_data)
diff --git a/yt_dlp/extractor/atvat.py b/yt_dlp/extractor/atvat.py
index d6ed9e4..d60feba 100644
--- a/yt_dlp/extractor/atvat.py
+++ b/yt_dlp/extractor/atvat.py
@@ -1,4 +1,4 @@
-import datetime
+import datetime as dt
from .common import InfoExtractor
from ..utils import (
@@ -71,9 +71,9 @@ class ATVAtIE(InfoExtractor):
content_ids = [{'id': id, 'subclip_start': content['start'], 'subclip_end': content['end']}
for id, content in enumerate(contentResource)]
- time_of_request = datetime.datetime.now()
- not_before = time_of_request - datetime.timedelta(minutes=5)
- expire = time_of_request + datetime.timedelta(minutes=5)
+ time_of_request = dt.datetime.now()
+ not_before = time_of_request - dt.timedelta(minutes=5)
+ expire = time_of_request + dt.timedelta(minutes=5)
payload = {
'content_ids': {
content_id: content_ids,
diff --git a/yt_dlp/extractor/aws.py b/yt_dlp/extractor/aws.py
index c4741a6..4ebef92 100644
--- a/yt_dlp/extractor/aws.py
+++ b/yt_dlp/extractor/aws.py
@@ -1,4 +1,4 @@
-import datetime
+import datetime as dt
import hashlib
import hmac
@@ -12,7 +12,7 @@ class AWSIE(InfoExtractor): # XXX: Conventionally, base classes should end with
def _aws_execute_api(self, aws_dict, video_id, query=None):
query = query or {}
- amz_date = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%dT%H%M%SZ')
+ amz_date = dt.datetime.now(dt.timezone.utc).strftime('%Y%m%dT%H%M%SZ')
date = amz_date[:8]
headers = {
'Accept': 'application/json',
diff --git a/yt_dlp/extractor/bibeltv.py b/yt_dlp/extractor/bibeltv.py
index 34464da..666b51c 100644
--- a/yt_dlp/extractor/bibeltv.py
+++ b/yt_dlp/extractor/bibeltv.py
@@ -1,4 +1,4 @@
-from functools import partial
+import functools
from .common import InfoExtractor
from ..utils import (
@@ -50,7 +50,7 @@ class BibelTVBaseIE(InfoExtractor):
**traverse_obj(data, {
'title': 'title',
'description': 'description',
- 'duration': ('duration', {partial(int_or_none, scale=1000)}),
+ 'duration': ('duration', {functools.partial(int_or_none, scale=1000)}),
'timestamp': ('schedulingStart', {parse_iso8601}),
'season_number': 'seasonNumber',
'episode_number': 'episodeNumber',
diff --git a/yt_dlp/extractor/box.py b/yt_dlp/extractor/box.py
index 7281b3c..008c011 100644
--- a/yt_dlp/extractor/box.py
+++ b/yt_dlp/extractor/box.py
@@ -3,6 +3,7 @@ import urllib.parse
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
parse_iso8601,
update_url_query,
url_or_none,
@@ -11,8 +12,8 @@ from ..utils.traversal import traverse_obj
class BoxIE(InfoExtractor):
- _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/?#]+)/file/(?P<id>\d+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/?#]+)(?:/file/(?P<id>\d+))?'
+ _TESTS = [{
'url': 'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538',
'md5': '1f81b2fd3960f38a40a3b8823e5fcd43',
'info_dict': {
@@ -25,14 +26,36 @@ class BoxIE(InfoExtractor):
'uploader_id': '235196876',
},
'params': {'skip_download': 'dash fragment too small'},
- }
+ }, {
+ 'url': 'https://utexas.app.box.com/s/2x6vanv85fdl8j2eqlcxmv0gp1wvps6e',
+ 'info_dict': {
+ 'id': '787379022466',
+ 'ext': 'mp4',
+ 'title': 'Webinar recording: Take the Leap!.mp4',
+ 'uploader': 'Patricia Mosele',
+ 'timestamp': 1615824864,
+ 'upload_date': '20210315',
+ 'uploader_id': '239068974',
+ },
+ 'params': {'skip_download': 'dash fragment too small'},
+ }]
def _real_extract(self, url):
shared_name, file_id = self._match_valid_url(url).groups()
- webpage = self._download_webpage(url, file_id)
- request_token = self._parse_json(self._search_regex(
- r'Box\.config\s*=\s*({.+?});', webpage,
- 'Box config'), file_id)['requestToken']
+ webpage = self._download_webpage(url, file_id or shared_name)
+
+ if not file_id:
+ post_stream_data = self._search_json(
+ r'Box\.postStreamData\s*=', webpage, 'Box post-stream data', shared_name)
+ shared_item = traverse_obj(
+ post_stream_data, ('/app-api/enduserapp/shared-item', {dict})) or {}
+ if shared_item.get('itemType') != 'file':
+ raise ExtractorError('The requested resource is not a file', expected=True)
+
+ file_id = str(shared_item['itemID'])
+
+ request_token = self._search_json(
+ r'Box\.config\s*=', webpage, 'Box config', file_id)['requestToken']
access_token = self._download_json(
'https://app.box.com/app-api/enduserapp/elements/tokens', file_id,
'Downloading token JSON metadata',
diff --git a/yt_dlp/extractor/bundestag.py b/yt_dlp/extractor/bundestag.py
index 9fd7c7d..71f7726 100644
--- a/yt_dlp/extractor/bundestag.py
+++ b/yt_dlp/extractor/bundestag.py
@@ -1,5 +1,5 @@
+import functools
import re
-from functools import partial
from .common import InfoExtractor
from ..networking.exceptions import HTTPError
@@ -115,9 +115,9 @@ class BundestagIE(InfoExtractor):
note='Downloading metadata overlay', fatal=False,
), {
'title': (
- {partial(get_element_text_and_html_by_tag, 'h3')}, 0,
- {partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}),
- 'description': ({partial(get_element_text_and_html_by_tag, 'p')}, 0, {clean_html}),
+ {functools.partial(get_element_text_and_html_by_tag, 'h3')}, 0,
+ {functools.partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}),
+ 'description': ({functools.partial(get_element_text_and_html_by_tag, 'p')}, 0, {clean_html}),
}))
return result
diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py
index b5beb1e..ff320dd 100644
--- a/yt_dlp/extractor/cbc.py
+++ b/yt_dlp/extractor/cbc.py
@@ -151,7 +151,7 @@ class CBCIE(InfoExtractor):
class CBCPlayerIE(InfoExtractor):
IE_NAME = 'cbc.ca:player'
- _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>\d+)'
+ _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>(?:\d\.)?\d+)'
_TESTS = [{
'url': 'http://www.cbc.ca/player/play/2683190193',
'md5': '64d25f841ddf4ddb28a235338af32e2c',
@@ -166,8 +166,51 @@ class CBCPlayerIE(InfoExtractor):
},
'skip': 'Geo-restricted to Canada and no longer available',
}, {
+ 'url': 'http://www.cbc.ca/i/caffeine/syndicate/?mediaId=2657631896',
+ 'md5': 'e5e708c34ae6fca156aafe17c43e8b75',
+ 'info_dict': {
+ 'id': '2657631896',
+ 'ext': 'mp3',
+ 'title': 'CBC Montreal is organizing its first ever community hackathon!',
+ 'description': 'md5:dd3b692f0a139b0369943150bd1c46a9',
+ 'timestamp': 1425704400,
+ 'upload_date': '20150307',
+ 'uploader': 'CBCC-NEW',
+ 'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg',
+ 'chapters': [],
+ 'duration': 494.811,
+ 'categories': ['AudioMobile/All in a Weekend Montreal'],
+ 'tags': 'count:8',
+ 'location': 'Quebec',
+ 'series': 'All in a Weekend Montreal',
+ 'season': 'Season 2015',
+ 'season_number': 2015,
+ 'media_type': 'Excerpt',
+ },
+ }, {
+ 'url': 'http://www.cbc.ca/i/caffeine/syndicate/?mediaId=2164402062',
+ 'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6',
+ 'info_dict': {
+ 'id': '2164402062',
+ 'ext': 'mp4',
+ 'title': 'Cancer survivor four times over',
+ 'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.',
+ 'timestamp': 1320410746,
+ 'upload_date': '20111104',
+ 'uploader': 'CBCC-NEW',
+ 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg',
+ 'chapters': [],
+ 'duration': 186.867,
+ 'series': 'CBC News: Windsor at 6:00',
+ 'categories': ['News/Canada/Windsor'],
+ 'location': 'Windsor',
+ 'tags': ['cancer'],
+ 'creators': ['Allison Johnson'],
+ 'media_type': 'Excerpt',
+ },
+ }, {
# Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/
- 'url': 'http://www.cbc.ca/player/play/2657631896',
+ 'url': 'https://www.cbc.ca/player/play/1.2985700',
'md5': 'e5e708c34ae6fca156aafe17c43e8b75',
'info_dict': {
'id': '2657631896',
@@ -189,7 +232,7 @@ class CBCPlayerIE(InfoExtractor):
'media_type': 'Excerpt',
},
}, {
- 'url': 'http://www.cbc.ca/player/play/2164402062',
+ 'url': 'https://www.cbc.ca/player/play/1.1711287',
'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6',
'info_dict': {
'id': '2164402062',
@@ -206,38 +249,53 @@ class CBCPlayerIE(InfoExtractor):
'categories': ['News/Canada/Windsor'],
'location': 'Windsor',
'tags': ['cancer'],
- 'creator': 'Allison Johnson',
+ 'creators': ['Allison Johnson'],
'media_type': 'Excerpt',
},
}, {
# Has subtitles
# These broadcasts expire after ~1 month, can find new test URL here:
# https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast
- 'url': 'http://www.cbc.ca/player/play/2284799043667',
- 'md5': '9b49f0839e88b6ec0b01d840cf3d42b5',
+ 'url': 'https://www.cbc.ca/player/play/1.7159484',
+ 'md5': '6ed6cd0fc2ef568d2297ba68a763d455',
'info_dict': {
- 'id': '2284799043667',
+ 'id': '2324213316001',
'ext': 'mp4',
- 'title': 'The National | Hockey coach charged, Green grants, Safer drugs',
- 'description': 'md5:84ef46321c94bcf7d0159bb565d26bfa',
- 'timestamp': 1700272800,
- 'duration': 2718.833,
+ 'title': 'The National | School boards sue social media giants',
+ 'description': 'md5:4b4db69322fa32186c3ce426da07402c',
+ 'timestamp': 1711681200,
+ 'duration': 2743.400,
'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]},
- 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/907/171/thumbnail.jpeg',
+ 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/607/559/thumbnail.jpeg',
'uploader': 'CBCC-NEW',
'chapters': 'count:5',
- 'upload_date': '20231118',
+ 'upload_date': '20240329',
'categories': 'count:4',
'series': 'The National - Full Show',
'tags': 'count:1',
- 'creator': 'News',
+ 'creators': ['News'],
'location': 'Canada',
'media_type': 'Full Program',
},
+ }, {
+ 'url': 'cbcplayer:1.7159484',
+ 'only_matching': True,
+ }, {
+ 'url': 'cbcplayer:2164402062',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.cbc.ca/player/play/2657631896',
+ 'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
+ if '.' in video_id:
+ webpage = self._download_webpage(f'https://www.cbc.ca/player/play/{video_id}', video_id)
+ video_id = self._search_json(
+ r'window\.__INITIAL_STATE__\s*=', webpage,
+ 'initial state', video_id)['video']['currentClip']['mediaId']
+
return {
'_type': 'url_transparent',
'ie_key': 'ThePlatform',
diff --git a/yt_dlp/extractor/cda.py b/yt_dlp/extractor/cda.py
index 1157114..90b4d08 100644
--- a/yt_dlp/extractor/cda.py
+++ b/yt_dlp/extractor/cda.py
@@ -1,6 +1,6 @@
import base64
import codecs
-import datetime
+import datetime as dt
import hashlib
import hmac
import json
@@ -134,7 +134,7 @@ class CDAIE(InfoExtractor):
self._API_HEADERS['User-Agent'] = f'pl.cda 1.0 (version {app_version}; Android {android_version}; {phone_model})'
cached_bearer = self.cache.load(self._BEARER_CACHE, username) or {}
- if cached_bearer.get('valid_until', 0) > datetime.datetime.now().timestamp() + 5:
+ if cached_bearer.get('valid_until', 0) > dt.datetime.now().timestamp() + 5:
self._API_HEADERS['Authorization'] = f'Bearer {cached_bearer["token"]}'
return
@@ -154,7 +154,7 @@ class CDAIE(InfoExtractor):
})
self.cache.store(self._BEARER_CACHE, username, {
'token': token_res['access_token'],
- 'valid_until': token_res['expires_in'] + datetime.datetime.now().timestamp(),
+ 'valid_until': token_res['expires_in'] + dt.datetime.now().timestamp(),
})
self._API_HEADERS['Authorization'] = f'Bearer {token_res["access_token"]}'
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index e776cca..57bbf9b 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -37,6 +37,7 @@ from ..networking.exceptions import (
IncompleteRead,
network_exceptions,
)
+from ..networking.impersonate import ImpersonateTarget
from ..utils import (
IDENTITY,
JSON_LD_RE,
@@ -170,12 +171,12 @@ class InfoExtractor:
Automatically calculated from width and height
* dynamic_range The dynamic range of the video. One of:
"SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
- * tbr Average bitrate of audio and video in KBit/s
- * abr Average audio bitrate in KBit/s
+ * tbr Average bitrate of audio and video in kbps (1000 bits/sec)
+ * abr Average audio bitrate in kbps (1000 bits/sec)
* acodec Name of the audio codec in use
* asr Audio sampling rate in Hertz
* audio_channels Number of audio channels
- * vbr Average video bitrate in KBit/s
+ * vbr Average video bitrate in kbps (1000 bits/sec)
* fps Frame rate
* vcodec Name of the video codec in use
* container Name of the container format
@@ -246,7 +247,8 @@ class InfoExtractor:
* downloader_options A dictionary of downloader options
(For internal use only)
* http_chunk_size Chunk size for HTTP downloads
- * ffmpeg_args Extra arguments for ffmpeg downloader
+ * ffmpeg_args Extra arguments for ffmpeg downloader (input)
+ * ffmpeg_args_out Extra arguments for ffmpeg downloader (output)
* is_dash_periods Whether the format is a result of merging
multiple DASH periods.
RTMP formats can also have the additional fields: page_url,
@@ -817,7 +819,7 @@ class InfoExtractor:
else:
return err.status in variadic(expected_status)
- def _create_request(self, url_or_request, data=None, headers=None, query=None):
+ def _create_request(self, url_or_request, data=None, headers=None, query=None, extensions=None):
if isinstance(url_or_request, urllib.request.Request):
self._downloader.deprecation_warning(
'Passing a urllib.request.Request to _create_request() is deprecated. '
@@ -826,10 +828,11 @@ class InfoExtractor:
elif not isinstance(url_or_request, Request):
url_or_request = Request(url_or_request)
- url_or_request.update(data=data, headers=headers, query=query)
+ url_or_request.update(data=data, headers=headers, query=query, extensions=extensions)
return url_or_request
- def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
+ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None,
+ headers=None, query=None, expected_status=None, impersonate=None, require_impersonation=False):
"""
Return the response handle.
@@ -860,8 +863,31 @@ class InfoExtractor:
headers = (headers or {}).copy()
headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
+ extensions = {}
+
+ if impersonate in (True, ''):
+ impersonate = ImpersonateTarget()
+ requested_targets = [
+ t if isinstance(t, ImpersonateTarget) else ImpersonateTarget.from_str(t)
+ for t in variadic(impersonate)
+ ] if impersonate else []
+
+ available_target = next(filter(self._downloader._impersonate_target_available, requested_targets), None)
+ if available_target:
+ extensions['impersonate'] = available_target
+ elif requested_targets:
+ message = 'The extractor is attempting impersonation, but '
+ message += (
+ 'no impersonate target is available' if not str(impersonate)
+ else f'none of these impersonate targets are available: "{", ".join(map(str, requested_targets))}"')
+ info_msg = ('see https://github.com/yt-dlp/yt-dlp#impersonation '
+ 'for information on installing the required dependencies')
+ if require_impersonation:
+ raise ExtractorError(f'{message}; {info_msg}', expected=True)
+ self.report_warning(f'{message}; if you encounter errors, then {info_msg}', only_once=True)
+
try:
- return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
+ return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query, extensions))
except network_exceptions as err:
if isinstance(err, HTTPError):
if self.__can_accept_status_code(err, expected_status):
@@ -880,13 +906,14 @@ class InfoExtractor:
return False
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
- encoding=None, data=None, headers={}, query={}, expected_status=None):
+ encoding=None, data=None, headers={}, query={}, expected_status=None,
+ impersonate=None, require_impersonation=False):
"""
Return a tuple (page content as string, URL handle).
Arguments:
url_or_request -- plain text URL as a string or
- a urllib.request.Request object
+ a yt_dlp.networking.Request object
video_id -- Video/playlist/item identifier (string)
Keyword arguments:
@@ -911,13 +938,22 @@ class InfoExtractor:
returning True if it should be accepted
Note that this argument does not affect success status codes (2xx)
which are always accepted.
+ impersonate -- the impersonate target. Can be any of the following entities:
+ - an instance of yt_dlp.networking.impersonate.ImpersonateTarget
+ - a string in the format of CLIENT[:OS]
+ - a list or a tuple of CLIENT[:OS] strings or ImpersonateTarget instances
+ - a boolean value; True means any impersonate target is sufficient
+ require_impersonation -- flag to toggle whether the request should raise an error
+ if impersonation is not possible (bool, default: False)
"""
# Strip hashes from the URL (#1038)
if isinstance(url_or_request, str):
url_or_request = url_or_request.partition('#')[0]
- urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
+ urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data,
+ headers=headers, query=query, expected_status=expected_status,
+ impersonate=impersonate, require_impersonation=require_impersonation)
if urlh is False:
assert not fatal
return False
@@ -1046,17 +1082,20 @@ class InfoExtractor:
return getattr(ie, parser)(content, *args, **kwargs)
def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
- fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
+ fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
+ impersonate=None, require_impersonation=False):
res = self._download_webpage_handle(
url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
- data=data, headers=headers, query=query, expected_status=expected_status)
+ data=data, headers=headers, query=query, expected_status=expected_status,
+ impersonate=impersonate, require_impersonation=require_impersonation)
if res is False:
return res
content, urlh = res
return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh
def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
- fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
+ fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
+ impersonate=None, require_impersonation=False):
if self.get_param('load_pages'):
url_or_request = self._create_request(url_or_request, data, headers, query)
filename = self._request_dump_filename(url_or_request.url, video_id)
@@ -1079,6 +1118,8 @@ class InfoExtractor:
'headers': headers,
'query': query,
'expected_status': expected_status,
+ 'impersonate': impersonate,
+ 'require_impersonation': require_impersonation,
}
if parser is None:
kwargs.pop('transform_source')
diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py
index 8d997de..118b575 100644
--- a/yt_dlp/extractor/crunchyroll.py
+++ b/yt_dlp/extractor/crunchyroll.py
@@ -1,4 +1,5 @@
import base64
+import uuid
from .common import InfoExtractor
from ..networking.exceptions import HTTPError
@@ -7,12 +8,11 @@ from ..utils import (
float_or_none,
format_field,
int_or_none,
- join_nonempty,
+ jwt_decode_hs256,
parse_age_limit,
parse_count,
parse_iso8601,
qualities,
- remove_start,
time_seconds,
traverse_obj,
url_or_none,
@@ -27,6 +27,7 @@ class CrunchyrollBaseIE(InfoExtractor):
_AUTH_HEADERS = None
_API_ENDPOINT = None
_BASIC_AUTH = None
+ _IS_PREMIUM = None
_CLIENT_ID = ('cr_web', 'noaihdevm_6iyg0a8l0q')
_LOCALE_LOOKUP = {
'ar': 'ar-SA',
@@ -84,11 +85,16 @@ class CrunchyrollBaseIE(InfoExtractor):
self.write_debug(f'Using cxApiParam={cx_api_param}')
CrunchyrollBaseIE._BASIC_AUTH = 'Basic ' + base64.b64encode(f'{cx_api_param}:'.encode()).decode()
- grant_type = 'etp_rt_cookie' if self.is_logged_in else 'client_id'
+ auth_headers = {'Authorization': CrunchyrollBaseIE._BASIC_AUTH}
+ if self.is_logged_in:
+ grant_type = 'etp_rt_cookie'
+ else:
+ grant_type = 'client_id'
+ auth_headers['ETP-Anonymous-ID'] = uuid.uuid4()
try:
auth_response = self._download_json(
f'{self._BASE_URL}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}',
- headers={'Authorization': CrunchyrollBaseIE._BASIC_AUTH}, data=f'grant_type={grant_type}'.encode())
+ headers=auth_headers, data=f'grant_type={grant_type}'.encode())
except ExtractorError as error:
if isinstance(error.cause, HTTPError) and error.cause.status == 403:
raise ExtractorError(
@@ -97,6 +103,7 @@ class CrunchyrollBaseIE(InfoExtractor):
'and your browser\'s User-Agent (with --user-agent)', expected=True)
raise
+ CrunchyrollBaseIE._IS_PREMIUM = 'cr_premium' in traverse_obj(auth_response, ('access_token', {jwt_decode_hs256}, 'benefits', ...))
CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']}
CrunchyrollBaseIE._AUTH_REFRESH = time_seconds(seconds=traverse_obj(auth_response, ('expires_in', {float_or_none}), default=300) - 10)
@@ -135,62 +142,72 @@ class CrunchyrollBaseIE(InfoExtractor):
raise ExtractorError(f'Unexpected response when downloading {note} JSON')
return result
- def _extract_formats(self, stream_response, display_id=None):
- requested_formats = self._configuration_arg('format') or ['adaptive_hls']
- available_formats = {}
- for stream_type, streams in traverse_obj(
- stream_response, (('streams', ('data', 0)), {dict.items}, ...)):
- if stream_type not in requested_formats:
+ def _extract_chapters(self, internal_id):
+ # if no skip events are available, a 403 xml error is returned
+ skip_events = self._download_json(
+ f'https://static.crunchyroll.com/skip-events/production/{internal_id}.json',
+ internal_id, note='Downloading chapter info', fatal=False, errnote=False)
+ if not skip_events:
+ return None
+
+ chapters = []
+ for event in ('recap', 'intro', 'credits', 'preview'):
+ start = traverse_obj(skip_events, (event, 'start', {float_or_none}))
+ end = traverse_obj(skip_events, (event, 'end', {float_or_none}))
+ # some chapters have no start and/or ending time, they will just be ignored
+ if start is None or end is None:
continue
- for stream in traverse_obj(streams, lambda _, v: v['url']):
- hardsub_lang = stream.get('hardsub_locale') or ''
- format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s'))
- available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url'])
+ chapters.append({'title': event.capitalize(), 'start_time': start, 'end_time': end})
+
+ return chapters
+
+ def _extract_stream(self, identifier, display_id=None):
+ if not display_id:
+ display_id = identifier
+
+ self._update_auth()
+ stream_response = self._download_json(
+ f'https://cr-play-service.prd.crunchyrollsvc.com/v1/{identifier}/console/switch/play',
+ display_id, note='Downloading stream info', headers=CrunchyrollBaseIE._AUTH_HEADERS)
+
+ available_formats = {'': ('', '', stream_response['url'])}
+ for hardsub_lang, stream in traverse_obj(stream_response, ('hardSubs', {dict.items}, lambda _, v: v[1]['url'])):
+ available_formats[hardsub_lang] = (f'hardsub-{hardsub_lang}', hardsub_lang, stream['url'])
requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])]
- if '' in available_formats and 'all' not in requested_hardsubs:
+ hardsub_langs = [lang for lang in available_formats if lang]
+ if hardsub_langs and 'all' not in requested_hardsubs:
full_format_langs = set(requested_hardsubs)
+ self.to_screen(f'Available hardsub languages: {", ".join(hardsub_langs)}')
self.to_screen(
- 'To get all formats of a hardsub language, use '
+ 'To extract formats of a hardsub language, use '
'"--extractor-args crunchyrollbeta:hardsub=<language_code or all>". '
'See https://github.com/yt-dlp/yt-dlp#crunchyrollbeta-crunchyroll for more info',
only_once=True)
else:
full_format_langs = set(map(str.lower, available_formats))
- audio_locale = traverse_obj(stream_response, ((None, 'meta'), 'audio_locale'), get_all=False)
+ audio_locale = traverse_obj(stream_response, ('audioLocale', {str}))
hardsub_preference = qualities(requested_hardsubs[::-1])
- formats = []
- for stream_type, format_id, hardsub_lang, stream_url in available_formats.values():
- if stream_type.endswith('hls'):
- if hardsub_lang.lower() in full_format_langs:
- adaptive_formats = self._extract_m3u8_formats(
- stream_url, display_id, 'mp4', m3u8_id=format_id,
- fatal=False, note=f'Downloading {format_id} HLS manifest')
- else:
- adaptive_formats = (self._m3u8_meta_format(stream_url, ext='mp4', m3u8_id=format_id),)
- elif stream_type.endswith('dash'):
- adaptive_formats = self._extract_mpd_formats(
- stream_url, display_id, mpd_id=format_id,
- fatal=False, note=f'Downloading {format_id} MPD manifest')
+ formats, subtitles = [], {}
+ for format_id, hardsub_lang, stream_url in available_formats.values():
+ if hardsub_lang.lower() in full_format_langs:
+ adaptive_formats, dash_subs = self._extract_mpd_formats_and_subtitles(
+ stream_url, display_id, mpd_id=format_id, headers=CrunchyrollBaseIE._AUTH_HEADERS,
+ fatal=False, note=f'Downloading {f"{format_id} " if hardsub_lang else ""}MPD manifest')
+ self._merge_subtitles(dash_subs, target=subtitles)
else:
- self.report_warning(f'Encountered unknown stream_type: {stream_type!r}', display_id, only_once=True)
- continue
+ continue # XXX: Update this if/when meta mpd formats are working
for f in adaptive_formats:
if f.get('acodec') != 'none':
f['language'] = audio_locale
f['quality'] = hardsub_preference(hardsub_lang.lower())
formats.extend(adaptive_formats)
- return formats
-
- def _extract_subtitles(self, data):
- subtitles = {}
+ for locale, subtitle in traverse_obj(stream_response, (('subtitles', 'captions'), {dict.items}, ...)):
+ subtitles.setdefault(locale, []).append(traverse_obj(subtitle, {'url': 'url', 'ext': 'format'}))
- for locale, subtitle in traverse_obj(data, ((None, 'meta'), 'subtitles', {dict.items}, ...)):
- subtitles[locale] = [traverse_obj(subtitle, {'url': 'url', 'ext': 'format'})]
-
- return subtitles
+ return formats, subtitles
class CrunchyrollCmsBaseIE(CrunchyrollBaseIE):
@@ -245,7 +262,11 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE):
'like_count': int,
'dislike_count': int,
},
- 'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'},
+ 'params': {
+ 'skip_download': 'm3u8',
+ 'extractor_args': {'crunchyrollbeta': {'hardsub': ['de-DE']}},
+ 'format': 'bv[format_id~=hardsub]',
+ },
}, {
# Premium only
'url': 'https://www.crunchyroll.com/watch/GYE5WKQGR',
@@ -306,6 +327,7 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE):
'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
},
'params': {'skip_download': 'm3u8'},
+ 'skip': 'no longer exists',
}, {
'url': 'https://www.crunchyroll.com/watch/G62PEZ2E6',
'info_dict': {
@@ -359,31 +381,15 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE):
else:
raise ExtractorError(f'Unknown object type {object_type}')
- # There might be multiple audio languages for one object (`<object>_metadata.versions`),
- # so we need to get the id from `streams_link` instead or we dont know which language to choose
- streams_link = response.get('streams_link')
- if not streams_link and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')):
+ if not self._IS_PREMIUM and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')):
message = f'This {object_type} is for premium members only'
if self.is_logged_in:
raise ExtractorError(message, expected=True)
self.raise_login_required(message)
- # We need go from unsigned to signed api to avoid getting soft banned
- stream_response = self._call_cms_api_signed(remove_start(
- streams_link, '/content/v2/cms/'), internal_id, lang, 'stream info')
- result['formats'] = self._extract_formats(stream_response, internal_id)
- result['subtitles'] = self._extract_subtitles(stream_response)
+ result['formats'], result['subtitles'] = self._extract_stream(internal_id)
- # if no intro chapter is available, a 403 without usable data is returned
- intro_chapter = self._download_json(
- f'https://static.crunchyroll.com/datalab-intro-v2/{internal_id}.json',
- internal_id, note='Downloading chapter info', fatal=False, errnote=False)
- if isinstance(intro_chapter, dict):
- result['chapters'] = [{
- 'title': 'Intro',
- 'start_time': float_or_none(intro_chapter.get('startTime')),
- 'end_time': float_or_none(intro_chapter.get('endTime')),
- }]
+ result['chapters'] = self._extract_chapters(internal_id)
def calculate_count(item):
return parse_count(''.join((item['displayed'], item.get('unit') or '')))
@@ -512,7 +518,7 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE):
'display_id': 'egaono-hana',
'title': 'Egaono Hana',
'track': 'Egaono Hana',
- 'artist': 'Goose house',
+ 'artists': ['Goose house'],
'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
'genres': ['J-Pop'],
},
@@ -525,11 +531,12 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE):
'display_id': 'crossing-field',
'title': 'Crossing Field',
'track': 'Crossing Field',
- 'artist': 'LiSA',
+ 'artists': ['LiSA'],
'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
'genres': ['Anime'],
},
'params': {'skip_download': 'm3u8'},
+ 'skip': 'no longer exists',
}, {
'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135',
'info_dict': {
@@ -538,7 +545,7 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE):
'display_id': 'live-is-smile-always-364joker-at-yokohama-arena',
'title': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA',
'track': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA',
- 'artist': 'LiSA',
+ 'artists': ['LiSA'],
'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$',
'description': 'md5:747444e7e6300907b7a43f0a0503072e',
'genres': ['J-Pop'],
@@ -566,16 +573,14 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE):
if not response:
raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True)
- streams_link = response.get('streams_link')
- if not streams_link and response.get('isPremiumOnly'):
+ if not self._IS_PREMIUM and response.get('isPremiumOnly'):
message = f'This {response.get("type") or "media"} is for premium members only'
if self.is_logged_in:
raise ExtractorError(message, expected=True)
self.raise_login_required(message)
result = self._transform_music_response(response)
- stream_response = self._call_api(streams_link, internal_id, lang, 'stream info')
- result['formats'] = self._extract_formats(stream_response, internal_id)
+ result['formats'], _ = self._extract_stream(f'music/{internal_id}', internal_id)
return result
@@ -587,7 +592,7 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE):
'display_id': 'slug',
'title': 'title',
'track': 'title',
- 'artist': ('artist', 'name'),
+ 'artists': ('artist', 'name', all),
'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n') or None}),
'thumbnails': ('images', ..., ..., {
'url': ('source', {url_or_none}),
@@ -611,7 +616,7 @@ class CrunchyrollArtistIE(CrunchyrollBaseIE):
'info_dict': {
'id': 'MA179CB50D',
'title': 'LiSA',
- 'genres': ['J-Pop', 'Anime', 'Rock'],
+ 'genres': ['Anime', 'J-Pop', 'Rock'],
'description': 'md5:16d87de61a55c3f7d6c454b73285938e',
},
'playlist_mincount': 83,
diff --git a/yt_dlp/extractor/dropbox.py b/yt_dlp/extractor/dropbox.py
index bc2efce..0246975 100644
--- a/yt_dlp/extractor/dropbox.py
+++ b/yt_dlp/extractor/dropbox.py
@@ -65,12 +65,14 @@ class DropboxIE(InfoExtractor):
formats, subtitles, has_anonymous_download = [], {}, False
for encoded in reversed(re.findall(r'registerStreamedPrefetch\s*\(\s*"[\w/+=]+"\s*,\s*"([\w/+=]+)"', webpage)):
decoded = base64.b64decode(encoded).decode('utf-8', 'ignore')
+ if not has_anonymous_download:
+ has_anonymous_download = self._search_regex(
+ r'(anonymous:\tanonymous)', decoded, 'anonymous', default=False)
transcode_url = self._search_regex(
r'\n.(https://[^\x03\x08\x12\n]+\.m3u8)', decoded, 'transcode url', default=None)
if not transcode_url:
continue
formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id, 'mp4')
- has_anonymous_download = self._search_regex(r'(anonymous:\tanonymous)', decoded, 'anonymous', default=False)
break
# downloads enabled we can get the original file
diff --git a/yt_dlp/extractor/dtube.py b/yt_dlp/extractor/dtube.py
index bb06c42..5ea014c 100644
--- a/yt_dlp/extractor/dtube.py
+++ b/yt_dlp/extractor/dtube.py
@@ -1,5 +1,5 @@
import json
-from socket import timeout
+import socket
from .common import InfoExtractor
from ..utils import (
@@ -56,7 +56,7 @@ class DTubeIE(InfoExtractor):
try:
self.to_screen('%s: Checking %s video format URL' % (video_id, format_id))
self._downloader._opener.open(video_url, timeout=5).close()
- except timeout:
+ except socket.timeout:
self.to_screen(
'%s: %s URL is invalid, skipping' % (video_id, format_id))
continue
diff --git a/yt_dlp/extractor/fathom.py b/yt_dlp/extractor/fathom.py
new file mode 100644
index 0000000..1df7d96
--- /dev/null
+++ b/yt_dlp/extractor/fathom.py
@@ -0,0 +1,54 @@
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ extract_attributes,
+ float_or_none,
+ get_element_html_by_id,
+ parse_iso8601,
+)
+from ..utils.traversal import traverse_obj
+
+
+class FathomIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?fathom\.video/share/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://fathom.video/share/G9mkjkspnohVVZ_L5nrsoPycyWcB8y7s',
+ 'md5': '0decd5343b8f30ae268625e79a02b60f',
+ 'info_dict': {
+ 'id': '47200596',
+ 'ext': 'mp4',
+ 'title': 'eCom Inucbator - Coaching Session',
+ 'duration': 8125.380507,
+ 'timestamp': 1699048914,
+ 'upload_date': '20231103',
+ },
+ }, {
+ 'url': 'https://fathom.video/share/mEws3bybftHL2QLymxYEDeE21vtLxGVm',
+ 'md5': '4f5cb382126c22d1aba8a939f9c49690',
+ 'info_dict': {
+ 'id': '46812957',
+ 'ext': 'mp4',
+ 'title': 'Jon, Lawrence, Neman chat about practice',
+ 'duration': 3571.517847,
+ 'timestamp': 1698933600,
+ 'upload_date': '20231102',
+ },
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ props = traverse_obj(
+ get_element_html_by_id('app', webpage), ({extract_attributes}, 'data-page', {json.loads}, 'props'))
+ video_id = str(props['call']['id'])
+
+ return {
+ 'id': video_id,
+ 'formats': self._extract_m3u8_formats(props['call']['video_url'], video_id, 'mp4'),
+ **traverse_obj(props, {
+ 'title': ('head', 'title', {str}),
+ 'duration': ('duration', {float_or_none}),
+ 'timestamp': ('call', 'started_at', {parse_iso8601}),
+ }),
+ }
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py
index 9d82515..2cfed0f 100644
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -2105,22 +2105,6 @@ class GenericIE(InfoExtractor):
},
},
{
- 'note': 'JW Player embed with unicode-escape sequences in URL',
- 'url': 'https://www.medici.tv/en/concerts/lahav-shani-mozart-mahler-israel-philharmonic-abu-dhabi-classics',
- 'info_dict': {
- 'id': 'm',
- 'ext': 'mp4',
- 'title': 'Lahav Shani conducts the Israel Philharmonic\'s first-ever concert in Abu Dhabi',
- 'description': 'Mahler\'s ',
- 'uploader': 'www.medici.tv',
- 'age_limit': 0,
- 'thumbnail': r're:^https?://.+\.jpg',
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/',
'md5': 'e2f0a4c329f7986280b7328e24036d60',
'info_dict': {
diff --git a/yt_dlp/extractor/gofile.py b/yt_dlp/extractor/gofile.py
index eb1dcf8..c6eca0c 100644
--- a/yt_dlp/extractor/gofile.py
+++ b/yt_dlp/extractor/gofile.py
@@ -58,21 +58,18 @@ class GofileIE(InfoExtractor):
return
account_data = self._download_json(
- 'https://api.gofile.io/createAccount', None, note='Getting a new guest account')
+ 'https://api.gofile.io/accounts', None, 'Getting a new guest account', data=b'{}')
self._TOKEN = account_data['data']['token']
self._set_cookie('.gofile.io', 'accountToken', self._TOKEN)
def _entries(self, file_id):
- query_params = {
- 'contentId': file_id,
- 'token': self._TOKEN,
- 'wt': '4fd6sg89d7s6', # From https://gofile.io/dist/js/alljs.js
- }
+ query_params = {'wt': '4fd6sg89d7s6'} # From https://gofile.io/dist/js/alljs.js
password = self.get_param('videopassword')
if password:
query_params['password'] = hashlib.sha256(password.encode('utf-8')).hexdigest()
files = self._download_json(
- 'https://api.gofile.io/getContent', file_id, note='Getting filelist', query=query_params)
+ f'https://api.gofile.io/contents/{file_id}', file_id, 'Getting filelist',
+ query=query_params, headers={'Authorization': f'Bearer {self._TOKEN}'})
status = files['status']
if status == 'error-passwordRequired':
@@ -82,7 +79,7 @@ class GofileIE(InfoExtractor):
raise ExtractorError(f'{self.IE_NAME} said: status {status}', expected=True)
found_files = False
- for file in (try_get(files, lambda x: x['data']['contents'], dict) or {}).values():
+ for file in (try_get(files, lambda x: x['data']['children'], dict) or {}).values():
file_type, file_format = file.get('mimetype').split('/', 1)
if file_type not in ('video', 'audio') and file_format != 'vnd.mts':
continue
diff --git a/yt_dlp/extractor/goplay.py b/yt_dlp/extractor/goplay.py
index 74aad11..7a98e0f 100644
--- a/yt_dlp/extractor/goplay.py
+++ b/yt_dlp/extractor/goplay.py
@@ -1,6 +1,6 @@
import base64
import binascii
-import datetime
+import datetime as dt
import hashlib
import hmac
import json
@@ -422,7 +422,7 @@ class AwsIdp:
months = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
- time_now = datetime.datetime.now(datetime.timezone.utc)
+ time_now = dt.datetime.now(dt.timezone.utc)
format_string = "{} {} {} %H:%M:%S UTC %Y".format(days[time_now.weekday()], months[time_now.month], time_now.day)
time_string = time_now.strftime(format_string)
return time_string
diff --git a/yt_dlp/extractor/imgur.py b/yt_dlp/extractor/imgur.py
index 1fa0a2a..f32c116 100644
--- a/yt_dlp/extractor/imgur.py
+++ b/yt_dlp/extractor/imgur.py
@@ -76,6 +76,23 @@ class ImgurIE(ImgurBaseIE):
'thumbnail': 'https://i.imgur.com/jxBXAMCh.jpg',
'dislike_count': int,
},
+ }, {
+ # needs Accept header, ref: https://github.com/yt-dlp/yt-dlp/issues/9458
+ 'url': 'https://imgur.com/zV03bd5',
+ 'md5': '59df97884e8ba76143ff6b640a0e2904',
+ 'info_dict': {
+ 'id': 'zV03bd5',
+ 'ext': 'mp4',
+ 'title': 'Ive - Liz',
+ 'timestamp': 1710491255,
+ 'upload_date': '20240315',
+ 'like_count': int,
+ 'dislike_count': int,
+ 'duration': 56.92,
+ 'comment_count': int,
+ 'release_timestamp': 1710491255,
+ 'release_date': '20240315',
+ },
}]
def _real_extract(self, url):
@@ -192,6 +209,7 @@ class ImgurIE(ImgurBaseIE):
'id': video_id,
'formats': formats,
'thumbnail': url_or_none(search('thumbnailUrl')),
+ 'http_headers': {'Accept': '*/*'},
}
diff --git a/yt_dlp/extractor/jiosaavn.py b/yt_dlp/extractor/jiosaavn.py
index a592098..d7f0a2d 100644
--- a/yt_dlp/extractor/jiosaavn.py
+++ b/yt_dlp/extractor/jiosaavn.py
@@ -1,89 +1,125 @@
+import functools
+
from .common import InfoExtractor
from ..utils import (
+ format_field,
int_or_none,
js_to_json,
+ make_archive_id,
+ smuggle_url,
+ unsmuggle_url,
+ url_basename,
url_or_none,
urlencode_postdata,
- urljoin,
)
from ..utils.traversal import traverse_obj
class JioSaavnBaseIE(InfoExtractor):
- def _extract_initial_data(self, url, audio_id):
- webpage = self._download_webpage(url, audio_id)
+ _VALID_BITRATES = {'16', '32', '64', '128', '320'}
+
+ @functools.cached_property
+ def requested_bitrates(self):
+ requested_bitrates = self._configuration_arg('bitrate', ['128', '320'], ie_key='JioSaavn')
+ if invalid_bitrates := set(requested_bitrates) - self._VALID_BITRATES:
+ raise ValueError(
+ f'Invalid bitrate(s): {", ".join(invalid_bitrates)}. '
+ + f'Valid bitrates are: {", ".join(sorted(self._VALID_BITRATES, key=int))}')
+ return requested_bitrates
+
+ def _extract_formats(self, song_data):
+ for bitrate in self.requested_bitrates:
+ media_data = self._download_json(
+ 'https://www.jiosaavn.com/api.php', song_data['id'],
+ f'Downloading format info for {bitrate}',
+ fatal=False, data=urlencode_postdata({
+ '__call': 'song.generateAuthToken',
+ '_format': 'json',
+ 'bitrate': bitrate,
+ 'url': song_data['encrypted_media_url'],
+ }))
+ if not traverse_obj(media_data, ('auth_url', {url_or_none})):
+ self.report_warning(f'Unable to extract format info for {bitrate}')
+ continue
+ ext = media_data.get('type')
+ yield {
+ 'url': media_data['auth_url'],
+ 'ext': 'm4a' if ext == 'mp4' else ext,
+ 'format_id': bitrate,
+ 'abr': int(bitrate),
+ 'vcodec': 'none',
+ }
+
+ def _extract_song(self, song_data):
+ info = traverse_obj(song_data, {
+ 'id': ('id', {str}),
+ 'title': ('title', 'text', {str}),
+ 'album': ('album', 'text', {str}),
+ 'thumbnail': ('image', 0, {url_or_none}),
+ 'duration': ('duration', {int_or_none}),
+ 'view_count': ('play_count', {int_or_none}),
+ 'release_year': ('year', {int_or_none}),
+ 'artists': ('artists', lambda _, v: v['role'] == 'singer', 'name', {str}),
+ 'webpage_url': ('perma_url', {url_or_none}), # for song, playlist extraction
+ })
+ if not info.get('webpage_url'): # for album extraction / fallback
+ info['webpage_url'] = format_field(
+ song_data, [('title', 'action')], 'https://www.jiosaavn.com%s') or None
+ if webpage_url := info['webpage_url']:
+ info['_old_archive_ids'] = [make_archive_id(JioSaavnSongIE, url_basename(webpage_url))]
+
+ return info
+
+ def _extract_initial_data(self, url, display_id):
+ webpage = self._download_webpage(url, display_id)
return self._search_json(
r'window\.__INITIAL_DATA__\s*=', webpage,
- 'init json', audio_id, transform_source=js_to_json)
+ 'initial data', display_id, transform_source=js_to_json)
class JioSaavnSongIE(JioSaavnBaseIE):
+ IE_NAME = 'jiosaavn:song'
_VALID_URL = r'https?://(?:www\.)?(?:jiosaavn\.com/song/[^/?#]+/|saavn\.com/s/song/(?:[^/?#]+/){3})(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.jiosaavn.com/song/leja-re/OQsEfQFVUXk',
'md5': '3b84396d15ed9e083c3106f1fa589c04',
'info_dict': {
- 'id': 'OQsEfQFVUXk',
- 'ext': 'mp4',
+ 'id': 'IcoLuefJ',
+ 'ext': 'm4a',
'title': 'Leja Re',
'album': 'Leja Re',
'thumbnail': 'https://c.saavncdn.com/258/Leja-Re-Hindi-2018-20181124024539-500x500.jpg',
'duration': 205,
'view_count': int,
'release_year': 2018,
+ 'artists': ['Sandesh Shandilya', 'Dhvani Bhanushali', 'Tanishk Bagchi'],
+ '_old_archive_ids': ['jiosaavnsong OQsEfQFVUXk'],
},
}, {
'url': 'https://www.saavn.com/s/song/hindi/Saathiya/O-Humdum-Suniyo-Re/KAMiazoCblU',
'only_matching': True,
}]
- _VALID_BITRATES = ('16', '32', '64', '128', '320')
-
def _real_extract(self, url):
- audio_id = self._match_id(url)
- extract_bitrates = self._configuration_arg('bitrate', ['128', '320'], ie_key='JioSaavn')
- if invalid_bitrates := [br for br in extract_bitrates if br not in self._VALID_BITRATES]:
- raise ValueError(
- f'Invalid bitrate(s): {", ".join(invalid_bitrates)}. '
- + f'Valid bitrates are: {", ".join(self._VALID_BITRATES)}')
+ url, smuggled_data = unsmuggle_url(url)
+ song_data = traverse_obj(smuggled_data, ({
+ 'id': ('id', {str}),
+ 'encrypted_media_url': ('encrypted_media_url', {str}),
+ }))
- song_data = self._extract_initial_data(url, audio_id)['song']['song']
- formats = []
- for bitrate in extract_bitrates:
- media_data = self._download_json(
- 'https://www.jiosaavn.com/api.php', audio_id, f'Downloading format info for {bitrate}',
- fatal=False, data=urlencode_postdata({
- '__call': 'song.generateAuthToken',
- '_format': 'json',
- 'bitrate': bitrate,
- 'url': song_data['encrypted_media_url'],
- }))
- if not media_data.get('auth_url'):
- self.report_warning(f'Unable to extract format info for {bitrate}')
- continue
- formats.append({
- 'url': media_data['auth_url'],
- 'ext': media_data.get('type'),
- 'format_id': bitrate,
- 'abr': int(bitrate),
- 'vcodec': 'none',
- })
+ if 'id' in song_data and 'encrypted_media_url' in song_data:
+ result = {'id': song_data['id']}
+ else:
+ # only extract metadata if this is not a url_transparent result
+ song_data = self._extract_initial_data(url, self._match_id(url))['song']['song']
+ result = self._extract_song(song_data)
- return {
- 'id': audio_id,
- 'formats': formats,
- **traverse_obj(song_data, {
- 'title': ('title', 'text'),
- 'album': ('album', 'text'),
- 'thumbnail': ('image', 0, {url_or_none}),
- 'duration': ('duration', {int_or_none}),
- 'view_count': ('play_count', {int_or_none}),
- 'release_year': ('year', {int_or_none}),
- }),
- }
+ result['formats'] = list(self._extract_formats(song_data))
+ return result
class JioSaavnAlbumIE(JioSaavnBaseIE):
+ IE_NAME = 'jiosaavn:album'
_VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/album/[^/?#]+/(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.jiosaavn.com/album/96/buIOjYZDrNA_',
@@ -94,12 +130,45 @@ class JioSaavnAlbumIE(JioSaavnBaseIE):
'playlist_count': 10,
}]
+ def _entries(self, playlist_data):
+ for song_data in traverse_obj(playlist_data, (
+ 'modules', lambda _, x: x['key'] == 'list', 'data', lambda _, v: v['title']['action'])):
+ song_info = self._extract_song(song_data)
+ # album song data is missing artists and release_year, need to re-extract metadata
+ yield self.url_result(song_info['webpage_url'], JioSaavnSongIE, **song_info)
+
def _real_extract(self, url):
- album_id = self._match_id(url)
- album_view = self._extract_initial_data(url, album_id)['albumView']
-
- return self.playlist_from_matches(
- traverse_obj(album_view, (
- 'modules', lambda _, x: x['key'] == 'list', 'data', ..., 'title', 'action', {str})),
- album_id, traverse_obj(album_view, ('album', 'title', 'text', {str})), ie=JioSaavnSongIE,
- getter=lambda x: urljoin('https://www.jiosaavn.com/', x))
+ display_id = self._match_id(url)
+ album_data = self._extract_initial_data(url, display_id)['albumView']
+
+ return self.playlist_result(
+ self._entries(album_data), display_id, traverse_obj(album_data, ('album', 'title', 'text', {str})))
+
+
+class JioSaavnPlaylistIE(JioSaavnBaseIE):
+ IE_NAME = 'jiosaavn:playlist'
+ _VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/s/playlist/(?:[^/?#]+/){2}(?P<id>[^/?#]+)'
+ _TESTS = [{
+ 'url': 'https://www.jiosaavn.com/s/playlist/2279fbe391defa793ad7076929a2f5c9/mood-english/LlJ8ZWT1ibN5084vKHRj2Q__',
+ 'info_dict': {
+ 'id': 'LlJ8ZWT1ibN5084vKHRj2Q__',
+ 'title': 'Mood English',
+ },
+ 'playlist_mincount': 50,
+ }]
+
+ def _entries(self, playlist_data):
+ for song_data in traverse_obj(playlist_data, ('list', lambda _, v: v['perma_url'])):
+ song_info = self._extract_song(song_data)
+ url = smuggle_url(song_info['webpage_url'], {
+ 'id': song_data['id'],
+ 'encrypted_media_url': song_data['encrypted_media_url'],
+ })
+ yield self.url_result(url, JioSaavnSongIE, url_transparent=True, **song_info)
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ playlist_data = self._extract_initial_data(url, display_id)['playlist']['playlist']
+
+ return self.playlist_result(
+ self._entries(playlist_data), display_id, traverse_obj(playlist_data, ('title', 'text', {str})))
diff --git a/yt_dlp/extractor/joqrag.py b/yt_dlp/extractor/joqrag.py
index 3bb28af..7a91d4a 100644
--- a/yt_dlp/extractor/joqrag.py
+++ b/yt_dlp/extractor/joqrag.py
@@ -1,4 +1,4 @@
-import datetime
+import datetime as dt
import urllib.parse
from .common import InfoExtractor
@@ -50,8 +50,8 @@ class JoqrAgIE(InfoExtractor):
def _extract_start_timestamp(self, video_id, is_live):
def extract_start_time_from(date_str):
- dt = datetime_from_str(date_str) + datetime.timedelta(hours=9)
- date = dt.strftime('%Y%m%d')
+ dt_ = datetime_from_str(date_str) + dt.timedelta(hours=9)
+ date = dt_.strftime('%Y%m%d')
start_time = self._search_regex(
r'<h3[^>]+\bclass="dailyProgram-itemHeaderTime"[^>]*>[\s\d:]+–\s*(\d{1,2}:\d{1,2})',
self._download_webpage(
@@ -60,7 +60,7 @@ class JoqrAgIE(InfoExtractor):
errnote=f'Failed to download program list of {date}') or '',
'start time', default=None)
if start_time:
- return unified_timestamp(f'{dt.strftime("%Y/%m/%d")} {start_time} +09:00')
+ return unified_timestamp(f'{dt_.strftime("%Y/%m/%d")} {start_time} +09:00')
return None
start_timestamp = extract_start_time_from('today')
@@ -80,14 +80,14 @@ class JoqrAgIE(InfoExtractor):
note='Downloading metadata', errnote='Failed to download metadata')
title = self._extract_metadata('Program_name', metadata)
- if title == '放送休止':
+ if not title or title == '放送休止':
formats = []
live_status = 'is_upcoming'
release_timestamp = self._extract_start_timestamp(video_id, False)
msg = 'This stream is not currently live'
if release_timestamp:
msg += (' and will start at '
- + datetime.datetime.fromtimestamp(release_timestamp).strftime('%Y-%m-%d %H:%M:%S'))
+ + dt.datetime.fromtimestamp(release_timestamp).strftime('%Y-%m-%d %H:%M:%S'))
self.raise_no_formats(msg, expected=True)
else:
m3u8_path = self._search_regex(
diff --git a/yt_dlp/extractor/kick.py b/yt_dlp/extractor/kick.py
index d124372..889548f 100644
--- a/yt_dlp/extractor/kick.py
+++ b/yt_dlp/extractor/kick.py
@@ -13,7 +13,8 @@ from ..utils import (
class KickBaseIE(InfoExtractor):
def _real_initialize(self):
- self._request_webpage(HEADRequest('https://kick.com/'), None, 'Setting up session', fatal=False)
+ self._request_webpage(
+ HEADRequest('https://kick.com/'), None, 'Setting up session', fatal=False, impersonate=True)
xsrf_token = self._get_cookies('https://kick.com/').get('XSRF-TOKEN')
if not xsrf_token:
self.write_debug('kick.com did not set XSRF-TOKEN cookie')
@@ -25,7 +26,7 @@ class KickBaseIE(InfoExtractor):
def _call_api(self, path, display_id, note='Downloading API JSON', headers={}, **kwargs):
return self._download_json(
f'https://kick.com/api/v1/{path}', display_id, note=note,
- headers=merge_dicts(headers, self._API_HEADERS), **kwargs)
+ headers=merge_dicts(headers, self._API_HEADERS), impersonate=True, **kwargs)
class KickIE(KickBaseIE):
@@ -82,26 +83,27 @@ class KickIE(KickBaseIE):
class KickVODIE(KickBaseIE):
_VALID_URL = r'https?://(?:www\.)?kick\.com/video/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})'
_TESTS = [{
- 'url': 'https://kick.com/video/54244b5e-050a-4df4-a013-b2433dafbe35',
- 'md5': '73691206a6a49db25c5aa1588e6538fc',
+ 'url': 'https://kick.com/video/58bac65b-e641-4476-a7ba-3707a35e60e3',
+ 'md5': '3870f94153e40e7121a6e46c068b70cb',
'info_dict': {
- 'id': '54244b5e-050a-4df4-a013-b2433dafbe35',
+ 'id': '58bac65b-e641-4476-a7ba-3707a35e60e3',
'ext': 'mp4',
- 'title': 'Making 710-carBoosting. Kinda No Pixel inspired. !guilded - !links',
- 'description': 'md5:a0d3546bf7955d0a8252ffe0fd6f518f',
- 'channel': 'kmack710',
- 'channel_id': '16278',
- 'uploader': 'Kmack710',
- 'uploader_id': '16412',
- 'upload_date': '20221206',
- 'timestamp': 1670318289,
- 'duration': 40104.0,
+ 'title': '🤠REBIRTH IS BACK!!!!🤠!stake CODE JAREDFPS 🤠',
+ 'description': 'md5:02b0c46f9b4197fb545ab09dddb85b1d',
+ 'channel': 'jaredfps',
+ 'channel_id': '26608',
+ 'uploader': 'JaredFPS',
+ 'uploader_id': '26799',
+ 'upload_date': '20240402',
+ 'timestamp': 1712097108,
+ 'duration': 33859.0,
'thumbnail': r're:^https?://.*\.jpg',
- 'categories': ['Grand Theft Auto V'],
+ 'categories': ['Call of Duty: Warzone'],
},
'params': {
'skip_download': 'm3u8',
},
+ 'expected_warnings': [r'impersonation'],
}]
def _real_extract(self, url):
diff --git a/yt_dlp/extractor/leeco.py b/yt_dlp/extractor/leeco.py
index 85033b8..5d61a60 100644
--- a/yt_dlp/extractor/leeco.py
+++ b/yt_dlp/extractor/leeco.py
@@ -1,4 +1,4 @@
-import datetime
+import datetime as dt
import hashlib
import re
import time
@@ -185,7 +185,7 @@ class LeIE(InfoExtractor):
publish_time = parse_iso8601(self._html_search_regex(
r'发布时间&nbsp;([^<>]+) ', page, 'publish time', default=None),
- delimiter=' ', timezone=datetime.timedelta(hours=8))
+ delimiter=' ', timezone=dt.timedelta(hours=8))
description = self._html_search_meta('description', page, fatal=False)
return {
diff --git a/yt_dlp/extractor/linkedin.py b/yt_dlp/extractor/linkedin.py
index ad41c0e..e12f467 100644
--- a/yt_dlp/extractor/linkedin.py
+++ b/yt_dlp/extractor/linkedin.py
@@ -1,4 +1,4 @@
-from itertools import zip_longest
+import itertools
import re
from .common import InfoExtractor
@@ -156,7 +156,7 @@ class LinkedInLearningIE(LinkedInLearningBaseIE):
def json2srt(self, transcript_lines, duration=None):
srt_data = ''
- for line, (line_dict, next_dict) in enumerate(zip_longest(transcript_lines, transcript_lines[1:])):
+ for line, (line_dict, next_dict) in enumerate(itertools.zip_longest(transcript_lines, transcript_lines[1:])):
start_time, caption = line_dict['transcriptStartAt'] / 1000, line_dict['caption']
end_time = next_dict['transcriptStartAt'] / 1000 if next_dict else duration or start_time + 1
srt_data += '%d\n%s --> %s\n%s\n\n' % (line + 1, srt_subtitles_timecode(start_time),
diff --git a/yt_dlp/extractor/loom.py b/yt_dlp/extractor/loom.py
new file mode 100644
index 0000000..1191aa1
--- /dev/null
+++ b/yt_dlp/extractor/loom.py
@@ -0,0 +1,461 @@
+import json
+import textwrap
+import urllib.parse
+import uuid
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ determine_ext,
+ filter_dict,
+ get_first,
+ int_or_none,
+ parse_iso8601,
+ update_url,
+ url_or_none,
+ variadic,
+)
+from ..utils.traversal import traverse_obj
+
+
+class LoomIE(InfoExtractor):
+ IE_NAME = 'loom'
+ _VALID_URL = r'https?://(?:www\.)?loom\.com/(?:share|embed)/(?P<id>[\da-f]{32})'
+ _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=["\'](?P<url>{_VALID_URL})']
+ _TESTS = [{
+ # m3u8 raw-url, mp4 transcoded-url, cdn url == raw-url, json subs only
+ 'url': 'https://www.loom.com/share/43d05f362f734614a2e81b4694a3a523',
+ 'md5': 'bfc2d7e9c2e0eb4813212230794b6f42',
+ 'info_dict': {
+ 'id': '43d05f362f734614a2e81b4694a3a523',
+ 'ext': 'mp4',
+ 'title': 'A Ruler for Windows - 28 March 2022',
+ 'uploader': 'wILLIAM PIP',
+ 'upload_date': '20220328',
+ 'timestamp': 1648454238,
+ 'duration': 27,
+ },
+ }, {
+ # webm raw-url, mp4 transcoded-url, cdn url == transcoded-url, no subs
+ 'url': 'https://www.loom.com/share/c43a642f815f4378b6f80a889bb73d8d',
+ 'md5': '70f529317be8cf880fcc2c649a531900',
+ 'info_dict': {
+ 'id': 'c43a642f815f4378b6f80a889bb73d8d',
+ 'ext': 'webm',
+ 'title': 'Lilah Nielsen Intro Video',
+ 'uploader': 'Lilah Nielsen',
+ 'upload_date': '20200826',
+ 'timestamp': 1598480716,
+ 'duration': 20,
+ },
+ }, {
+ # m3u8 raw-url, mp4 transcoded-url, cdn url == raw-url, vtt sub and json subs
+ 'url': 'https://www.loom.com/share/9458bcbf79784162aa62ffb8dd66201b',
+ 'md5': '51737ec002969dd28344db4d60b9cbbb',
+ 'info_dict': {
+ 'id': '9458bcbf79784162aa62ffb8dd66201b',
+ 'ext': 'mp4',
+ 'title': 'Sharing screen with gpt-4',
+ 'description': 'Sharing screen with GPT 4 vision model and asking questions to guide through blender.',
+ 'uploader': 'Suneel Matham',
+ 'chapters': 'count:3',
+ 'upload_date': '20231109',
+ 'timestamp': 1699518978,
+ 'duration': 93,
+ },
+ }, {
+ # mpd raw-url, mp4 transcoded-url, cdn url == raw-url, no subs
+ 'url': 'https://www.loom.com/share/24351eb8b317420289b158e4b7e96ff2',
+ 'info_dict': {
+ 'id': '24351eb8b317420289b158e4b7e96ff2',
+ 'ext': 'webm',
+ 'title': 'OMFG clown',
+ 'description': 'md5:285c5ee9d62aa087b7e3271b08796815',
+ 'uploader': 'MrPumkin B',
+ 'upload_date': '20210924',
+ 'timestamp': 1632519618,
+ 'duration': 210,
+ },
+ 'params': {'skip_download': 'dash'},
+ }, {
+ # password-protected
+ 'url': 'https://www.loom.com/share/50e26e8aeb7940189dff5630f95ce1f4',
+ 'md5': '5cc7655e7d55d281d203f8ffd14771f7',
+ 'info_dict': {
+ 'id': '50e26e8aeb7940189dff5630f95ce1f4',
+ 'ext': 'mp4',
+ 'title': 'iOS Mobile Upload',
+ 'uploader': 'Simon Curran',
+ 'upload_date': '20200520',
+ 'timestamp': 1590000123,
+ 'duration': 35,
+ },
+ 'params': {'videopassword': 'seniorinfants2'},
+ }, {
+ # embed, transcoded-url endpoint sends empty JSON response
+ 'url': 'https://www.loom.com/embed/ddcf1c1ad21f451ea7468b1e33917e4e',
+ 'md5': '8488817242a0db1cb2ad0ea522553cf6',
+ 'info_dict': {
+ 'id': 'ddcf1c1ad21f451ea7468b1e33917e4e',
+ 'ext': 'mp4',
+ 'title': 'CF Reset User\'s Password',
+ 'uploader': 'Aimee Heintz',
+ 'upload_date': '20220707',
+ 'timestamp': 1657216459,
+ 'duration': 181,
+ },
+ 'expected_warnings': ['Failed to parse JSON'],
+ }]
+ _WEBPAGE_TESTS = [{
+ 'url': 'https://www.loom.com/community/e1229802a8694a09909e8ba0fbb6d073-pg',
+ 'md5': 'ec838cd01b576cf0386f32e1ae424609',
+ 'info_dict': {
+ 'id': 'e1229802a8694a09909e8ba0fbb6d073',
+ 'ext': 'mp4',
+ 'title': 'Rexie Jane Cimafranca - Founder\'s Presentation',
+ 'uploader': 'Rexie Cimafranca',
+ 'upload_date': '20230213',
+ 'duration': 247,
+ 'timestamp': 1676274030,
+ },
+ }]
+
+ _GRAPHQL_VARIABLES = {
+ 'GetVideoSource': {
+ 'acceptableMimes': ['DASH', 'M3U8', 'MP4'],
+ },
+ }
+ _GRAPHQL_QUERIES = {
+ 'GetVideoSSR': textwrap.dedent('''\
+ query GetVideoSSR($videoId: ID!, $password: String) {
+ getVideo(id: $videoId, password: $password) {
+ __typename
+ ... on PrivateVideo {
+ id
+ status
+ message
+ __typename
+ }
+ ... on VideoPasswordMissingOrIncorrect {
+ id
+ message
+ __typename
+ }
+ ... on RegularUserVideo {
+ id
+ __typename
+ createdAt
+ description
+ download_enabled
+ folder_id
+ is_protected
+ needs_password
+ owner {
+ display_name
+ __typename
+ }
+ privacy
+ s3_id
+ name
+ video_properties {
+ avgBitRate
+ client
+ camera_enabled
+ client_version
+ duration
+ durationMs
+ format
+ height
+ microphone_enabled
+ os
+ os_version
+ recordingClient
+ recording_type
+ recording_version
+ screen_type
+ tab_audio
+ trim_duration
+ width
+ __typename
+ }
+ playable_duration
+ source_duration
+ visibility
+ }
+ }
+ }\n'''),
+ 'GetVideoSource': textwrap.dedent('''\
+ query GetVideoSource($videoId: ID!, $password: String, $acceptableMimes: [CloudfrontVideoAcceptableMime]) {
+ getVideo(id: $videoId, password: $password) {
+ ... on RegularUserVideo {
+ id
+ nullableRawCdnUrl(acceptableMimes: $acceptableMimes, password: $password) {
+ url
+ __typename
+ }
+ __typename
+ }
+ __typename
+ }
+ }\n'''),
+ 'FetchVideoTranscript': textwrap.dedent('''\
+ query FetchVideoTranscript($videoId: ID!, $password: String) {
+ fetchVideoTranscript(videoId: $videoId, password: $password) {
+ ... on VideoTranscriptDetails {
+ id
+ video_id
+ source_url
+ captions_source_url
+ __typename
+ }
+ ... on GenericError {
+ message
+ __typename
+ }
+ __typename
+ }
+ }\n'''),
+ 'FetchChapters': textwrap.dedent('''\
+ query FetchChapters($videoId: ID!, $password: String) {
+ fetchVideoChapters(videoId: $videoId, password: $password) {
+ ... on VideoChapters {
+ video_id
+ content
+ __typename
+ }
+ ... on EmptyChaptersPayload {
+ content
+ __typename
+ }
+ ... on InvalidRequestWarning {
+ message
+ __typename
+ }
+ ... on Error {
+ message
+ __typename
+ }
+ __typename
+ }
+ }\n'''),
+ }
+ _APOLLO_GRAPHQL_VERSION = '0a1856c'
+
+ def _call_graphql_api(self, operations, video_id, note=None, errnote=None):
+ password = self.get_param('videopassword')
+ return self._download_json(
+ 'https://www.loom.com/graphql', video_id, note or 'Downloading GraphQL JSON',
+ errnote or 'Failed to download GraphQL JSON', headers={
+ 'Accept': 'application/json',
+ 'Content-Type': 'application/json',
+ 'x-loom-request-source': f'loom_web_{self._APOLLO_GRAPHQL_VERSION}',
+ 'apollographql-client-name': 'web',
+ 'apollographql-client-version': self._APOLLO_GRAPHQL_VERSION,
+ }, data=json.dumps([{
+ 'operationName': operation_name,
+ 'variables': {
+ 'videoId': video_id,
+ 'password': password,
+ **self._GRAPHQL_VARIABLES.get(operation_name, {}),
+ },
+ 'query': self._GRAPHQL_QUERIES[operation_name],
+ } for operation_name in variadic(operations)], separators=(',', ':')).encode())
+
+ def _call_url_api(self, endpoint, video_id):
+ response = self._download_json(
+ f'https://www.loom.com/api/campaigns/sessions/{video_id}/{endpoint}', video_id,
+ f'Downloading {endpoint} JSON', f'Failed to download {endpoint} JSON', fatal=False,
+ headers={'Accept': 'application/json', 'Content-Type': 'application/json'},
+ data=json.dumps({
+ 'anonID': str(uuid.uuid4()),
+ 'deviceID': None,
+ 'force_original': False, # HTTP error 401 if True
+ 'password': self.get_param('videopassword'),
+ }, separators=(',', ':')).encode())
+ return traverse_obj(response, ('url', {url_or_none}))
+
+ def _extract_formats(self, video_id, metadata, gql_data):
+ formats = []
+ video_properties = traverse_obj(metadata, ('video_properties', {
+ 'width': ('width', {int_or_none}),
+ 'height': ('height', {int_or_none}),
+ 'acodec': ('microphone_enabled', {lambda x: 'none' if x is False else None}),
+ }))
+
+ def get_formats(format_url, format_id, quality):
+ if not format_url:
+ return
+ ext = determine_ext(format_url)
+ query = urllib.parse.urlparse(format_url).query
+
+ if ext == 'm3u8':
+ # Extract pre-merged HLS formats to avoid buggy parsing of metadata in split playlists
+ format_url = format_url.replace('-split.m3u8', '.m3u8')
+ m3u8_formats = self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', m3u8_id=f'hls-{format_id}', fatal=False, quality=quality)
+ for fmt in m3u8_formats:
+ yield {
+ **fmt,
+ 'url': update_url(fmt['url'], query=query),
+ 'extra_param_to_segment_url': query,
+ }
+
+ elif ext == 'mpd':
+ dash_formats = self._extract_mpd_formats(
+ format_url, video_id, mpd_id=f'dash-{format_id}', fatal=False)
+ for fmt in dash_formats:
+ yield {
+ **fmt,
+ 'extra_param_to_segment_url': query,
+ 'quality': quality,
+ }
+
+ else:
+ yield {
+ 'url': format_url,
+ 'ext': ext,
+ 'format_id': f'http-{format_id}',
+ 'quality': quality,
+ **video_properties,
+ }
+
+ raw_url = self._call_url_api('raw-url', video_id)
+ formats.extend(get_formats(raw_url, 'raw', quality=1)) # original quality
+
+ transcoded_url = self._call_url_api('transcoded-url', video_id)
+ formats.extend(get_formats(transcoded_url, 'transcoded', quality=-1)) # transcoded quality
+
+ cdn_url = get_first(gql_data, ('data', 'getVideo', 'nullableRawCdnUrl', 'url', {url_or_none}))
+ # cdn_url is usually a dupe, but the raw-url/transcoded-url endpoints could return errors
+ valid_urls = [update_url(url, query=None) for url in (raw_url, transcoded_url) if url]
+ if cdn_url and update_url(cdn_url, query=None) not in valid_urls:
+ formats.extend(get_formats(cdn_url, 'cdn', quality=0)) # could be original or transcoded
+
+ return formats
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ metadata = get_first(
+ self._call_graphql_api('GetVideoSSR', video_id, 'Downloading GraphQL metadata JSON'),
+ ('data', 'getVideo', {dict})) or {}
+
+ if metadata.get('__typename') == 'VideoPasswordMissingOrIncorrect':
+ if not self.get_param('videopassword'):
+ raise ExtractorError(
+ 'This video is password-protected, use the --video-password option', expected=True)
+ raise ExtractorError('Invalid video password', expected=True)
+
+ gql_data = self._call_graphql_api(['FetchChapters', 'FetchVideoTranscript', 'GetVideoSource'], video_id)
+ duration = traverse_obj(metadata, ('video_properties', 'duration', {int_or_none}))
+
+ return {
+ 'id': video_id,
+ 'duration': duration,
+ 'chapters': self._extract_chapters_from_description(
+ get_first(gql_data, ('data', 'fetchVideoChapters', 'content', {str})), duration) or None,
+ 'formats': self._extract_formats(video_id, metadata, gql_data),
+ 'subtitles': filter_dict({
+ 'en': traverse_obj(gql_data, (
+ ..., 'data', 'fetchVideoTranscript',
+ ('source_url', 'captions_source_url'), {
+ 'url': {url_or_none},
+ })) or None,
+ }),
+ **traverse_obj(metadata, {
+ 'title': ('name', {str}),
+ 'description': ('description', {str}),
+ 'uploader': ('owner', 'display_name', {str}),
+ 'timestamp': ('createdAt', {parse_iso8601}),
+ }),
+ }
+
+
+class LoomFolderIE(InfoExtractor):
+ IE_NAME = 'loom:folder'
+ _VALID_URL = r'https?://(?:www\.)?loom\.com/share/folder/(?P<id>[\da-f]{32})'
+ _TESTS = [{
+ # 2 subfolders, no videos in root
+ 'url': 'https://www.loom.com/share/folder/997db4db046f43e5912f10dc5f817b5c',
+ 'playlist_mincount': 16,
+ 'info_dict': {
+ 'id': '997db4db046f43e5912f10dc5f817b5c',
+ 'title': 'Blending Lessons',
+ },
+ }, {
+ # only videos, no subfolders
+ 'url': 'https://www.loom.com/share/folder/9a8a87f6b6f546d9a400c8e7575ff7f2',
+ 'playlist_mincount': 12,
+ 'info_dict': {
+ 'id': '9a8a87f6b6f546d9a400c8e7575ff7f2',
+ 'title': 'List A- a, i, o',
+ },
+ }, {
+ # videos in root and empty subfolder
+ 'url': 'https://www.loom.com/share/folder/886e534218c24fd292e97e9563078cc4',
+ 'playlist_mincount': 21,
+ 'info_dict': {
+ 'id': '886e534218c24fd292e97e9563078cc4',
+ 'title': 'Medicare Agent Training videos',
+ },
+ }, {
+ # videos in root and videos in subfolders
+ 'url': 'https://www.loom.com/share/folder/b72c4ecdf04745da9403926d80a40c38',
+ 'playlist_mincount': 21,
+ 'info_dict': {
+ 'id': 'b72c4ecdf04745da9403926d80a40c38',
+ 'title': 'Quick Altos Q & A Tutorials',
+ },
+ }, {
+ # recursive folder extraction
+ 'url': 'https://www.loom.com/share/folder/8b458a94e0e4449b8df9ea7a68fafc4e',
+ 'playlist_count': 23,
+ 'info_dict': {
+ 'id': '8b458a94e0e4449b8df9ea7a68fafc4e',
+ 'title': 'Sezer Texting Guide',
+ },
+ }, {
+ # more than 50 videos in 1 folder
+ 'url': 'https://www.loom.com/share/folder/e056a91d290d47ca9b00c9d1df56c463',
+ 'playlist_mincount': 61,
+ 'info_dict': {
+ 'id': 'e056a91d290d47ca9b00c9d1df56c463',
+ 'title': 'User Videos',
+ },
+ }, {
+ # many subfolders
+ 'url': 'https://www.loom.com/share/folder/c2dde8cc67454f0e99031677279d8954',
+ 'playlist_mincount': 75,
+ 'info_dict': {
+ 'id': 'c2dde8cc67454f0e99031677279d8954',
+ 'title': 'Honors 1',
+ },
+ }, {
+ 'url': 'https://www.loom.com/share/folder/bae17109a68146c7803454f2893c8cf8/Edpuzzle',
+ 'only_matching': True,
+ }]
+
+ def _extract_folder_data(self, folder_id):
+ return self._download_json(
+ f'https://www.loom.com/v1/folders/{folder_id}', folder_id,
+ 'Downloading folder info JSON', query={'limit': '10000'})
+
+ def _extract_folder_entries(self, folder_id, initial_folder_data=None):
+ folder_data = initial_folder_data or self._extract_folder_data(folder_id)
+
+ for video in traverse_obj(folder_data, ('videos', lambda _, v: v['id'])):
+ video_id = video['id']
+ yield self.url_result(
+ f'https://www.loom.com/share/{video_id}', LoomIE, video_id, video.get('name'))
+
+ # Recurse into subfolders
+ for subfolder_id in traverse_obj(folder_data, (
+ 'folders', lambda _, v: v['id'] != folder_id, 'id', {str})):
+ yield from self._extract_folder_entries(subfolder_id)
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+ playlist_data = self._extract_folder_data(playlist_id)
+
+ return self.playlist_result(
+ self._extract_folder_entries(playlist_id, playlist_data), playlist_id,
+ traverse_obj(playlist_data, ('folder', 'name', {str.strip})))
diff --git a/yt_dlp/extractor/masters.py b/yt_dlp/extractor/masters.py
index 716f1c9..c3c58d7 100644
--- a/yt_dlp/extractor/masters.py
+++ b/yt_dlp/extractor/masters.py
@@ -1,4 +1,3 @@
-from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
traverse_obj,
diff --git a/yt_dlp/extractor/medici.py b/yt_dlp/extractor/medici.py
index 328ccd2..b6235b6 100644
--- a/yt_dlp/extractor/medici.py
+++ b/yt_dlp/extractor/medici.py
@@ -1,67 +1,153 @@
+import urllib.parse
+
from .common import InfoExtractor
from ..utils import (
- unified_strdate,
- update_url_query,
- urlencode_postdata,
+ filter_dict,
+ parse_iso8601,
+ traverse_obj,
+ try_call,
+ url_or_none,
)
class MediciIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?medici\.tv/#!/(?P<id>[^?#&]+)'
- _TEST = {
- 'url': 'http://www.medici.tv/#!/daniel-harding-frans-helmerson-verbier-festival-music-camp',
- 'md5': '004c21bb0a57248085b6ff3fec72719d',
+ _VALID_URL = r'https?://(?:(?P<sub>www|edu)\.)?medici\.tv/[a-z]{2}/[\w.-]+/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://www.medici.tv/en/operas/thomas-ades-the-exterminating-angel-calixto-bieito-opera-bastille-paris',
+ 'md5': 'd483f74e7a7a9eac0dbe152ab189050d',
+ 'info_dict': {
+ 'id': '8032',
+ 'ext': 'mp4',
+ 'title': 'Thomas Adès\'s The Exterminating Angel',
+ 'description': 'md5:708ae6350dadc604225b4a6e32482bab',
+ 'thumbnail': r're:https://.+/.+\.jpg',
+ 'upload_date': '20240304',
+ 'timestamp': 1709561766,
+ 'display_id': 'thomas-ades-the-exterminating-angel-calixto-bieito-opera-bastille-paris',
+ },
+ 'expected_warnings': [r'preview'],
+ }, {
+ 'url': 'https://edu.medici.tv/en/operas/wagner-lohengrin-paris-opera-kirill-serebrennikov-piotr-beczala-kwangchul-youn-johanni-van-oostrum',
+ 'md5': '4ef3f4079a6e1c617584463a9eb84f99',
+ 'info_dict': {
+ 'id': '7900',
+ 'ext': 'mp4',
+ 'title': 'Wagner\'s Lohengrin',
+ 'description': 'md5:a384a62937866101f86902f21752cd89',
+ 'thumbnail': r're:https://.+/.+\.jpg',
+ 'upload_date': '20231017',
+ 'timestamp': 1697554771,
+ 'display_id': 'wagner-lohengrin-paris-opera-kirill-serebrennikov-piotr-beczala-kwangchul-youn-johanni-van-oostrum',
+ },
+ 'expected_warnings': [r'preview'],
+ }, {
+ 'url': 'https://www.medici.tv/en/concerts/sergey-smbatyan-conducts-mansurian-chouchane-siranossian-mario-brunello',
+ 'md5': '9dd757e53b22b2511e85ea9ea60e4815',
+ 'info_dict': {
+ 'id': '5712',
+ 'ext': 'mp4',
+ 'title': 'Sergey Smbatyan conducts Tigran Mansurian — With Chouchane Siranossian and Mario Brunello',
+ 'thumbnail': r're:https://.+/.+\.jpg',
+ 'description': 'md5:9411fe44c874bb10e9af288c65816e41',
+ 'upload_date': '20200323',
+ 'timestamp': 1584975600,
+ 'display_id': 'sergey-smbatyan-conducts-mansurian-chouchane-siranossian-mario-brunello',
+ },
+ 'expected_warnings': [r'preview'],
+ }, {
+ 'url': 'https://www.medici.tv/en/ballets/carmen-ballet-choregraphie-de-jiri-bubenicek-teatro-dellopera-di-roma',
+ 'md5': '40f5e76cb701a97a6d7ba23b62c49990',
+ 'info_dict': {
+ 'id': '7857',
+ 'ext': 'mp4',
+ 'title': 'Carmen by Jiří Bubeníček after Roland Petit, music by Bizet, de Falla, Castelnuovo-Tedesco, and Bonolis',
+ 'thumbnail': r're:https://.+/.+\.jpg',
+ 'description': 'md5:0f15a15611ed748020c769873e10a8bb',
+ 'upload_date': '20240223',
+ 'timestamp': 1708707600,
+ 'display_id': 'carmen-ballet-choregraphie-de-jiri-bubenicek-teatro-dellopera-di-roma',
+ },
+ 'expected_warnings': [r'preview'],
+ }, {
+ 'url': 'https://www.medici.tv/en/documentaries/la-sonnambula-liege-2023-documentaire',
+ 'md5': '87ff198018ce79a34757ab0dd6f21080',
+ 'info_dict': {
+ 'id': '7513',
+ 'ext': 'mp4',
+ 'title': 'La Sonnambula',
+ 'thumbnail': r're:https://.+/.+\.jpg',
+ 'description': 'md5:0caf9109a860fd50cd018df062a67f34',
+ 'upload_date': '20231103',
+ 'timestamp': 1699010830,
+ 'display_id': 'la-sonnambula-liege-2023-documentaire',
+ },
+ 'expected_warnings': [r'preview'],
+ }, {
+ 'url': 'https://edu.medici.tv/en/masterclasses/yvonne-loriod-olivier-messiaen',
+ 'md5': 'fb5dcec46d76ad20fbdbaabb01da191d',
+ 'info_dict': {
+ 'id': '3024',
+ 'ext': 'mp4',
+ 'title': 'Olivier Messiaen and Yvonne Loriod, pianists and teachers',
+ 'thumbnail': r're:https://.+/.+\.jpg',
+ 'description': 'md5:aab948e2f7690214b5c28896c83f1fc1',
+ 'upload_date': '20150223',
+ 'timestamp': 1424706608,
+ 'display_id': 'yvonne-loriod-olivier-messiaen',
+ },
+ 'skip': 'Requires authentication; preview starts in the middle',
+ }, {
+ 'url': 'https://www.medici.tv/en/jazz/makaya-mccraven-la-rochelle',
+ 'md5': '4cc279a8b06609782747c8f50beea2b3',
'info_dict': {
- 'id': '3059',
- 'ext': 'flv',
- 'title': 'Daniel Harding conducts the Verbier Festival Music Camp \u2013 With Frans Helmerson',
- 'description': 'md5:322a1e952bafb725174fd8c1a8212f58',
- 'thumbnail': r're:^https?://.*\.jpg$',
- 'upload_date': '20170408',
+ 'id': '7922',
+ 'ext': 'mp4',
+ 'title': 'NEW: Makaya McCraven in La Rochelle',
+ 'thumbnail': r're:https://.+/.+\.jpg',
+ 'description': 'md5:b5a8aaeb6993d8ccb18bde8abb8aa8d2',
+ 'upload_date': '20231228',
+ 'timestamp': 1703754863,
+ 'display_id': 'makaya-mccraven-la-rochelle',
},
- }
+ 'expected_warnings': [r'preview'],
+ }]
def _real_extract(self, url):
- video_id = self._match_id(url)
-
- # Sets csrftoken cookie
- self._download_webpage(url, video_id)
+ display_id, subdomain = self._match_valid_url(url).group('id', 'sub')
+ self._request_webpage(url, display_id, 'Requesting CSRF token cookie')
- MEDICI_URL = 'http://www.medici.tv/'
+ subdomain = 'edu-' if subdomain == 'edu' else ''
+ origin = f'https://{urllib.parse.urlparse(url).hostname}'
data = self._download_json(
- MEDICI_URL, video_id,
- data=urlencode_postdata({
- 'json': 'true',
- 'page': '/%s' % video_id,
- 'timezone_offset': -420,
- }), headers={
- 'X-CSRFToken': self._get_cookies(url)['csrftoken'].value,
- 'X-Requested-With': 'XMLHttpRequest',
- 'Referer': MEDICI_URL,
- 'Content-Type': 'application/x-www-form-urlencoded',
- })
-
- video = data['video']['videos']['video1']
-
- title = video.get('nom') or data['title']
+ f'https://api.medici.tv/{subdomain}satie/edito/movie-file/{display_id}/', display_id,
+ headers=filter_dict({
+ 'Authorization': try_call(
+ lambda: urllib.parse.unquote(self._get_cookies(url)['auth._token.mAuth'].value)),
+ 'Device-Type': 'web',
+ 'Origin': origin,
+ 'Referer': f'{origin}/',
+ 'Accept': 'application/json, text/plain, */*',
+ }))
- video_id = video.get('id') or video_id
- formats = self._extract_f4m_formats(
- update_url_query(video['url_akamai'], {
- 'hdcore': '3.1.0',
- 'plugin=aasp': '3.1.0.43.124',
- }), video_id, f4m_id='hds')
+ if not traverse_obj(data, ('video', 'is_full_video')) and traverse_obj(
+ data, ('video', 'is_limited_by_user_access')):
+ self.report_warning(
+ 'The full video is for subscribers only. Only previews will be downloaded. If you '
+ 'have used the --cookies-from-browser option, try using the --cookies option instead')
- description = data.get('meta_description')
- thumbnail = video.get('url_thumbnail') or data.get('main_image')
- upload_date = unified_strdate(data['video'].get('date'))
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+ data['video']['video_url'], display_id, 'mp4')
return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'upload_date': upload_date,
+ 'id': str(data['id']),
+ 'display_id': display_id,
'formats': formats,
+ 'subtitles': subtitles,
+ **traverse_obj(data, {
+ 'title': ('title', {str}),
+ 'description': ('subtitle', {str}),
+ 'thumbnail': ('picture', {url_or_none}),
+ 'timestamp': ('date_publish', {parse_iso8601}),
+ }),
}
diff --git a/yt_dlp/extractor/microsoftstream.py b/yt_dlp/extractor/microsoftstream.py
index 9b50996..5f5f160 100644
--- a/yt_dlp/extractor/microsoftstream.py
+++ b/yt_dlp/extractor/microsoftstream.py
@@ -1,4 +1,4 @@
-from base64 import b64decode
+import base64
from .common import InfoExtractor
from ..utils import (
@@ -81,7 +81,7 @@ class MicrosoftStreamIE(InfoExtractor):
'url': thumbnail_url,
}
thumb_name = url_basename(thumbnail_url)
- thumb_name = str(b64decode(thumb_name + '=' * (-len(thumb_name) % 4)))
+ thumb_name = str(base64.b64decode(thumb_name + '=' * (-len(thumb_name) % 4)))
thumb.update(parse_resolution(thumb_name))
thumbnails.append(thumb)
diff --git a/yt_dlp/extractor/mixch.py b/yt_dlp/extractor/mixch.py
index 4be6947..b980fd0 100644
--- a/yt_dlp/extractor/mixch.py
+++ b/yt_dlp/extractor/mixch.py
@@ -1,5 +1,7 @@
from .common import InfoExtractor
-from ..utils import UserNotLive, traverse_obj
+from ..networking.exceptions import HTTPError
+from ..utils import ExtractorError, UserNotLive, int_or_none, url_or_none
+from ..utils.traversal import traverse_obj
class MixchIE(InfoExtractor):
@@ -25,25 +27,23 @@ class MixchIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(f'https://mixch.tv/u/{video_id}/live', video_id)
-
- initial_js_state = self._parse_json(self._search_regex(
- r'(?m)^\s*window\.__INITIAL_JS_STATE__\s*=\s*(\{.+?\});\s*$', webpage, 'initial JS state'), video_id)
- if not initial_js_state.get('liveInfo'):
+ data = self._download_json(f'https://mixch.tv/api-web/users/{video_id}/live', video_id)
+ if not traverse_obj(data, ('liveInfo', {dict})):
raise UserNotLive(video_id=video_id)
return {
'id': video_id,
- 'title': traverse_obj(initial_js_state, ('liveInfo', 'title')),
- 'comment_count': traverse_obj(initial_js_state, ('liveInfo', 'comments')),
- 'view_count': traverse_obj(initial_js_state, ('liveInfo', 'visitor')),
- 'timestamp': traverse_obj(initial_js_state, ('liveInfo', 'created')),
- 'uploader': traverse_obj(initial_js_state, ('broadcasterInfo', 'name')),
'uploader_id': video_id,
+ **traverse_obj(data, {
+ 'title': ('liveInfo', 'title', {str}),
+ 'comment_count': ('liveInfo', 'comments', {int_or_none}),
+ 'view_count': ('liveInfo', 'visitor', {int_or_none}),
+ 'timestamp': ('liveInfo', 'created', {int_or_none}),
+ 'uploader': ('broadcasterInfo', 'name', {str}),
+ }),
'formats': [{
'format_id': 'hls',
- 'url': (traverse_obj(initial_js_state, ('liveInfo', 'hls'))
- or f'https://d1hd0ww6piyb43.cloudfront.net/hls/torte_{video_id}.m3u8'),
+ 'url': data['liveInfo']['hls'],
'ext': 'mp4',
'protocol': 'm3u8',
}],
@@ -60,22 +60,38 @@ class MixchArchiveIE(InfoExtractor):
'skip': 'paid video, no DRM. expires at Jan 23',
'info_dict': {
'id': '421',
+ 'ext': 'mp4',
'title': '96NEKO SHOW TIME',
}
+ }, {
+ 'url': 'https://mixch.tv/archive/1213',
+ 'skip': 'paid video, no DRM. expires at Dec 31, 2023',
+ 'info_dict': {
+ 'id': '1213',
+ 'ext': 'mp4',
+ 'title': '【特別トーク番組アーカイブス】Merm4id×燐舞曲 2nd LIVE「VERSUS」',
+ 'release_date': '20231201',
+ 'thumbnail': str,
+ }
+ }, {
+ 'url': 'https://mixch.tv/archive/1214',
+ 'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- html5_videos = self._parse_html5_media_entries(
- url, webpage.replace('video-js', 'video'), video_id, 'hls')
- if not html5_videos:
- self.raise_login_required(method='cookies')
- infodict = html5_videos[0]
- infodict.update({
- 'id': video_id,
- 'title': self._html_search_regex(r'class="archive-title">(.+?)</', webpage, 'title')
- })
+ try:
+ info_json = self._download_json(
+ f'https://mixch.tv/api-web/archive/{video_id}', video_id)['archive']
+ except ExtractorError as e:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 401:
+ self.raise_login_required()
+ raise
- return infodict
+ return {
+ 'id': video_id,
+ 'title': traverse_obj(info_json, ('title', {str})),
+ 'formats': self._extract_m3u8_formats(info_json['archiveURL'], video_id),
+ 'thumbnail': traverse_obj(info_json, ('thumbnailURL', {url_or_none})),
+ }
diff --git a/yt_dlp/extractor/motherless.py b/yt_dlp/extractor/motherless.py
index 160150a..b6c18fe 100644
--- a/yt_dlp/extractor/motherless.py
+++ b/yt_dlp/extractor/motherless.py
@@ -1,4 +1,4 @@
-import datetime
+import datetime as dt
import re
import urllib.parse
@@ -151,7 +151,7 @@ class MotherlessIE(InfoExtractor):
'd': 'days',
}
kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta}
- upload_date = (datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(**kwargs)).strftime('%Y%m%d')
+ upload_date = (dt.datetime.now(dt.timezone.utc) - dt.timedelta(**kwargs)).strftime('%Y%m%d')
comment_count = len(re.findall(r'''class\s*=\s*['"]media-comment-contents\b''', webpage))
uploader_id = self._html_search_regex(
diff --git a/yt_dlp/extractor/naver.py b/yt_dlp/extractor/naver.py
index 806b790..885557e 100644
--- a/yt_dlp/extractor/naver.py
+++ b/yt_dlp/extractor/naver.py
@@ -4,8 +4,8 @@ import hmac
import itertools
import json
import re
+import urllib.parse
import time
-from urllib.parse import parse_qs, urlparse
from .common import InfoExtractor
from ..utils import (
@@ -388,7 +388,7 @@ class NaverNowIE(NaverBaseIE):
def _real_extract(self, url):
show_id = self._match_id(url)
- qs = parse_qs(urlparse(url).query)
+ qs = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
if not self._yes_playlist(show_id, qs.get('shareHightlight')):
return self._extract_highlight(show_id, qs['shareHightlight'][0])
diff --git a/yt_dlp/extractor/neteasemusic.py b/yt_dlp/extractor/neteasemusic.py
index d332b84..73b33a9 100644
--- a/yt_dlp/extractor/neteasemusic.py
+++ b/yt_dlp/extractor/neteasemusic.py
@@ -1,9 +1,9 @@
+import hashlib
import itertools
import json
+import random
import re
import time
-from hashlib import md5
-from random import randint
from .common import InfoExtractor
from ..aes import aes_ecb_encrypt, pkcs7_padding
@@ -34,7 +34,7 @@ class NetEaseMusicBaseIE(InfoExtractor):
request_text = json.dumps({**query_body, 'header': cookies}, separators=(',', ':'))
message = f'nobody{api_path}use{request_text}md5forencrypt'.encode('latin1')
- msg_digest = md5(message).hexdigest()
+ msg_digest = hashlib.md5(message).hexdigest()
data = pkcs7_padding(list(str.encode(
f'{api_path}-36cd479b6b5-{request_text}-36cd479b6b5-{msg_digest}')))
@@ -53,7 +53,7 @@ class NetEaseMusicBaseIE(InfoExtractor):
'__csrf': '',
'os': 'pc',
'channel': 'undefined',
- 'requestId': f'{int(time.time() * 1000)}_{randint(0, 1000):04}',
+ 'requestId': f'{int(time.time() * 1000)}_{random.randint(0, 1000):04}',
**traverse_obj(self._get_cookies(self._API_BASE), {
'MUSIC_U': ('MUSIC_U', {lambda i: i.value}),
})
diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py
index 7cf5b24..8bb017a 100644
--- a/yt_dlp/extractor/nhk.py
+++ b/yt_dlp/extractor/nhk.py
@@ -8,6 +8,7 @@ from ..utils import (
int_or_none,
join_nonempty,
parse_duration,
+ remove_end,
traverse_obj,
try_call,
unescapeHTML,
@@ -19,8 +20,7 @@ from ..utils import (
class NhkBaseIE(InfoExtractor):
_API_URL_TEMPLATE = 'https://nwapi.nhk.jp/nhkworld/%sod%slist/v7b/%s/%s/%s/all%s.json'
- _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand'
- _TYPE_REGEX = r'/(?P<type>video|audio)/'
+ _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/'
def _call_api(self, m_id, lang, is_video, is_episode, is_clip):
return self._download_json(
@@ -83,7 +83,7 @@ class NhkBaseIE(InfoExtractor):
def _extract_episode_info(self, url, episode=None):
fetch_episode = episode is None
lang, m_type, episode_id = NhkVodIE._match_valid_url(url).group('lang', 'type', 'id')
- is_video = m_type == 'video'
+ is_video = m_type != 'audio'
if is_video:
episode_id = episode_id[:4] + '-' + episode_id[4:]
@@ -138,9 +138,10 @@ class NhkBaseIE(InfoExtractor):
else:
if fetch_episode:
- audio_path = episode['audio']['audio']
+ # From https://www3.nhk.or.jp/nhkworld/common/player/radio/inline/rod.html
+ audio_path = remove_end(episode['audio']['audio'], '.m4a')
info['formats'] = self._extract_m3u8_formats(
- 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path,
+ f'{urljoin("https://vod-stream.nhk.jp", audio_path)}/index.m3u8',
episode_id, 'm4a', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False)
for f in info['formats']:
@@ -155,9 +156,11 @@ class NhkBaseIE(InfoExtractor):
class NhkVodIE(NhkBaseIE):
- # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], eg
- _VALID_URL = [rf'{NhkBaseIE._BASE_URL_REGEX}/(?P<type>video)/(?P<id>[0-9a-z]+)',
- rf'{NhkBaseIE._BASE_URL_REGEX}/(?P<type>audio)/(?P<id>[^/?#]+?-\d{{8}}-[0-9a-z]+)']
+ _VALID_URL = [
+ rf'{NhkBaseIE._BASE_URL_REGEX}shows/(?:(?P<type>video)/)?(?P<id>\d{{4}}[\da-z]\d+)/?(?:$|[?#])',
+ rf'{NhkBaseIE._BASE_URL_REGEX}(?:ondemand|shows)/(?P<type>audio)/(?P<id>[^/?#]+?-\d{{8}}-[\da-z]+)',
+ rf'{NhkBaseIE._BASE_URL_REGEX}ondemand/(?P<type>video)/(?P<id>\d{{4}}[\da-z]\d+)', # deprecated
+ ]
# Content available only for a limited period of time. Visit
# https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
_TESTS = [{
@@ -167,17 +170,16 @@ class NhkVodIE(NhkBaseIE):
'ext': 'mp4',
'title': 'Japan Railway Journal - The Tohoku Shinkansen: Full Speed Ahead',
'description': 'md5:49f7c5b206e03868a2fdf0d0814b92f6',
- 'thumbnail': 'md5:51bcef4a21936e7fea1ff4e06353f463',
+ 'thumbnail': r're:https://.+/.+\.jpg',
'episode': 'The Tohoku Shinkansen: Full Speed Ahead',
'series': 'Japan Railway Journal',
- 'modified_timestamp': 1694243656,
+ 'modified_timestamp': 1707217907,
'timestamp': 1681428600,
'release_timestamp': 1693883728,
'duration': 1679,
'upload_date': '20230413',
- 'modified_date': '20230909',
+ 'modified_date': '20240206',
'release_date': '20230905',
-
},
}, {
# video clip
@@ -188,15 +190,15 @@ class NhkVodIE(NhkBaseIE):
'ext': 'mp4',
'title': 'Dining with the Chef - Chef Saito\'s Family recipe: MENCHI-KATSU',
'description': 'md5:5aee4a9f9d81c26281862382103b0ea5',
- 'thumbnail': 'md5:d6a4d9b6e9be90aaadda0bcce89631ed',
+ 'thumbnail': r're:https://.+/.+\.jpg',
'series': 'Dining with the Chef',
'episode': 'Chef Saito\'s Family recipe: MENCHI-KATSU',
'duration': 148,
'upload_date': '20190816',
'release_date': '20230902',
'release_timestamp': 1693619292,
- 'modified_timestamp': 1694168033,
- 'modified_date': '20230908',
+ 'modified_timestamp': 1707217907,
+ 'modified_date': '20240206',
'timestamp': 1565997540,
},
}, {
@@ -208,7 +210,7 @@ class NhkVodIE(NhkBaseIE):
'title': 'Living in Japan - Tips for Travelers to Japan / Ramen Vending Machines',
'series': 'Living in Japan',
'description': 'md5:0a0e2077d8f07a03071e990a6f51bfab',
- 'thumbnail': 'md5:960622fb6e06054a4a1a0c97ea752545',
+ 'thumbnail': r're:https://.+/.+\.jpg',
'episode': 'Tips for Travelers to Japan / Ramen Vending Machines'
},
}, {
@@ -245,7 +247,7 @@ class NhkVodIE(NhkBaseIE):
'title': 'おはよう日本(7時台) - 10月8日放送',
'series': 'おはよう日本(7時台)',
'episode': '10月8日放送',
- 'thumbnail': 'md5:d733b1c8e965ab68fb02b2d347d0e9b4',
+ 'thumbnail': r're:https://.+/.+\.jpg',
'description': 'md5:9c1d6cbeadb827b955b20e99ab920ff0',
},
'skip': 'expires 2023-10-15',
@@ -255,17 +257,100 @@ class NhkVodIE(NhkBaseIE):
'info_dict': {
'id': 'nw_vod_v_en_3004_952_20230723091000_01_1690074552',
'ext': 'mp4',
- 'title': 'Barakan Discovers AMAMI OSHIMA: Isson\'s Treasure Island',
+ 'title': 'Barakan Discovers - AMAMI OSHIMA: Isson\'s Treasure Isla',
'description': 'md5:5db620c46a0698451cc59add8816b797',
- 'thumbnail': 'md5:67d9ff28009ba379bfa85ad1aaa0e2bd',
+ 'thumbnail': r're:https://.+/.+\.jpg',
'release_date': '20230905',
'timestamp': 1690103400,
'duration': 2939,
'release_timestamp': 1693898699,
- 'modified_timestamp': 1698057495,
- 'modified_date': '20231023',
'upload_date': '20230723',
+ 'modified_timestamp': 1707217907,
+ 'modified_date': '20240206',
+ 'episode': 'AMAMI OSHIMA: Isson\'s Treasure Isla',
+ 'series': 'Barakan Discovers',
+ },
+ }, {
+        # /ondemand/video/ url with an alphabetical character in the 5th position of the id
+ 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a07/',
+ 'info_dict': {
+ 'id': 'nw_c_en_9999-a07',
+ 'ext': 'mp4',
+ 'episode': 'Mini-Dramas on SDGs: Ep 1 Close the Gender Gap [Director\'s Cut]',
+ 'series': 'Mini-Dramas on SDGs',
+ 'modified_date': '20240206',
+ 'title': 'Mini-Dramas on SDGs - Mini-Dramas on SDGs: Ep 1 Close the Gender Gap [Director\'s Cut]',
+ 'description': 'md5:3f9dcb4db22fceb675d90448a040d3f6',
+ 'timestamp': 1621962360,
+ 'duration': 189,
+ 'release_date': '20230903',
+ 'modified_timestamp': 1707217907,
+ 'upload_date': '20210525',
+ 'thumbnail': r're:https://.+/.+\.jpg',
+ 'release_timestamp': 1693713487,
+ },
+ }, {
+ 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999d17/',
+ 'info_dict': {
+ 'id': 'nw_c_en_9999-d17',
+ 'ext': 'mp4',
+ 'title': 'Flowers of snow blossom - The 72 Pentads of Yamato',
+ 'description': 'Today’s focus: Snow',
+ 'release_timestamp': 1693792402,
+ 'release_date': '20230904',
+ 'upload_date': '20220128',
+ 'timestamp': 1643370960,
+ 'thumbnail': r're:https://.+/.+\.jpg',
+ 'duration': 136,
+ 'series': '',
+ 'modified_date': '20240206',
+ 'modified_timestamp': 1707217907,
+ },
+ }, {
+ # new /shows/ url format
+ 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/2032307/',
+ 'info_dict': {
+ 'id': 'nw_vod_v_en_2032_307_20240321113000_01_1710990282',
+ 'ext': 'mp4',
+ 'title': 'Japanology Plus - 20th Anniversary Special Part 1',
+ 'description': 'md5:817d41fc8e54339ad2a916161ea24faf',
+ 'episode': '20th Anniversary Special Part 1',
+ 'series': 'Japanology Plus',
+ 'thumbnail': r're:https://.+/.+\.jpg',
+ 'duration': 1680,
+ 'timestamp': 1711020600,
+ 'upload_date': '20240321',
+ 'release_timestamp': 1711022683,
+ 'release_date': '20240321',
+ 'modified_timestamp': 1711031012,
+ 'modified_date': '20240321',
+ },
+ }, {
+ 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/3020025/',
+ 'info_dict': {
+ 'id': 'nw_vod_v_en_3020_025_20230325144000_01_1679723944',
+ 'ext': 'mp4',
+ 'title': '100 Ideas to Save the World - Working Styles Evolve',
+ 'description': 'md5:9e6c7778eaaf4f7b4af83569649f84d9',
+ 'episode': 'Working Styles Evolve',
+ 'series': '100 Ideas to Save the World',
+ 'thumbnail': r're:https://.+/.+\.jpg',
+ 'duration': 899,
+ 'upload_date': '20230325',
+ 'timestamp': 1679755200,
+ 'release_date': '20230905',
+ 'release_timestamp': 1693880540,
+ 'modified_date': '20240206',
+ 'modified_timestamp': 1707217907,
},
+ }, {
+ # new /shows/audio/ url format
+ 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/audio/livinginjapan-20231001-1/',
+ 'only_matching': True,
+ }, {
+        # valid url even if it can't be found in the wild; support needed for clip entries extraction
+ 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/9999o80/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -273,18 +358,21 @@ class NhkVodIE(NhkBaseIE):
class NhkVodProgramIE(NhkBaseIE):
- _VALID_URL = rf'{NhkBaseIE._BASE_URL_REGEX}/program{NhkBaseIE._TYPE_REGEX}(?P<id>\w+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?'
+ _VALID_URL = rf'''(?x)
+ {NhkBaseIE._BASE_URL_REGEX}(?:shows|tv)/
+ (?:(?P<type>audio)/programs/)?(?P<id>\w+)/?
+ (?:\?(?:[^#]+&)?type=(?P<episode_type>clip|(?:radio|tv)Episode))?'''
_TESTS = [{
# video program episodes
- 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/sumo',
+ 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/sumo/',
'info_dict': {
'id': 'sumo',
'title': 'GRAND SUMO Highlights',
'description': 'md5:fc20d02dc6ce85e4b72e0273aa52fdbf',
},
- 'playlist_mincount': 0,
+ 'playlist_mincount': 1,
}, {
- 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway',
+ 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/japanrailway/',
'info_dict': {
'id': 'japanrailway',
'title': 'Japan Railway Journal',
@@ -293,40 +381,68 @@ class NhkVodProgramIE(NhkBaseIE):
'playlist_mincount': 12,
}, {
# video program clips
- 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip',
+ 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/japanrailway/?type=clip',
'info_dict': {
'id': 'japanrailway',
'title': 'Japan Railway Journal',
'description': 'md5:ea39d93af7d05835baadf10d1aae0e3f',
},
- 'playlist_mincount': 5,
- }, {
- 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/',
- 'only_matching': True,
+ 'playlist_mincount': 12,
}, {
# audio program
- 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/audio/listener/',
+ 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/audio/programs/livinginjapan/',
+ 'info_dict': {
+ 'id': 'livinginjapan',
+ 'title': 'Living in Japan',
+ 'description': 'md5:665bb36ec2a12c5a7f598ee713fc2b54',
+ },
+ 'playlist_mincount': 12,
+ }, {
+ # /tv/ program url
+ 'url': 'https://www3.nhk.or.jp/nhkworld/en/tv/designtalksplus/',
+ 'info_dict': {
+ 'id': 'designtalksplus',
+ 'title': 'DESIGN TALKS plus',
+ 'description': 'md5:47b3b3a9f10d4ac7b33b53b70a7d2837',
+ },
+ 'playlist_mincount': 20,
+ }, {
+ 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/10yearshayaomiyazaki/',
'only_matching': True,
}]
+ @classmethod
+ def suitable(cls, url):
+ return False if NhkVodIE.suitable(url) else super().suitable(url)
+
+ def _extract_meta_from_class_elements(self, class_values, html):
+ for class_value in class_values:
+ if value := clean_html(get_element_by_class(class_value, html)):
+ return value
+
def _real_extract(self, url):
lang, m_type, program_id, episode_type = self._match_valid_url(url).group('lang', 'type', 'id', 'episode_type')
episodes = self._call_api(
- program_id, lang, m_type == 'video', False, episode_type == 'clip')
+ program_id, lang, m_type != 'audio', False, episode_type == 'clip')
- entries = []
- for episode in episodes:
- episode_path = episode.get('url')
- if not episode_path:
- continue
- entries.append(self._extract_episode_info(
- urljoin(url, episode_path), episode))
+ def entries():
+ for episode in episodes:
+ if episode_path := episode.get('url'):
+ yield self._extract_episode_info(urljoin(url, episode_path), episode)
html = self._download_webpage(url, program_id)
- program_title = clean_html(get_element_by_class('p-programDetail__title', html))
- program_description = clean_html(get_element_by_class('p-programDetail__text', html))
-
- return self.playlist_result(entries, program_id, program_title, program_description)
+ program_title = self._extract_meta_from_class_elements([
+ 'p-programDetail__title', # /ondemand/program/
+ 'pProgramHero__logoText', # /shows/
+ 'tAudioProgramMain__title', # /shows/audio/programs/
+ 'p-program-name'], html) # /tv/
+ program_description = self._extract_meta_from_class_elements([
+ 'p-programDetail__text', # /ondemand/program/
+ 'pProgramHero__description', # /shows/
+ 'tAudioProgramMain__info', # /shows/audio/programs/
+ 'p-program-description'], html) # /tv/
+
+ return self.playlist_result(entries(), program_id, program_title, program_description)
class NhkForSchoolBangumiIE(InfoExtractor):
diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py
index 6a46246..b04ce96 100644
--- a/yt_dlp/extractor/niconico.py
+++ b/yt_dlp/extractor/niconico.py
@@ -1,11 +1,10 @@
-import datetime
+import datetime as dt
import functools
import itertools
import json
import re
import time
-
-from urllib.parse import urlparse
+import urllib.parse
from .common import InfoExtractor, SearchInfoExtractor
from ..networking import Request
@@ -820,12 +819,12 @@ class NicovideoSearchDateIE(NicovideoSearchBaseIE, SearchInfoExtractor):
'playlist_mincount': 1610,
}]
- _START_DATE = datetime.date(2007, 1, 1)
+ _START_DATE = dt.date(2007, 1, 1)
_RESULTS_PER_PAGE = 32
_MAX_PAGES = 50
def _entries(self, url, item_id, start_date=None, end_date=None):
- start_date, end_date = start_date or self._START_DATE, end_date or datetime.datetime.now().date()
+ start_date, end_date = start_date or self._START_DATE, end_date or dt.datetime.now().date()
# If the last page has a full page of videos, we need to break down the query interval further
last_page_len = len(list(self._get_entries_for_date(
@@ -957,7 +956,7 @@ class NiconicoLiveIE(InfoExtractor):
'frontend_id': traverse_obj(embedded_data, ('site', 'frontendId')) or '9',
})
- hostname = remove_start(urlparse(urlh.url).hostname, 'sp.')
+ hostname = remove_start(urllib.parse.urlparse(urlh.url).hostname, 'sp.')
latency = try_get(self._configuration_arg('latency'), lambda x: x[0])
if latency not in self._KNOWN_LATENCY:
latency = 'high'
diff --git a/yt_dlp/extractor/panopto.py b/yt_dlp/extractor/panopto.py
index ddea32d..63c5fd6 100644
--- a/yt_dlp/extractor/panopto.py
+++ b/yt_dlp/extractor/panopto.py
@@ -1,8 +1,8 @@
import calendar
-import json
+import datetime as dt
import functools
-from datetime import datetime, timezone
-from random import random
+import json
+import random
from .common import InfoExtractor
from ..compat import (
@@ -243,7 +243,7 @@ class PanoptoIE(PanoptoBaseIE):
invocation_id = delivery_info.get('InvocationId')
stream_id = traverse_obj(delivery_info, ('Delivery', 'Streams', ..., 'PublicID'), get_all=False, expected_type=str)
if invocation_id and stream_id and duration:
- timestamp_str = f'/Date({calendar.timegm(datetime.now(timezone.utc).timetuple())}000)/'
+ timestamp_str = f'/Date({calendar.timegm(dt.datetime.now(dt.timezone.utc).timetuple())}000)/'
data = {
'streamRequests': [
{
@@ -415,7 +415,7 @@ class PanoptoIE(PanoptoBaseIE):
'cast': traverse_obj(delivery, ('Contributors', ..., 'DisplayName'), expected_type=lambda x: x or None),
'timestamp': session_start_time - 11640000000 if session_start_time else None,
'duration': delivery.get('Duration'),
- 'thumbnail': base_url + f'/Services/FrameGrabber.svc/FrameRedirect?objectId={video_id}&mode=Delivery&random={random()}',
+ 'thumbnail': base_url + f'/Services/FrameGrabber.svc/FrameRedirect?objectId={video_id}&mode=Delivery&random={random.random()}',
'average_rating': delivery.get('AverageRating'),
'chapters': self._extract_chapters(timestamps),
'uploader': delivery.get('OwnerDisplayName') or None,
diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py
index d2ddb72..d4f822f 100644
--- a/yt_dlp/extractor/patreon.py
+++ b/yt_dlp/extractor/patreon.py
@@ -92,7 +92,7 @@ class PatreonIE(PatreonBaseIE):
'thumbnail': 're:^https?://.*$',
'upload_date': '20150211',
'description': 'md5:8af6425f50bd46fbf29f3db0fc3a8364',
- 'uploader_id': 'TraciJHines',
+ 'uploader_id': '@TraciHinesMusic',
'categories': ['Entertainment'],
'duration': 282,
'view_count': int,
@@ -106,8 +106,10 @@ class PatreonIE(PatreonBaseIE):
'availability': 'public',
'channel_follower_count': int,
'playable_in_embed': True,
- 'uploader_url': 'http://www.youtube.com/user/TraciJHines',
+ 'uploader_url': 'https://www.youtube.com/@TraciHinesMusic',
'comment_count': int,
+ 'channel_is_verified': True,
+ 'chapters': 'count:4',
},
'params': {
'noplaylist': True,
@@ -176,6 +178,27 @@ class PatreonIE(PatreonBaseIE):
'uploader_url': 'https://www.patreon.com/thenormies',
},
'skip': 'Patron-only content',
+ }, {
+ # dead vimeo and embed URLs, need to extract post_file
+ 'url': 'https://www.patreon.com/posts/hunter-x-hunter-34007913',
+ 'info_dict': {
+ 'id': '34007913',
+ 'ext': 'mp4',
+ 'title': 'Hunter x Hunter | Kurapika DESTROYS Uvogin!!!',
+ 'like_count': int,
+ 'uploader': 'YaBoyRoshi',
+ 'timestamp': 1581636833,
+ 'channel_url': 'https://www.patreon.com/yaboyroshi',
+ 'thumbnail': r're:^https?://.*$',
+ 'tags': ['Hunter x Hunter'],
+ 'uploader_id': '14264111',
+ 'comment_count': int,
+ 'channel_follower_count': int,
+ 'description': 'Kurapika is a walking cheat code!',
+ 'upload_date': '20200213',
+ 'channel_id': '2147162',
+ 'uploader_url': 'https://www.patreon.com/yaboyroshi',
+ },
}]
def _real_extract(self, url):
@@ -250,20 +273,13 @@ class PatreonIE(PatreonBaseIE):
v_url = url_or_none(compat_urllib_parse_unquote(
self._search_regex(r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)', embed_html, 'vimeo url', fatal=False)))
if v_url:
- return {
- **info,
- '_type': 'url_transparent',
- 'url': VimeoIE._smuggle_referrer(v_url, 'https://patreon.com'),
- 'ie_key': 'Vimeo',
- }
+ v_url = VimeoIE._smuggle_referrer(v_url, 'https://patreon.com')
+ if self._request_webpage(v_url, video_id, 'Checking Vimeo embed URL', fatal=False, errnote=False):
+ return self.url_result(v_url, VimeoIE, url_transparent=True, **info)
embed_url = try_get(attributes, lambda x: x['embed']['url'])
- if embed_url:
- return {
- **info,
- '_type': 'url',
- 'url': embed_url,
- }
+ if embed_url and self._request_webpage(embed_url, video_id, 'Checking embed URL', fatal=False, errnote=False):
+ return self.url_result(embed_url, **info)
post_file = traverse_obj(attributes, 'post_file')
if post_file:
diff --git a/yt_dlp/extractor/polsatgo.py b/yt_dlp/extractor/polsatgo.py
index 1524a1f..1cebb36 100644
--- a/yt_dlp/extractor/polsatgo.py
+++ b/yt_dlp/extractor/polsatgo.py
@@ -1,5 +1,5 @@
-from uuid import uuid4
import json
+import uuid
from .common import InfoExtractor
from ..utils import (
@@ -51,7 +51,7 @@ class PolsatGoIE(InfoExtractor):
}
def _call_api(self, endpoint, media_id, method, params):
- rand_uuid = str(uuid4())
+ rand_uuid = str(uuid.uuid4())
res = self._download_json(
f'https://b2c-mobile.redefine.pl/rpc/{endpoint}/', media_id,
note=f'Downloading {method} JSON metadata',
diff --git a/yt_dlp/extractor/pr0gramm.py b/yt_dlp/extractor/pr0gramm.py
index 66f8a5f..3e0ccba 100644
--- a/yt_dlp/extractor/pr0gramm.py
+++ b/yt_dlp/extractor/pr0gramm.py
@@ -1,5 +1,6 @@
+import datetime as dt
import json
-from urllib.parse import unquote
+import urllib.parse
from .common import InfoExtractor
from ..compat import functools
@@ -114,7 +115,7 @@ class Pr0grammIE(InfoExtractor):
cookies = self._get_cookies(self.BASE_URL)
if 'me' not in cookies:
self._download_webpage(self.BASE_URL, None, 'Refreshing verification information')
- if traverse_obj(cookies, ('me', {lambda x: x.value}, {unquote}, {json.loads}, 'verified')):
+ if traverse_obj(cookies, ('me', {lambda x: x.value}, {urllib.parse.unquote}, {json.loads}, 'verified')):
flags |= 0b00110
return flags
@@ -196,6 +197,7 @@ class Pr0grammIE(InfoExtractor):
'like_count': ('up', {int}),
'dislike_count': ('down', {int}),
'timestamp': ('created', {int}),
+ 'upload_date': ('created', {int}, {dt.date.fromtimestamp}, {lambda x: x.strftime('%Y%m%d')}),
'thumbnail': ('thumb', {lambda x: urljoin('https://thumb.pr0gramm.com', x)})
}),
}
diff --git a/yt_dlp/extractor/prosiebensat1.py b/yt_dlp/extractor/prosiebensat1.py
index 46e2e8a..4c33bae 100644
--- a/yt_dlp/extractor/prosiebensat1.py
+++ b/yt_dlp/extractor/prosiebensat1.py
@@ -1,6 +1,6 @@
+import hashlib
import re
-from hashlib import sha1
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
@@ -42,7 +42,7 @@ class ProSiebenSat1BaseIE(InfoExtractor):
'Downloading protocols JSON',
headers=self.geo_verification_headers(), query={
'access_id': self._ACCESS_ID,
- 'client_token': sha1((raw_ct).encode()).hexdigest(),
+ 'client_token': hashlib.sha1((raw_ct).encode()).hexdigest(),
'video_id': clip_id,
}, fatal=False, expected_status=(403,)) or {}
error = protocols.get('error') or {}
@@ -53,7 +53,7 @@ class ProSiebenSat1BaseIE(InfoExtractor):
urls = (self._download_json(
self._V4_BASE_URL + 'urls', clip_id, 'Downloading urls JSON', query={
'access_id': self._ACCESS_ID,
- 'client_token': sha1((raw_ct + server_token + self._SUPPORTED_PROTOCOLS).encode()).hexdigest(),
+ 'client_token': hashlib.sha1((raw_ct + server_token + self._SUPPORTED_PROTOCOLS).encode()).hexdigest(),
'protocols': self._SUPPORTED_PROTOCOLS,
'server_token': server_token,
'video_id': clip_id,
@@ -77,7 +77,7 @@ class ProSiebenSat1BaseIE(InfoExtractor):
if not formats:
source_ids = [compat_str(source['id']) for source in video['sources']]
- client_id = self._SALT[:2] + sha1(''.join([clip_id, self._SALT, self._TOKEN, client_location, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest()
+ client_id = self._SALT[:2] + hashlib.sha1(''.join([clip_id, self._SALT, self._TOKEN, client_location, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest()
sources = self._download_json(
'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id,
@@ -96,7 +96,7 @@ class ProSiebenSat1BaseIE(InfoExtractor):
return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate
for source_id in source_ids:
- client_id = self._SALT[:2] + sha1(''.join([self._SALT, clip_id, self._TOKEN, server_id, client_location, source_id, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest()
+ client_id = self._SALT[:2] + hashlib.sha1(''.join([self._SALT, clip_id, self._TOKEN, server_id, client_location, source_id, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest()
urls = self._download_json(
'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id,
clip_id, 'Downloading urls JSON', fatal=False, query={
diff --git a/yt_dlp/extractor/radiokapital.py b/yt_dlp/extractor/radiokapital.py
index 8f9737a..5d7d3dd 100644
--- a/yt_dlp/extractor/radiokapital.py
+++ b/yt_dlp/extractor/radiokapital.py
@@ -1,18 +1,14 @@
-from .common import InfoExtractor
-from ..utils import (
- clean_html,
- traverse_obj,
- unescapeHTML,
-)
-
import itertools
-from urllib.parse import urlencode
+import urllib.parse
+
+from .common import InfoExtractor
+from ..utils import clean_html, traverse_obj, unescapeHTML
class RadioKapitalBaseIE(InfoExtractor):
def _call_api(self, resource, video_id, note='Downloading JSON metadata', qs={}):
return self._download_json(
- f'https://www.radiokapital.pl/wp-json/kapital/v1/{resource}?{urlencode(qs)}',
+ f'https://www.radiokapital.pl/wp-json/kapital/v1/{resource}?{urllib.parse.urlencode(qs)}',
video_id, note=note)
def _parse_episode(self, data):
diff --git a/yt_dlp/extractor/rokfin.py b/yt_dlp/extractor/rokfin.py
index 5099f3a..3bc5f3c 100644
--- a/yt_dlp/extractor/rokfin.py
+++ b/yt_dlp/extractor/rokfin.py
@@ -1,8 +1,8 @@
+import datetime as dt
import itertools
import json
import re
import urllib.parse
-from datetime import datetime
from .common import InfoExtractor, SearchInfoExtractor
from ..utils import (
@@ -156,7 +156,7 @@ class RokfinIE(InfoExtractor):
self.raise_login_required('This video is only available to premium users', True, method='cookies')
elif scheduled:
self.raise_no_formats(
- f'Stream is offline; scheduled for {datetime.fromtimestamp(scheduled).strftime("%Y-%m-%d %H:%M:%S")}',
+ f'Stream is offline; scheduled for {dt.datetime.fromtimestamp(scheduled).strftime("%Y-%m-%d %H:%M:%S")}',
video_id=video_id, expected=True)
uploader = traverse_obj(metadata, ('createdBy', 'username'), ('creator', 'username'))
diff --git a/yt_dlp/extractor/sejmpl.py b/yt_dlp/extractor/sejmpl.py
index 29cb015..eb433d2 100644
--- a/yt_dlp/extractor/sejmpl.py
+++ b/yt_dlp/extractor/sejmpl.py
@@ -1,4 +1,4 @@
-import datetime
+import datetime as dt
from .common import InfoExtractor
from .redge import RedCDNLivxIE
@@ -13,16 +13,16 @@ from ..utils.traversal import traverse_obj
def is_dst(date):
- last_march = datetime.datetime(date.year, 3, 31)
- last_october = datetime.datetime(date.year, 10, 31)
- last_sunday_march = last_march - datetime.timedelta(days=last_march.isoweekday() % 7)
- last_sunday_october = last_october - datetime.timedelta(days=last_october.isoweekday() % 7)
+ last_march = dt.datetime(date.year, 3, 31)
+ last_october = dt.datetime(date.year, 10, 31)
+ last_sunday_march = last_march - dt.timedelta(days=last_march.isoweekday() % 7)
+ last_sunday_october = last_october - dt.timedelta(days=last_october.isoweekday() % 7)
return last_sunday_march.replace(hour=2) <= date <= last_sunday_october.replace(hour=3)
def rfc3339_to_atende(date):
- date = datetime.datetime.fromisoformat(date)
- date = date + datetime.timedelta(hours=1 if is_dst(date) else 0)
+ date = dt.datetime.fromisoformat(date)
+ date = date + dt.timedelta(hours=1 if is_dst(date) else 0)
return int((date.timestamp() - 978307200) * 1000)
diff --git a/yt_dlp/extractor/sharepoint.py b/yt_dlp/extractor/sharepoint.py
new file mode 100644
index 0000000..d4d5af0
--- /dev/null
+++ b/yt_dlp/extractor/sharepoint.py
@@ -0,0 +1,112 @@
+import json
+import urllib.parse
+
+from .common import InfoExtractor
+from ..utils import determine_ext, int_or_none, url_or_none
+from ..utils.traversal import traverse_obj
+
+
+class SharePointIE(InfoExtractor):
+ _BASE_URL_RE = r'https?://[\w-]+\.sharepoint\.com/'
+ _VALID_URL = [
+ rf'{_BASE_URL_RE}:v:/[a-z]/(?:[^/?#]+/)*(?P<id>[^/?#]{{46}})/?(?:$|[?#])',
+ rf'{_BASE_URL_RE}(?!:v:)(?:[^/?#]+/)*stream\.aspx\?(?:[^#]+&)?id=(?P<id>[^&#]+)',
+ ]
+ _TESTS = [{
+ 'url': 'https://lut-my.sharepoint.com/:v:/g/personal/juha_eerola_student_lab_fi/EUrAmrktb4ZMhUcY9J2PqMEBD_9x_l0DyYWVgAvp-TTOMw?e=ZpQOOw',
+ 'md5': '2950821d0d4937a0a76373782093b435',
+ 'info_dict': {
+ 'id': '01EQRS7EKKYCNLSLLPQZGIKRYY6SOY7KGB',
+ 'display_id': 'EUrAmrktb4ZMhUcY9J2PqMEBD_9x_l0DyYWVgAvp-TTOMw',
+ 'ext': 'mp4',
+ 'title': 'CmvpJST',
+ 'duration': 54.567,
+ 'thumbnail': r're:https://.+/thumbnail',
+ 'uploader_id': '8dcec565-a956-4b91-95e5-bacfb8bc015f',
+ },
+ }, {
+ 'url': 'https://greaternyace.sharepoint.com/:v:/s/acementornydrive/ETski5eAfNVEoPRZUAyy1wEBpLgVFYWso5bjbZjfBLlPUg?e=PQUfVb',
+ 'md5': 'c496a01644223273bff12e93e501afd1',
+ 'info_dict': {
+ 'id': '01QI4AVTZ3ESFZPAD42VCKB5CZKAGLFVYB',
+ 'display_id': 'ETski5eAfNVEoPRZUAyy1wEBpLgVFYWso5bjbZjfBLlPUg',
+ 'ext': 'mp4',
+ 'title': '930103681233985536',
+ 'duration': 3797.326,
+ 'thumbnail': r're:https://.+/thumbnail',
+ },
+ }, {
+ 'url': 'https://lut-my.sharepoint.com/personal/juha_eerola_student_lab_fi/_layouts/15/stream.aspx?id=%2Fpersonal%2Fjuha_eerola_student_lab_fi%2FDocuments%2FM-DL%2FCmvpJST.mp4&ga=1&referrer=StreamWebApp.Web&referrerScenario=AddressBarCopied.view',
+ 'info_dict': {
+ 'id': '01EQRS7EKKYCNLSLLPQZGIKRYY6SOY7KGB',
+ 'display_id': '/personal/juha_eerola_student_lab_fi/Documents/M-DL/CmvpJST.mp4',
+ 'ext': 'mp4',
+ 'title': 'CmvpJST',
+ 'duration': 54.567,
+ 'thumbnail': r're:https://.+/thumbnail',
+ 'uploader_id': '8dcec565-a956-4b91-95e5-bacfb8bc015f',
+ },
+ 'skip': 'Session cookies needed',
+ }, {
+ 'url': 'https://izoobasisschool.sharepoint.com/:v:/g/Eaqleq8COVBIvIPvod0U27oBypC6aWOkk8ptuDpmJ6arHw',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://uskudaredutr-my.sharepoint.com/:v:/g/personal/songul_turkaydin_uskudar_edu_tr/EbTf-VRUIbtGuIN73tx1MuwBCHBOmNcWNqSLw61Fd2_o0g?e=n5Vkof',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://epam-my.sharepoint.com/:v:/p/dzmitry_tamashevich/Ec4ZOs-rATZHjFYZWVxjczEB649FCoYFKDV_x3RxZiWAGA?e=4hswgA',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://microsoft.sharepoint.com/:v:/t/MicrosoftSPARKRecordings-MSFTInternal/EWCyeqByVWBAt8wDvNZdV-UB0BvU5YVbKm0UHgdrUlI6dg?e=QbPck6',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ display_id = urllib.parse.unquote(self._match_id(url))
+ webpage, urlh = self._download_webpage_handle(url, display_id)
+ if urllib.parse.urlparse(urlh.url).hostname == 'login.microsoftonline.com':
+ self.raise_login_required(
+ 'Session cookies are required for this URL and can be passed '
+ 'with the --cookies option. The --cookies-from-browser option will not work', method=None)
+
+ video_data = self._search_json(r'g_fileInfo\s*=', webpage, 'player config', display_id)
+ video_id = video_data['VroomItemId']
+
+ parsed_url = urllib.parse.urlparse(video_data['.transformUrl'])
+ base_media_url = urllib.parse.urlunparse(parsed_url._replace(
+ path=urllib.parse.urljoin(f'{parsed_url.path}/', '../videomanifest'),
+ query=urllib.parse.urlencode({
+ **urllib.parse.parse_qs(parsed_url.query),
+ 'cTag': video_data['.ctag'],
+ 'action': 'Access',
+ 'part': 'index',
+ }, doseq=True)))
+
+ # Web player adds more params to the format URLs but we still get all formats without them
+ formats = self._extract_mpd_formats(
+ base_media_url, video_id, mpd_id='dash', query={'format': 'dash'}, fatal=False)
+ for hls_type in ('hls', 'hls-vnext'):
+ formats.extend(self._extract_m3u8_formats(
+ base_media_url, video_id, 'mp4', m3u8_id=hls_type,
+ query={'format': hls_type}, fatal=False, quality=-2))
+
+ if video_url := traverse_obj(video_data, ('downloadUrl', {url_or_none})):
+ formats.append({
+ 'url': video_url,
+ 'ext': determine_ext(video_data.get('extension') or video_data.get('name')),
+ 'quality': 1,
+ 'format_id': 'source',
+ 'filesize': int_or_none(video_data.get('size')),
+ 'vcodec': 'none' if video_data.get('isAudio') is True else None,
+ })
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': video_data.get('title') or video_data.get('displayName'),
+ 'display_id': display_id,
+ 'uploader_id': video_data.get('authorId'),
+ 'duration': traverse_obj(video_data, (
+ 'MediaServiceFastMetadata', {json.loads}, 'media', 'duration', {lambda x: x / 10000000})),
+ 'thumbnail': url_or_none(video_data.get('thumbnailUrl')),
+ }
diff --git a/yt_dlp/extractor/sonyliv.py b/yt_dlp/extractor/sonyliv.py
index 4379572..7c914ac 100644
--- a/yt_dlp/extractor/sonyliv.py
+++ b/yt_dlp/extractor/sonyliv.py
@@ -1,4 +1,5 @@
-import datetime
+import datetime as dt
+import itertools
import json
import math
import random
@@ -12,8 +13,8 @@ from ..utils import (
int_or_none,
jwt_decode_hs256,
try_call,
- try_get,
)
+from ..utils.traversal import traverse_obj
class SonyLIVIE(InfoExtractor):
@@ -93,7 +94,7 @@ class SonyLIVIE(InfoExtractor):
'mobileNumber': username,
'channelPartnerID': 'MSMIND',
'country': 'IN',
- 'timestamp': datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%MZ'),
+ 'timestamp': dt.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%MZ'),
'otpSize': 6,
'loginType': 'REGISTERORSIGNIN',
'isMobileMandatory': True,
@@ -110,7 +111,7 @@ class SonyLIVIE(InfoExtractor):
'otp': self._get_tfa_info('OTP'),
'dmaId': 'IN',
'ageConfirmation': True,
- 'timestamp': datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%MZ'),
+ 'timestamp': dt.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%MZ'),
'isMobileMandatory': True,
}).encode())
if otp_verify_json['resultCode'] == 'KO':
@@ -183,17 +184,21 @@ class SonyLIVIE(InfoExtractor):
class SonyLIVSeriesIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/shows/[^/?#&]+-(?P<id>\d{10})$'
+ _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/shows/[^/?#&]+-(?P<id>\d{10})/?(?:$|[?#])'
_TESTS = [{
'url': 'https://www.sonyliv.com/shows/adaalat-1700000091',
- 'playlist_mincount': 456,
+ 'playlist_mincount': 452,
'info_dict': {
'id': '1700000091',
},
+ }, {
+ 'url': 'https://www.sonyliv.com/shows/beyhadh-1700000007/',
+ 'playlist_mincount': 358,
+ 'info_dict': {
+ 'id': '1700000007',
+ },
}]
- _API_SHOW_URL = "https://apiv2.sonyliv.com/AGL/1.9/R/ENG/WEB/IN/DL/DETAIL/{}?kids_safe=false&from=0&to=49"
- _API_EPISODES_URL = "https://apiv2.sonyliv.com/AGL/1.4/R/ENG/WEB/IN/CONTENT/DETAIL/BUNDLE/{}?from=0&to=1000&orderBy=episodeNumber&sortOrder=asc"
- _API_SECURITY_URL = 'https://apiv2.sonyliv.com/AGL/1.4/A/ENG/WEB/ALL/GETTOKEN'
+ _API_BASE = 'https://apiv2.sonyliv.com/AGL'
def _entries(self, show_id):
headers = {
@@ -201,19 +206,34 @@ class SonyLIVSeriesIE(InfoExtractor):
'Referer': 'https://www.sonyliv.com',
}
headers['security_token'] = self._download_json(
- self._API_SECURITY_URL, video_id=show_id, headers=headers,
- note='Downloading security token')['resultObj']
- seasons = try_get(
- self._download_json(self._API_SHOW_URL.format(show_id), video_id=show_id, headers=headers),
- lambda x: x['resultObj']['containers'][0]['containers'], list)
- for season in seasons or []:
- season_id = season['id']
- episodes = try_get(
- self._download_json(self._API_EPISODES_URL.format(season_id), video_id=season_id, headers=headers),
- lambda x: x['resultObj']['containers'][0]['containers'], list)
- for episode in episodes or []:
- video_id = episode.get('id')
- yield self.url_result('sonyliv:%s' % video_id, ie=SonyLIVIE.ie_key(), video_id=video_id)
+ f'{self._API_BASE}/1.4/A/ENG/WEB/ALL/GETTOKEN', show_id,
+ 'Downloading security token', headers=headers)['resultObj']
+ seasons = traverse_obj(self._download_json(
+ f'{self._API_BASE}/1.9/R/ENG/WEB/IN/DL/DETAIL/{show_id}', show_id,
+ 'Downloading series JSON', headers=headers, query={
+ 'kids_safe': 'false',
+ 'from': '0',
+ 'to': '49',
+ }), ('resultObj', 'containers', 0, 'containers', lambda _, v: int_or_none(v['id'])))
+ for season in seasons:
+ season_id = str(season['id'])
+ note = traverse_obj(season, ('metadata', 'title', {str})) or 'season'
+ cursor = 0
+ for page_num in itertools.count(1):
+ episodes = traverse_obj(self._download_json(
+ f'{self._API_BASE}/1.4/R/ENG/WEB/IN/CONTENT/DETAIL/BUNDLE/{season_id}',
+ season_id, f'Downloading {note} page {page_num} JSON', headers=headers, query={
+ 'from': str(cursor),
+ 'to': str(cursor + 99),
+ 'orderBy': 'episodeNumber',
+ 'sortOrder': 'asc',
+ }), ('resultObj', 'containers', 0, 'containers', lambda _, v: int_or_none(v['id'])))
+ if not episodes:
+ break
+ for episode in episodes:
+ video_id = str(episode['id'])
+ yield self.url_result(f'sonyliv:{video_id}', SonyLIVIE, video_id)
+ cursor += 100
def _real_extract(self, url):
show_id = self._match_id(url)
diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py
index a7c2afd..c9ed645 100644
--- a/yt_dlp/extractor/soundcloud.py
+++ b/yt_dlp/extractor/soundcloud.py
@@ -1,30 +1,27 @@
import itertools
-import re
import json
-# import random
+import re
-from .common import (
- InfoExtractor,
- SearchInfoExtractor
-)
+from .common import InfoExtractor, SearchInfoExtractor
from ..compat import compat_str
-from ..networking import HEADRequest, Request
+from ..networking import HEADRequest
from ..networking.exceptions import HTTPError
from ..utils import (
- error_to_compat_str,
+ KNOWN_EXTENSIONS,
ExtractorError,
+ error_to_compat_str,
float_or_none,
int_or_none,
- KNOWN_EXTENSIONS,
mimetype2ext,
parse_qs,
str_or_none,
- try_get,
+ try_call,
unified_timestamp,
update_url_query,
url_or_none,
urlhandle_detect_ext,
)
+from ..utils.traversal import traverse_obj
class SoundcloudEmbedIE(InfoExtractor):
@@ -54,7 +51,6 @@ class SoundcloudBaseIE(InfoExtractor):
_API_AUTH_QUERY_TEMPLATE = '?client_id=%s'
_API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s'
_API_VERIFY_AUTH_TOKEN = 'https://api-auth.soundcloud.com/connect/session%s'
- _access_token = None
_HEADERS = {}
_IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg'
@@ -112,21 +108,31 @@ class SoundcloudBaseIE(InfoExtractor):
def _initialize_pre_login(self):
self._CLIENT_ID = self.cache.load('soundcloud', 'client_id') or 'a3e059563d7fd3372b49b37f00a00bcf'
+ def _verify_oauth_token(self, token):
+ if self._request_webpage(
+ self._API_VERIFY_AUTH_TOKEN % (self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID),
+ None, note='Verifying login token...', fatal=False,
+ data=json.dumps({'session': {'access_token': token}}).encode()):
+ self._HEADERS['Authorization'] = f'OAuth {token}'
+ self.report_login()
+ else:
+ self.report_warning('Provided authorization token is invalid. Continuing as guest')
+
+ def _real_initialize(self):
+ if self._HEADERS:
+ return
+ if token := try_call(lambda: self._get_cookies(self._BASE_URL)['oauth_token'].value):
+ self._verify_oauth_token(token)
+
def _perform_login(self, username, password):
if username != 'oauth':
- self.report_warning(
+ raise ExtractorError(
'Login using username and password is not currently supported. '
- 'Use "--username oauth --password <oauth_token>" to login using an oauth token')
- self._access_token = password
- query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID
- payload = {'session': {'access_token': self._access_token}}
- token_verification = Request(self._API_VERIFY_AUTH_TOKEN % query, json.dumps(payload).encode('utf-8'))
- response = self._download_json(token_verification, None, note='Verifying login token...', fatal=False)
- if response is not False:
- self._HEADERS = {'Authorization': 'OAuth ' + self._access_token}
- self.report_login()
- else:
- self.report_warning('Provided authorization token seems to be invalid. Continue as guest')
+ 'Use "--username oauth --password <oauth_token>" to login using an oauth token, '
+ f'or else {self._login_hint(method="cookies")}', expected=True)
+ if self._HEADERS:
+ return
+ self._verify_oauth_token(password)
r'''
def genDevId():
@@ -147,14 +153,17 @@ class SoundcloudBaseIE(InfoExtractor):
'user_agent': self._USER_AGENT
}
- query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID
- login = sanitized_Request(self._API_AUTH_URL_PW % query, json.dumps(payload).encode('utf-8'))
- response = self._download_json(login, None)
- self._access_token = response.get('session').get('access_token')
- if not self._access_token:
- self.report_warning('Unable to get access token, login may has failed')
- else:
- self._HEADERS = {'Authorization': 'OAuth ' + self._access_token}
+ response = self._download_json(
+ self._API_AUTH_URL_PW % (self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID),
+ None, note='Verifying login token...', fatal=False,
+ data=json.dumps(payload).encode())
+
+ if token := traverse_obj(response, ('session', 'access_token', {str})):
+ self._HEADERS['Authorization'] = f'OAuth {token}'
+ self.report_login()
+ return
+
+ raise ExtractorError('Unable to get access token, login may have failed', expected=True)
'''
# signature generation
@@ -217,6 +226,7 @@ class SoundcloudBaseIE(InfoExtractor):
'filesize': int_or_none(urlh.headers.get('Content-Length')),
'url': format_url,
'quality': 10,
+ 'format_note': 'Original',
})
def invalid_url(url):
@@ -233,9 +243,13 @@ class SoundcloudBaseIE(InfoExtractor):
format_id_list.append(protocol)
ext = f.get('ext')
if ext == 'aac':
- f['abr'] = '256'
+ f.update({
+ 'abr': 256,
+ 'quality': 5,
+ 'format_note': 'Premium',
+ })
for k in ('ext', 'abr'):
- v = f.get(k)
+ v = str_or_none(f.get(k))
if v:
format_id_list.append(v)
preview = is_preview or re.search(r'/(?:preview|playlist)/0/30/', f['url'])
@@ -256,16 +270,25 @@ class SoundcloudBaseIE(InfoExtractor):
formats.append(f)
# New API
- transcodings = try_get(
- info, lambda x: x['media']['transcodings'], list) or []
- for t in transcodings:
- if not isinstance(t, dict):
- continue
- format_url = url_or_none(t.get('url'))
- if not format_url:
- continue
- stream = None if extract_flat else self._download_json(
- format_url, track_id, query=query, fatal=False, headers=self._HEADERS)
+ for t in traverse_obj(info, ('media', 'transcodings', lambda _, v: url_or_none(v['url']))):
+ if extract_flat:
+ break
+ format_url = t['url']
+ stream = None
+
+ for retry in self.RetryManager(fatal=False):
+ try:
+ stream = self._download_json(format_url, track_id, query=query, headers=self._HEADERS)
+ except ExtractorError as e:
+ if isinstance(e.cause, HTTPError) and e.cause.status == 429:
+ self.report_warning(
+ 'You have reached the API rate limit, which is ~600 requests per '
+ '10 minutes. Use the --extractor-retries and --retry-sleep options '
+ 'to configure an appropriate retry count and wait time', only_once=True)
+ retry.error = e.cause
+ else:
+ self.report_warning(e.msg)
+
if not isinstance(stream, dict):
continue
stream_url = url_or_none(stream.get('url'))
diff --git a/yt_dlp/extractor/telewebion.py b/yt_dlp/extractor/telewebion.py
index 9378ed0..5fdcddd 100644
--- a/yt_dlp/extractor/telewebion.py
+++ b/yt_dlp/extractor/telewebion.py
@@ -1,8 +1,7 @@
from __future__ import annotations
-
+import functools
import json
-from functools import partial
-from textwrap import dedent
+import textwrap
from .common import InfoExtractor
from ..utils import ExtractorError, format_field, int_or_none, parse_iso8601
@@ -10,7 +9,7 @@ from ..utils.traversal import traverse_obj
def _fmt_url(url):
- return partial(format_field, template=url, default=None)
+ return functools.partial(format_field, template=url, default=None)
class TelewebionIE(InfoExtractor):
@@ -88,7 +87,7 @@ class TelewebionIE(InfoExtractor):
if not video_id.startswith('0x'):
video_id = hex(int(video_id))
- episode_data = self._call_graphql_api('getEpisodeDetail', video_id, dedent('''
+ episode_data = self._call_graphql_api('getEpisodeDetail', video_id, textwrap.dedent('''
queryEpisode(filter: {EpisodeID: $EpisodeId}, first: 1) {
title
program {
@@ -127,7 +126,7 @@ class TelewebionIE(InfoExtractor):
'formats': (
'channel', 'descriptor', {str},
{_fmt_url(f'https://cdna.telewebion.com/%s/episode/{video_id}/playlist.m3u8')},
- {partial(self._extract_m3u8_formats, video_id=video_id, ext='mp4', m3u8_id='hls')}),
+ {functools.partial(self._extract_m3u8_formats, video_id=video_id, ext='mp4', m3u8_id='hls')}),
}))
info_dict['id'] = video_id
return info_dict
diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py
index a98275d..11cc570 100644
--- a/yt_dlp/extractor/tenplay.py
+++ b/yt_dlp/extractor/tenplay.py
@@ -1,7 +1,7 @@
import base64
+import datetime as dt
import functools
import itertools
-from datetime import datetime
from .common import InfoExtractor
from ..networking import HEADRequest
@@ -70,7 +70,7 @@ class TenPlayIE(InfoExtractor):
username, password = self._get_login_info()
if username is None or password is None:
self.raise_login_required('Your 10play account\'s details must be provided with --username and --password.')
- _timestamp = datetime.now().strftime('%Y%m%d000000')
+ _timestamp = dt.datetime.now().strftime('%Y%m%d000000')
_auth_header = base64.b64encode(_timestamp.encode('ascii')).decode('ascii')
data = self._download_json('https://10play.com.au/api/user/auth', video_id, 'Getting bearer token', headers={
'X-Network-Ten-Auth': _auth_header,
diff --git a/yt_dlp/extractor/thisoldhouse.py b/yt_dlp/extractor/thisoldhouse.py
index 15f8380..fbc12d5 100644
--- a/yt_dlp/extractor/thisoldhouse.py
+++ b/yt_dlp/extractor/thisoldhouse.py
@@ -1,5 +1,6 @@
import json
+from .brightcove import BrightcoveNewIE
from .common import InfoExtractor
from .zype import ZypeIE
from ..networking import HEADRequest
@@ -8,6 +9,7 @@ from ..utils import (
ExtractorError,
filter_dict,
parse_qs,
+ smuggle_url,
try_call,
urlencode_postdata,
)
@@ -17,24 +19,44 @@ class ThisOldHouseIE(InfoExtractor):
_NETRC_MACHINE = 'thisoldhouse'
_VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode|(?:[^/?#]+/)?\d+)/(?P<id>[^/?#]+)'
_TESTS = [{
+ # Unresolved Brightcove URL embed (formerly Zype), free
'url': 'https://www.thisoldhouse.com/furniture/21017078/how-to-build-a-storage-bench',
'info_dict': {
- 'id': '5dcdddf673c3f956ef5db202',
+ 'id': '6325298523112',
'ext': 'mp4',
'title': 'How to Build a Storage Bench',
'description': 'In the workshop, Tom Silva and Kevin O\'Connor build a storage bench for an entryway.',
- 'timestamp': 1442548800,
- 'upload_date': '20150918',
- 'duration': 674,
- 'view_count': int,
- 'average_rating': 0,
- 'thumbnail': r're:^https?://.*\.jpg\?\d+$',
- 'display_id': 'how-to-build-a-storage-bench',
+ 'timestamp': 1681793639,
+ 'upload_date': '20230418',
+ 'duration': 674.54,
+ 'tags': 'count:11',
+ 'uploader_id': '6314471934001',
+ 'thumbnail': r're:^https?://.*\.jpg',
},
'params': {
'skip_download': True,
},
}, {
+ # Brightcove embed, authwalled
+ 'url': 'https://www.thisoldhouse.com/glen-ridge-generational/99537/s45-e17-multi-generational',
+ 'info_dict': {
+ 'id': '6349675446112',
+ 'ext': 'mp4',
+ 'title': 'E17 | Glen Ridge Generational | Multi-Generational',
+ 'description': 'md5:53c6bc2e8031f3033d693d9a3563222c',
+ 'timestamp': 1711382202,
+ 'upload_date': '20240325',
+ 'duration': 1422.229,
+ 'tags': 'count:13',
+ 'uploader_id': '6314471934001',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ },
+ 'expected_warnings': ['Login with password is not supported for this website'],
+ 'params': {
+ 'skip_download': True,
+ },
+ 'skip': 'Requires subscription',
+ }, {
# Page no longer has video
'url': 'https://www.thisoldhouse.com/watch/arlington-arts-crafts-arts-and-crafts-class-begins',
'only_matching': True,
@@ -98,7 +120,15 @@ class ThisOldHouseIE(InfoExtractor):
video_url, video_id = self._search_regex(
r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})[^\'"]*)[\'"]',
- webpage, 'video url', group=(1, 2))
- video_url = self._request_webpage(HEADRequest(video_url), video_id, 'Resolving Zype URL').url
+ webpage, 'zype url', group=(1, 2), default=(None, None))
+ if video_url:
+ video_url = self._request_webpage(HEADRequest(video_url), video_id, 'Resolving Zype URL').url
+ return self.url_result(video_url, ZypeIE, video_id)
- return self.url_result(video_url, ZypeIE, video_id)
+ video_url, video_id = self._search_regex([
+ r'<iframe[^>]+src=[\'"]((?:https?:)?//players\.brightcove\.net/\d+/\w+/index\.html\?videoId=(\d+))',
+ r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)thisoldhouse\.com/videos/brightcove/(\d+))'],
+ webpage, 'iframe url', group=(1, 2))
+ if not parse_qs(video_url).get('videoId'):
+ video_url = self._request_webpage(HEADRequest(video_url), video_id, 'Resolving Brightcove URL').url
+ return self.url_result(smuggle_url(video_url, {'referrer': url}), BrightcoveNewIE, video_id)
diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py
index aa83567..3f5261a 100644
--- a/yt_dlp/extractor/tiktok.py
+++ b/yt_dlp/extractor/tiktok.py
@@ -4,6 +4,7 @@ import random
import re
import string
import time
+import uuid
from .common import InfoExtractor
from ..compat import compat_urllib_parse_urlparse
@@ -30,18 +31,64 @@ from ..utils import (
class TikTokBaseIE(InfoExtractor):
- _APP_VERSIONS = [('26.1.3', '260103'), ('26.1.2', '260102'), ('26.1.1', '260101'), ('25.6.2', '250602')]
- _WORKING_APP_VERSION = None
- _APP_NAME = 'trill'
- _AID = 1180
_UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s'
_WEBPAGE_HOST = 'https://www.tiktok.com/'
QUALITIES = ('360p', '540p', '720p', '1080p')
+ _APP_INFO_DEFAULTS = {
+ # unique "install id"
+ 'iid': None,
+ # TikTok (KR/PH/TW/TH/VN) = trill, TikTok (rest of world) = musical_ly, Douyin = aweme
+ 'app_name': 'musical_ly',
+ 'app_version': '34.1.2',
+ 'manifest_app_version': '2023401020',
+ # "app id": aweme = 1128, trill = 1180, musical_ly = 1233, universal = 0
+ 'aid': '0',
+ }
+ _KNOWN_APP_INFO = [
+ '7351144126450059040',
+ '7351149742343391009',
+ '7351153174894626592',
+ ]
+ _APP_INFO_POOL = None
+ _APP_INFO = None
+ _APP_USER_AGENT = None
+
@property
def _API_HOSTNAME(self):
return self._configuration_arg(
- 'api_hostname', ['api16-normal-c-useast1a.tiktokv.com'], ie_key=TikTokIE)[0]
+ 'api_hostname', ['api22-normal-c-useast2a.tiktokv.com'], ie_key=TikTokIE)[0]
+
+ def _get_next_app_info(self):
+ if self._APP_INFO_POOL is None:
+ defaults = {
+ key: self._configuration_arg(key, [default], ie_key=TikTokIE)[0]
+ for key, default in self._APP_INFO_DEFAULTS.items()
+ if key != 'iid'
+ }
+ app_info_list = (
+ self._configuration_arg('app_info', ie_key=TikTokIE)
+ or random.sample(self._KNOWN_APP_INFO, len(self._KNOWN_APP_INFO)))
+ self._APP_INFO_POOL = [
+ {**defaults, **dict(
+ (k, v) for k, v in zip(self._APP_INFO_DEFAULTS, app_info.split('/')) if v
+ )} for app_info in app_info_list
+ ]
+
+ if not self._APP_INFO_POOL:
+ return False
+
+ self._APP_INFO = self._APP_INFO_POOL.pop(0)
+
+ app_name = self._APP_INFO['app_name']
+ version = self._APP_INFO['manifest_app_version']
+ if app_name == 'musical_ly':
+ package = f'com.zhiliaoapp.musically/{version}'
+ else: # trill, aweme
+ package = f'com.ss.android.ugc.{app_name}/{version}'
+ self._APP_USER_AGENT = f'{package} (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)'
+
+ return True
@staticmethod
def _create_url(user_id, video_id):
@@ -58,7 +105,7 @@ class TikTokBaseIE(InfoExtractor):
'universal data', display_id, end_pattern=r'</script>', default={}),
('__DEFAULT_SCOPE__', {dict})) or {}
- def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True,
+ def _call_api_impl(self, ep, query, video_id, fatal=True,
note='Downloading API JSON', errnote='Unable to download API page'):
self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choices('0123456789abcdef', k=160)))
webpage_cookies = self._get_cookies(self._WEBPAGE_HOST)
@@ -67,80 +114,85 @@ class TikTokBaseIE(InfoExtractor):
return self._download_json(
'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id,
fatal=fatal, note=note, errnote=errnote, headers={
- 'User-Agent': f'com.ss.android.ugc.{self._APP_NAME}/{manifest_app_version} (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)',
+ 'User-Agent': self._APP_USER_AGENT,
'Accept': 'application/json',
}, query=query)
- def _build_api_query(self, query, app_version, manifest_app_version):
+ def _build_api_query(self, query):
return {
**query,
- 'version_name': app_version,
- 'version_code': manifest_app_version,
- 'build_number': app_version,
- 'manifest_version_code': manifest_app_version,
- 'update_version_code': manifest_app_version,
- 'openudid': ''.join(random.choices('0123456789abcdef', k=16)),
- 'uuid': ''.join(random.choices(string.digits, k=16)),
- '_rticket': int(time.time() * 1000),
- 'ts': int(time.time()),
- 'device_brand': 'Google',
- 'device_type': 'Pixel 7',
'device_platform': 'android',
+ 'os': 'android',
+ 'ssmix': 'a',
+ '_rticket': int(time.time() * 1000),
+ 'cdid': str(uuid.uuid4()),
+ 'channel': 'googleplay',
+ 'aid': self._APP_INFO['aid'],
+ 'app_name': self._APP_INFO['app_name'],
+ 'version_code': ''.join((f'{int(v):02d}' for v in self._APP_INFO['app_version'].split('.'))),
+ 'version_name': self._APP_INFO['app_version'],
+ 'manifest_version_code': self._APP_INFO['manifest_app_version'],
+ 'update_version_code': self._APP_INFO['manifest_app_version'],
+ 'ab_version': self._APP_INFO['app_version'],
'resolution': '1080*2400',
'dpi': 420,
- 'os_version': '13',
+ 'device_type': 'Pixel 7',
+ 'device_brand': 'Google',
+ 'language': 'en',
'os_api': '29',
- 'carrier_region': 'US',
+ 'os_version': '13',
+ 'ac': 'wifi',
+ 'is_pad': '0',
+ 'current_region': 'US',
+ 'app_type': 'normal',
'sys_region': 'US',
- 'region': 'US',
- 'app_name': self._APP_NAME,
- 'app_language': 'en',
- 'language': 'en',
+ 'last_install_time': int(time.time()) - random.randint(86400, 1123200),
'timezone_name': 'America/New_York',
+ 'residence': 'US',
+ 'app_language': 'en',
'timezone_offset': '-14400',
- 'channel': 'googleplay',
- 'ac': 'wifi',
- 'mcc_mnc': '310260',
- 'is_my_cn': 0,
- 'aid': self._AID,
- 'ssmix': 'a',
- 'as': 'a1qwert123',
- 'cp': 'cbfhckdckkde1',
+ 'host_abi': 'armeabi-v7a',
+ 'locale': 'en',
+ 'ac2': 'wifi5g',
+ 'uoo': '1',
+ 'carrier_region': 'US',
+ 'op_region': 'US',
+ 'build_number': self._APP_INFO['app_version'],
+ 'region': 'US',
+ 'ts': int(time.time()),
+ 'iid': self._APP_INFO['iid'],
+ 'device_id': random.randint(7250000000000000000, 7351147085025500000),
+ 'openudid': ''.join(random.choices('0123456789abcdef', k=16)),
}
def _call_api(self, ep, query, video_id, fatal=True,
note='Downloading API JSON', errnote='Unable to download API page'):
- if not self._WORKING_APP_VERSION:
- app_version = self._configuration_arg('app_version', [''], ie_key=TikTokIE.ie_key())[0]
- manifest_app_version = self._configuration_arg('manifest_app_version', [''], ie_key=TikTokIE.ie_key())[0]
- if app_version and manifest_app_version:
- self._WORKING_APP_VERSION = (app_version, manifest_app_version)
- self.write_debug('Imported app version combo from extractor arguments')
- elif app_version or manifest_app_version:
- self.report_warning('Only one of the two required version params are passed as extractor arguments', only_once=True)
-
- if self._WORKING_APP_VERSION:
- app_version, manifest_app_version = self._WORKING_APP_VERSION
- real_query = self._build_api_query(query, app_version, manifest_app_version)
- return self._call_api_impl(ep, real_query, manifest_app_version, video_id, fatal, note, errnote)
-
- for count, (app_version, manifest_app_version) in enumerate(self._APP_VERSIONS, start=1):
- real_query = self._build_api_query(query, app_version, manifest_app_version)
+ if not self._APP_INFO and not self._get_next_app_info():
+ message = 'No working app info is available'
+ if fatal:
+ raise ExtractorError(message, expected=True)
+ else:
+ self.report_warning(message)
+ return
+
+ max_tries = len(self._APP_INFO_POOL) + 1 # _APP_INFO_POOL + _APP_INFO
+ for count in itertools.count(1):
+ self.write_debug(str(self._APP_INFO))
+ real_query = self._build_api_query(query)
try:
- res = self._call_api_impl(ep, real_query, manifest_app_version, video_id, fatal, note, errnote)
- self._WORKING_APP_VERSION = (app_version, manifest_app_version)
- return res
+ return self._call_api_impl(ep, real_query, video_id, fatal, note, errnote)
except ExtractorError as e:
if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0:
- if count == len(self._APP_VERSIONS):
+ message = str(e.cause or e.msg)
+ if not self._get_next_app_info():
if fatal:
- raise e
+ raise
else:
- self.report_warning(str(e.cause or e.msg))
+ self.report_warning(message)
return
- self.report_warning('%s. Retrying... (attempt %s of %s)' % (str(e.cause or e.msg), count, len(self._APP_VERSIONS)))
+ self.report_warning(f'{message}. Retrying... (attempt {count} of {max_tries})')
continue
- raise e
+ raise
def _extract_aweme_app(self, aweme_id):
feed_list = self._call_api(
@@ -223,6 +275,7 @@ class TikTokBaseIE(InfoExtractor):
def extract_addr(addr, add_meta={}):
parsed_meta, res = parse_url_key(addr.get('url_key', ''))
+ is_bytevc2 = parsed_meta.get('vcodec') == 'bytevc2'
if res:
known_resolutions.setdefault(res, {}).setdefault('height', int_or_none(addr.get('height')))
known_resolutions[res].setdefault('width', int_or_none(addr.get('width')))
@@ -235,8 +288,11 @@ class TikTokBaseIE(InfoExtractor):
'acodec': 'aac',
'source_preference': -2 if 'aweme/v1' in url else -1, # Downloads from API might get blocked
**add_meta, **parsed_meta,
+ # bytevc2 is bytedance's proprietary (unplayable) video codec
+ 'preference': -100 if is_bytevc2 else -1,
'format_note': join_nonempty(
- add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, delim=' '),
+ add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None,
+ '(UNPLAYABLE)' if is_bytevc2 else None, delim=' '),
**audio_meta(url),
} for url in addr.get('url_list') or []]
diff --git a/yt_dlp/extractor/twitch.py b/yt_dlp/extractor/twitch.py
index c55786a..80cba09 100644
--- a/yt_dlp/extractor/twitch.py
+++ b/yt_dlp/extractor/twitch.py
@@ -191,17 +191,25 @@ class TwitchBaseIE(InfoExtractor):
}] if thumbnail else None
def _extract_twitch_m3u8_formats(self, path, video_id, token, signature):
- return self._extract_m3u8_formats(
+ formats = self._extract_m3u8_formats(
f'{self._USHER_BASE}/{path}/{video_id}.m3u8', video_id, 'mp4', query={
'allow_source': 'true',
'allow_audio_only': 'true',
'allow_spectre': 'true',
'p': random.randint(1000000, 10000000),
+ 'platform': 'web',
'player': 'twitchweb',
+ 'supported_codecs': 'av1,h265,h264',
'playlist_include_framerate': 'true',
'sig': signature,
'token': token,
})
+ for fmt in formats:
+ if fmt.get('vcodec') and fmt['vcodec'].startswith('av01'):
+ # mpegts does not yet have proper support for av1
+ fmt['downloader_options'] = {'ffmpeg_args_out': ['-f', 'mp4']}
+
+ return formats
class TwitchVodIE(TwitchBaseIE):
diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py
index e4a78c2..7e3a3a9 100644
--- a/yt_dlp/extractor/vk.py
+++ b/yt_dlp/extractor/vk.py
@@ -707,6 +707,7 @@ class VKWallPostIE(VKBaseIE):
class VKPlayBaseIE(InfoExtractor):
+ _BASE_URL_RE = r'https?://(?:vkplay\.live|live\.vkplay\.ru)/'
_RESOLUTIONS = {
'tiny': '256x144',
'lowest': '426x240',
@@ -765,7 +766,7 @@ class VKPlayBaseIE(InfoExtractor):
class VKPlayIE(VKPlayBaseIE):
- _VALID_URL = r'https?://vkplay\.live/(?P<username>[^/#?]+)/record/(?P<id>[a-f0-9-]+)'
+ _VALID_URL = rf'{VKPlayBaseIE._BASE_URL_RE}(?P<username>[^/#?]+)/record/(?P<id>[\da-f-]+)'
_TESTS = [{
'url': 'https://vkplay.live/zitsmann/record/f5e6e3b5-dc52-4d14-965d-0680dd2882da',
'info_dict': {
@@ -776,13 +777,16 @@ class VKPlayIE(VKPlayBaseIE):
'uploader_id': '13159830',
'release_timestamp': 1683461378,
'release_date': '20230507',
- 'thumbnail': r're:https://images.vkplay.live/public_video_stream/record/f5e6e3b5-dc52-4d14-965d-0680dd2882da/preview\?change_time=\d+',
+ 'thumbnail': r're:https://[^/]+/public_video_stream/record/f5e6e3b5-dc52-4d14-965d-0680dd2882da/preview',
'duration': 10608,
'view_count': int,
'like_count': int,
'categories': ['Atomic Heart'],
},
'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://live.vkplay.ru/lebwa/record/33a4e4ce-e3ef-49db-bb14-f006cc6fabc9/records',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -802,7 +806,7 @@ class VKPlayIE(VKPlayBaseIE):
class VKPlayLiveIE(VKPlayBaseIE):
- _VALID_URL = r'https?://vkplay\.live/(?P<id>[^/#?]+)/?(?:[#?]|$)'
+ _VALID_URL = rf'{VKPlayBaseIE._BASE_URL_RE}(?P<id>[^/#?]+)/?(?:[#?]|$)'
_TESTS = [{
'url': 'https://vkplay.live/bayda',
'info_dict': {
@@ -813,7 +817,7 @@ class VKPlayLiveIE(VKPlayBaseIE):
'uploader_id': '12279401',
'release_timestamp': 1687209962,
'release_date': '20230619',
- 'thumbnail': r're:https://images.vkplay.live/public_video_stream/12279401/preview\?change_time=\d+',
+ 'thumbnail': r're:https://[^/]+/public_video_stream/12279401/preview',
'view_count': int,
'concurrent_view_count': int,
'like_count': int,
@@ -822,6 +826,9 @@ class VKPlayLiveIE(VKPlayBaseIE):
},
'skip': 'livestream',
'params': {'skip_download': True},
+ }, {
+ 'url': 'https://live.vkplay.ru/lebwa',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py
index 497233d..3d26549 100644
--- a/yt_dlp/extractor/vrt.py
+++ b/yt_dlp/extractor/vrt.py
@@ -16,6 +16,7 @@ from ..utils import (
join_nonempty,
jwt_encode_hs256,
make_archive_id,
+ merge_dicts,
parse_age_limit,
parse_iso8601,
str_or_none,
@@ -425,3 +426,64 @@ class DagelijkseKostIE(VRTBaseIE):
['description', 'twitter:description', 'og:description'], webpage),
'_old_archive_ids': [make_archive_id('Canvas', video_id)],
}
+
+
+class Radio1BeIE(VRTBaseIE):
+ _VALID_URL = r'https?://radio1\.be/(?:lees|luister/select)/(?P<id>[\w/-]+)'
+ _TESTS = [{
+ 'url': 'https://radio1.be/luister/select/de-ochtend/komt-n-va-volgend-jaar-op-in-wallonie',
+ 'info_dict': {
+ 'id': 'eb6c22e9-544f-44f4-af39-cf8cccd29e22',
+ 'title': 'Komt N-VA volgend jaar op in Wallonië?',
+ 'display_id': 'de-ochtend/komt-n-va-volgend-jaar-op-in-wallonie',
+ 'description': 'md5:b374ea1c9302f38362df9dea1931468e',
+ 'thumbnail': r're:https?://cds\.vrt\.radio/[^/#\?&]+'
+ },
+ 'playlist_mincount': 1
+ }, {
+ 'url': 'https://radio1.be/lees/europese-unie-wil-onmiddellijke-humanitaire-pauze-en-duurzaam-staakt-het-vuren-in-gaza?view=web',
+ 'info_dict': {
+ 'id': '5d47f102-dbdb-4fa0-832b-26c1870311f2',
+ 'title': 'Europese Unie wil "onmiddellijke humanitaire pauze" en "duurzaam staakt-het-vuren" in Gaza',
+ 'description': 'md5:1aad1fae7d39edeffde5d3e67d276b64',
+ 'thumbnail': r're:https?://cds\.vrt\.radio/[^/#\?&]+',
+ 'display_id': 'europese-unie-wil-onmiddellijke-humanitaire-pauze-en-duurzaam-staakt-het-vuren-in-gaza'
+ },
+ 'playlist_mincount': 1
+ }]
+
+ def _extract_video_entries(self, next_js_data, display_id):
+ video_data = traverse_obj(
+ next_js_data, ((None, ('paragraphs', ...)), {lambda x: x if x['mediaReference'] else None}))
+ for data in video_data:
+ media_reference = data['mediaReference']
+ formats, subtitles = self._extract_formats_and_subtitles(
+ self._call_api(media_reference), display_id)
+
+ yield {
+ 'id': media_reference,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ **traverse_obj(data, {
+ 'title': ('title', {str}),
+ 'description': ('body', {clean_html})
+ }),
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ next_js_data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['item']
+
+ return self.playlist_result(
+ self._extract_video_entries(next_js_data, display_id), **merge_dicts(traverse_obj(
+ next_js_data, ({
+ 'id': ('id', {str}),
+ 'title': ('title', {str}),
+ 'description': (('description', 'content'), {clean_html}),
+ }), get_all=False), {
+ 'display_id': display_id,
+ 'title': self._html_search_meta(['name', 'og:title', 'twitter:title'], webpage),
+ 'description': self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage),
+ 'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage),
+ }))
diff --git a/yt_dlp/extractor/wistia.py b/yt_dlp/extractor/wistia.py
index bce5e83..f2256fd 100644
--- a/yt_dlp/extractor/wistia.py
+++ b/yt_dlp/extractor/wistia.py
@@ -1,6 +1,6 @@
+import base64
import re
import urllib.parse
-from base64 import b64decode
from .common import InfoExtractor
from ..networking import HEADRequest
@@ -371,7 +371,7 @@ class WistiaChannelIE(WistiaBaseIE):
webpage = self._download_webpage(f'https://fast.wistia.net/embed/channel/{channel_id}', channel_id)
data = self._parse_json(
self._search_regex(r'wchanneljsonp-%s\'\]\s*=[^\"]*\"([A-Za-z0-9=/]*)' % channel_id, webpage, 'jsonp', channel_id),
- channel_id, transform_source=lambda x: urllib.parse.unquote_plus(b64decode(x).decode('utf-8')))
+ channel_id, transform_source=lambda x: urllib.parse.unquote_plus(base64.b64decode(x).decode('utf-8')))
# XXX: can there be more than one series?
series = traverse_obj(data, ('series', 0), default={})
diff --git a/yt_dlp/extractor/xvideos.py b/yt_dlp/extractor/xvideos.py
index 5df0715..59eef84 100644
--- a/yt_dlp/extractor/xvideos.py
+++ b/yt_dlp/extractor/xvideos.py
@@ -15,35 +15,35 @@ class XVideosIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:
- (?:[^/]+\.)?xvideos2?\.com/video|
- (?:www\.)?xvideos\.es/video|
+ (?:[^/]+\.)?xvideos2?\.com/video\.?|
+ (?:www\.)?xvideos\.es/video\.?|
(?:www|flashservice)\.xvideos\.com/embedframe/|
static-hw\.xvideos\.com/swf/xv-player\.swf\?.*?\bid_video=
)
- (?P<id>[0-9]+)
+ (?P<id>[0-9a-z]+)
'''
_TESTS = [{
- 'url': 'https://www.xvideos.com/video4588838/motorcycle_guy_cucks_influencer_steals_his_gf',
- 'md5': '14cea69fcb84db54293b1e971466c2e1',
+ 'url': 'http://xvideos.com/video.ucuvbkfda4e/a_beautiful_red-haired_stranger_was_refused_but_still_came_to_my_room_for_sex',
+ 'md5': '396255a900a6bddb3e98985f0b86c3fd',
'info_dict': {
- 'id': '4588838',
+ 'id': 'ucuvbkfda4e',
'ext': 'mp4',
- 'title': 'Motorcycle Guy Cucks Influencer, Steals his GF',
- 'duration': 108,
+ 'title': 'A Beautiful Red-Haired Stranger Was Refused, But Still Came To My Room For Sex',
+ 'duration': 1238,
'age_limit': 18,
- 'thumbnail': r're:^https://img-hw.xvideos-cdn.com/.+\.jpg',
+ 'thumbnail': r're:^https://cdn\d+-pic.xvideos-cdn.com/.+\.jpg',
}
}, {
# Broken HLS formats
'url': 'https://www.xvideos.com/video65982001/what_s_her_name',
- 'md5': 'b82d7d7ef7d65a84b1fa6965f81f95a5',
+ 'md5': '56742808292c8fa1418e4538c262c58b',
'info_dict': {
'id': '65982001',
'ext': 'mp4',
'title': 'what\'s her name?',
'duration': 120,
'age_limit': 18,
- 'thumbnail': r're:^https://img-hw.xvideos-cdn.com/.+\.jpg',
+ 'thumbnail': r're:^https://cdn\d+-pic.xvideos-cdn.com/.+\.jpg',
}
}, {
'url': 'https://flashservice.xvideos.com/embedframe/4588838',
@@ -90,6 +90,18 @@ class XVideosIE(InfoExtractor):
}, {
'url': 'https://de.xvideos.com/video4588838/biker_takes_his_girl',
'only_matching': True
+ }, {
+ 'url': 'https://flashservice.xvideos.com/embedframe/ucuvbkfda4e',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.xvideos.com/embedframe/ucuvbkfda4e',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://static-hw.xvideos.com/swf/xv-player.swf?id_video=ucuvbkfda4e',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://xvideos.es/video.ucuvbkfda4e/a_beautiful_red-haired_stranger_was_refused_but_still_came_to_my_room_for_sex',
+ 'only_matching': True
}]
def _real_extract(self, url):
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index 33fd3b4..e553fff 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -2,7 +2,7 @@ import base64
import calendar
import collections
import copy
-import datetime
+import datetime as dt
import enum
import hashlib
import itertools
@@ -33,6 +33,7 @@ from ..utils import (
clean_html,
datetime_from_str,
dict_get,
+ filesize_from_tbr,
filter_dict,
float_or_none,
format_field,
@@ -55,6 +56,7 @@ from ..utils import (
str_to_int,
strftime_or_none,
traverse_obj,
+ try_call,
try_get,
unescapeHTML,
unified_strdate,
@@ -922,10 +924,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
def _parse_time_text(self, text):
if not text:
return
- dt = self.extract_relative_time(text)
+ dt_ = self.extract_relative_time(text)
timestamp = None
- if isinstance(dt, datetime.datetime):
- timestamp = calendar.timegm(dt.timetuple())
+ if isinstance(dt_, dt.datetime):
+ timestamp = calendar.timegm(dt_.timetuple())
if timestamp is None:
timestamp = (
@@ -3602,8 +3604,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
yt_query = {
'videoId': video_id,
}
- if _split_innertube_client(client)[0] == 'android':
- yt_query['params'] = 'CgIQBg=='
+ if _split_innertube_client(client)[0] in ('android', 'android_embedscreen'):
+ yt_query['params'] = 'CgIIAQ=='
pp_arg = self._configuration_arg('player_params', [None], casesense=True)[0]
if pp_arg:
@@ -3839,11 +3841,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
10 if audio_track.get('audioIsDefault') and 10
else -10 if 'descriptive' in (audio_track.get('displayName') or '').lower() and -10
else -1)
+ format_duration = traverse_obj(fmt, ('approxDurationMs', {lambda x: float_or_none(x, 1000)}))
# Some formats may have much smaller duration than others (possibly damaged during encoding)
# E.g. 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823
# Make sure to avoid false positives with small duration differences.
# E.g. __2ABJjxzNo, ySuUZEjARPY
- is_damaged = try_get(fmt, lambda x: float(x['approxDurationMs']) / duration < 500)
+ is_damaged = try_call(lambda: format_duration < duration // 2)
if is_damaged:
self.report_warning(
f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True)
@@ -3873,6 +3876,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'quality': q(quality) - bool(fmt.get('isDrc')) / 2,
'has_drm': bool(fmt.get('drmFamilies')),
'tbr': tbr,
+ 'filesize_approx': filesize_from_tbr(tbr, format_duration),
'url': fmt_url,
'width': int_or_none(fmt.get('width')),
'language': join_nonempty(audio_track.get('id', '').split('.')[0],
@@ -4564,7 +4568,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if upload_date and live_status not in ('is_live', 'post_live', 'is_upcoming'):
# Newly uploaded videos' HLS formats are potentially problematic and need to be checked
- upload_datetime = datetime_from_str(upload_date).replace(tzinfo=datetime.timezone.utc)
+ upload_datetime = datetime_from_str(upload_date).replace(tzinfo=dt.timezone.utc)
if upload_datetime >= datetime_from_str('today-2days'):
for fmt in info['formats']:
if fmt.get('protocol') == 'm3u8_native':
@@ -6965,7 +6969,7 @@ class YoutubeSearchIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor):
IE_DESC = 'YouTube search'
IE_NAME = 'youtube:search'
_SEARCH_KEY = 'ytsearch'
- _SEARCH_PARAMS = 'EgIQAQ%3D%3D' # Videos only
+ _SEARCH_PARAMS = 'EgIQAfABAQ==' # Videos only
_TESTS = [{
'url': 'ytsearch5:youtube-dl test video',
'playlist_count': 5,
@@ -6973,6 +6977,14 @@ class YoutubeSearchIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor):
'id': 'youtube-dl test video',
'title': 'youtube-dl test video',
}
+ }, {
+ 'note': 'Suicide/self-harm search warning',
+ 'url': 'ytsearch1:i hate myself and i wanna die',
+ 'playlist_count': 1,
+ 'info_dict': {
+ 'id': 'i hate myself and i wanna die',
+ 'title': 'i hate myself and i wanna die',
+ }
}]
@@ -6980,7 +6992,7 @@ class YoutubeSearchDateIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor):
IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
_SEARCH_KEY = 'ytsearchdate'
IE_DESC = 'YouTube search, newest videos first'
- _SEARCH_PARAMS = 'CAISAhAB' # Videos only, sorted by date
+ _SEARCH_PARAMS = 'CAISAhAB8AEB' # Videos only, sorted by date
_TESTS = [{
'url': 'ytsearchdate5:youtube-dl test video',
'playlist_count': 5,
diff --git a/yt_dlp/extractor/zattoo.py b/yt_dlp/extractor/zattoo.py
index 6bd9ea0..5cc9c5f 100644
--- a/yt_dlp/extractor/zattoo.py
+++ b/yt_dlp/extractor/zattoo.py
@@ -1,5 +1,5 @@
import re
-from uuid import uuid4
+import uuid
from .common import InfoExtractor
from ..compat import compat_str
@@ -53,7 +53,7 @@ class ZattooPlatformBaseIE(InfoExtractor):
self._request_webpage(
'%s/zapi/v3/session/hello' % self._host_url(), None,
'Opening session', data=urlencode_postdata({
- 'uuid': compat_str(uuid4()),
+ 'uuid': compat_str(uuid.uuid4()),
'lang': 'en',
'app_version': '1.8.2',
'format': 'json',
diff --git a/yt_dlp/networking/__init__.py b/yt_dlp/networking/__init__.py
index acadc01..356712c 100644
--- a/yt_dlp/networking/__init__.py
+++ b/yt_dlp/networking/__init__.py
@@ -28,3 +28,10 @@ except ImportError:
pass
except Exception as e:
warnings.warn(f'Failed to import "websockets" request handler: {e}' + bug_reports_message())
+
+try:
+ from . import _curlcffi # noqa: F401
+except ImportError:
+ pass
+except Exception as e:
+ warnings.warn(f'Failed to import "curl_cffi" request handler: {e}' + bug_reports_message())
diff --git a/yt_dlp/networking/_curlcffi.py b/yt_dlp/networking/_curlcffi.py
new file mode 100644
index 0000000..39d1f70
--- /dev/null
+++ b/yt_dlp/networking/_curlcffi.py
@@ -0,0 +1,221 @@
+from __future__ import annotations
+
+import io
+import math
+import urllib.parse
+
+from ._helper import InstanceStoreMixin, select_proxy
+from .common import (
+ Features,
+ Request,
+ Response,
+ register_preference,
+ register_rh,
+)
+from .exceptions import (
+ CertificateVerifyError,
+ HTTPError,
+ IncompleteRead,
+ ProxyError,
+ SSLError,
+ TransportError,
+)
+from .impersonate import ImpersonateRequestHandler, ImpersonateTarget
+from ..dependencies import curl_cffi
+from ..utils import int_or_none
+
+if curl_cffi is None:
+ raise ImportError('curl_cffi is not installed')
+
+curl_cffi_version = tuple(int_or_none(x, default=0) for x in curl_cffi.__version__.split('.'))
+
+if curl_cffi_version != (0, 5, 10):
+ curl_cffi._yt_dlp__version = f'{curl_cffi.__version__} (unsupported)'
+ raise ImportError('Only curl_cffi 0.5.10 is supported')
+
+import curl_cffi.requests
+from curl_cffi.const import CurlECode, CurlOpt
+
+
+class CurlCFFIResponseReader(io.IOBase):
+ def __init__(self, response: curl_cffi.requests.Response):
+ self._response = response
+ self._iterator = response.iter_content()
+ self._buffer = b''
+ self.bytes_read = 0
+
+ def readable(self):
+ return True
+
+ def read(self, size=None):
+ exception_raised = True
+ try:
+ while self._iterator and (size is None or len(self._buffer) < size):
+ chunk = next(self._iterator, None)
+ if chunk is None:
+ self._iterator = None
+ break
+ self._buffer += chunk
+ self.bytes_read += len(chunk)
+
+ if size is None:
+ size = len(self._buffer)
+ data = self._buffer[:size]
+ self._buffer = self._buffer[size:]
+
+ # "free" the curl instance if the response is fully read.
+ # curl_cffi doesn't do this automatically and only allows one open response per thread
+ if not self._iterator and not self._buffer:
+ self.close()
+ exception_raised = False
+ return data
+ finally:
+ if exception_raised:
+ self.close()
+
+ def close(self):
+ if not self.closed:
+ self._response.close()
+ self._buffer = b''
+ super().close()
+
+
+class CurlCFFIResponseAdapter(Response):
+ fp: CurlCFFIResponseReader
+
+ def __init__(self, response: curl_cffi.requests.Response):
+ super().__init__(
+ fp=CurlCFFIResponseReader(response),
+ headers=response.headers,
+ url=response.url,
+ status=response.status_code)
+
+ def read(self, amt=None):
+ try:
+ return self.fp.read(amt)
+ except curl_cffi.requests.errors.RequestsError as e:
+ if e.code == CurlECode.PARTIAL_FILE:
+ content_length = int_or_none(e.response.headers.get('Content-Length'))
+ raise IncompleteRead(
+ partial=self.fp.bytes_read,
+ expected=content_length - self.fp.bytes_read if content_length is not None else None,
+ cause=e) from e
+ raise TransportError(cause=e) from e
+
+
+@register_rh
+class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
+ RH_NAME = 'curl_cffi'
+ _SUPPORTED_URL_SCHEMES = ('http', 'https')
+ _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
+ _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h')
+ _SUPPORTED_IMPERSONATE_TARGET_MAP = {
+ ImpersonateTarget('chrome', '110', 'windows', '10'): curl_cffi.requests.BrowserType.chrome110,
+ ImpersonateTarget('chrome', '107', 'windows', '10'): curl_cffi.requests.BrowserType.chrome107,
+ ImpersonateTarget('chrome', '104', 'windows', '10'): curl_cffi.requests.BrowserType.chrome104,
+ ImpersonateTarget('chrome', '101', 'windows', '10'): curl_cffi.requests.BrowserType.chrome101,
+ ImpersonateTarget('chrome', '100', 'windows', '10'): curl_cffi.requests.BrowserType.chrome100,
+ ImpersonateTarget('chrome', '99', 'windows', '10'): curl_cffi.requests.BrowserType.chrome99,
+ ImpersonateTarget('edge', '101', 'windows', '10'): curl_cffi.requests.BrowserType.edge101,
+ ImpersonateTarget('edge', '99', 'windows', '10'): curl_cffi.requests.BrowserType.edge99,
+ ImpersonateTarget('safari', '15.5', 'macos', '12'): curl_cffi.requests.BrowserType.safari15_5,
+ ImpersonateTarget('safari', '15.3', 'macos', '11'): curl_cffi.requests.BrowserType.safari15_3,
+ ImpersonateTarget('chrome', '99', 'android', '12'): curl_cffi.requests.BrowserType.chrome99_android,
+ }
+
+ def _create_instance(self, cookiejar=None):
+ return curl_cffi.requests.Session(cookies=cookiejar)
+
+ def _check_extensions(self, extensions):
+ super()._check_extensions(extensions)
+ extensions.pop('impersonate', None)
+ extensions.pop('cookiejar', None)
+ extensions.pop('timeout', None)
+
+ def _send(self, request: Request):
+ max_redirects_exceeded = False
+ session: curl_cffi.requests.Session = self._get_instance(
+ cookiejar=self._get_cookiejar(request) if 'cookie' not in request.headers else None)
+
+ if self.verbose:
+ session.curl.setopt(CurlOpt.VERBOSE, 1)
+
+ proxies = self._get_proxies(request)
+ if 'no' in proxies:
+ session.curl.setopt(CurlOpt.NOPROXY, proxies['no'])
+ proxies.pop('no', None)
+
+ # curl doesn't support per protocol proxies, so we select the one that matches the request protocol
+ proxy = select_proxy(request.url, proxies=proxies)
+ if proxy:
+ session.curl.setopt(CurlOpt.PROXY, proxy)
+ scheme = urllib.parse.urlparse(request.url).scheme.lower()
+ if scheme != 'http':
+ # Enable HTTP CONNECT for HTTPS urls.
+ # Don't use CONNECT for http for compatibility with urllib behaviour.
+ # See: https://curl.se/libcurl/c/CURLOPT_HTTPPROXYTUNNEL.html
+ session.curl.setopt(CurlOpt.HTTPPROXYTUNNEL, 1)
+
+ headers = self._get_impersonate_headers(request)
+
+ if self._client_cert:
+ session.curl.setopt(CurlOpt.SSLCERT, self._client_cert['client_certificate'])
+ client_certificate_key = self._client_cert.get('client_certificate_key')
+ client_certificate_password = self._client_cert.get('client_certificate_password')
+ if client_certificate_key:
+ session.curl.setopt(CurlOpt.SSLKEY, client_certificate_key)
+ if client_certificate_password:
+ session.curl.setopt(CurlOpt.KEYPASSWD, client_certificate_password)
+
+ timeout = self._calculate_timeout(request)
+
+ # set CURLOPT_LOW_SPEED_LIMIT and CURLOPT_LOW_SPEED_TIME to act as a read timeout. [1]
+ # curl_cffi does not currently do this. [2]
+ # Note: CURLOPT_LOW_SPEED_TIME is in seconds, so we need to round up to the nearest second. [3]
+ # [1] https://unix.stackexchange.com/a/305311
+ # [2] https://github.com/yifeikong/curl_cffi/issues/156
+ # [3] https://curl.se/libcurl/c/CURLOPT_LOW_SPEED_TIME.html
+ session.curl.setopt(CurlOpt.LOW_SPEED_LIMIT, 1) # 1 byte per second
+ session.curl.setopt(CurlOpt.LOW_SPEED_TIME, math.ceil(timeout))
+
+ try:
+ curl_response = session.request(
+ method=request.method,
+ url=request.url,
+ headers=headers,
+ data=request.data,
+ verify=self.verify,
+ max_redirects=5,
+ timeout=timeout,
+ impersonate=self._SUPPORTED_IMPERSONATE_TARGET_MAP.get(
+ self._get_request_target(request)),
+ interface=self.source_address,
+ stream=True
+ )
+ except curl_cffi.requests.errors.RequestsError as e:
+ if e.code == CurlECode.PEER_FAILED_VERIFICATION:
+ raise CertificateVerifyError(cause=e) from e
+
+ elif e.code == CurlECode.SSL_CONNECT_ERROR:
+ raise SSLError(cause=e) from e
+
+ elif e.code == CurlECode.TOO_MANY_REDIRECTS:
+ max_redirects_exceeded = True
+ curl_response = e.response
+
+ elif e.code == CurlECode.PROXY:
+ raise ProxyError(cause=e) from e
+ else:
+ raise TransportError(cause=e) from e
+
+ response = CurlCFFIResponseAdapter(curl_response)
+
+ if not 200 <= response.status < 300:
+ raise HTTPError(response, redirect_loop=max_redirects_exceeded)
+
+ return response
+
+
+@register_preference(CurlCFFIRH)
+def curl_cffi_preference(rh, request):
+ return -100
diff --git a/yt_dlp/networking/_helper.py b/yt_dlp/networking/_helper.py
index d79dd79..8e678b2 100644
--- a/yt_dlp/networking/_helper.py
+++ b/yt_dlp/networking/_helper.py
@@ -2,6 +2,7 @@ from __future__ import annotations
import contextlib
import functools
+import os
import socket
import ssl
import sys
@@ -121,6 +122,9 @@ def make_ssl_context(
context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
context.check_hostname = verify
context.verify_mode = ssl.CERT_REQUIRED if verify else ssl.CERT_NONE
+ # OpenSSL 1.1.1+ Python 3.8+ keylog file
+ if hasattr(context, 'keylog_filename'):
+ context.keylog_filename = os.environ.get('SSLKEYLOGFILE') or None
# Some servers may reject requests if ALPN extension is not sent. See:
# https://github.com/python/cpython/issues/85140
diff --git a/yt_dlp/networking/_requests.py b/yt_dlp/networking/_requests.py
index 6545028..e3edc77 100644
--- a/yt_dlp/networking/_requests.py
+++ b/yt_dlp/networking/_requests.py
@@ -307,8 +307,7 @@ class RequestsRH(RequestHandler, InstanceStoreMixin):
max_redirects_exceeded = False
- session = self._get_instance(
- cookiejar=request.extensions.get('cookiejar') or self.cookiejar)
+ session = self._get_instance(cookiejar=self._get_cookiejar(request))
try:
requests_res = session.request(
@@ -316,8 +315,8 @@ class RequestsRH(RequestHandler, InstanceStoreMixin):
url=request.url,
data=request.data,
headers=headers,
- timeout=float(request.extensions.get('timeout') or self.timeout),
- proxies=request.proxies or self.proxies,
+ timeout=self._calculate_timeout(request),
+ proxies=self._get_proxies(request),
allow_redirects=True,
stream=True
)
diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py
index cb4dae3..ff110dc 100644
--- a/yt_dlp/networking/_urllib.py
+++ b/yt_dlp/networking/_urllib.py
@@ -389,11 +389,11 @@ class UrllibRH(RequestHandler, InstanceStoreMixin):
)
opener = self._get_instance(
- proxies=request.proxies or self.proxies,
- cookiejar=request.extensions.get('cookiejar') or self.cookiejar
+ proxies=self._get_proxies(request),
+ cookiejar=self._get_cookiejar(request)
)
try:
- res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
+ res = opener.open(urllib_req, timeout=self._calculate_timeout(request))
except urllib.error.HTTPError as e:
if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
# Prevent file object from being closed when urllib.error.HTTPError is destroyed.
diff --git a/yt_dlp/networking/_websockets.py b/yt_dlp/networking/_websockets.py
index 1597932..6e235b0 100644
--- a/yt_dlp/networking/_websockets.py
+++ b/yt_dlp/networking/_websockets.py
@@ -1,5 +1,6 @@
from __future__ import annotations
+import contextlib
import io
import logging
import ssl
@@ -38,27 +39,40 @@ if websockets_version < (12, 0):
import websockets.sync.client
from websockets.uri import parse_uri
+# In websockets Connection, recv_exc and recv_events_exc are defined
+# after the recv events handler thread is started [1].
+# On our CI using PyPy, in some cases a race condition may occur
+# where the recv events handler thread tries to use these attributes before they are defined [2].
+# 1: https://github.com/python-websockets/websockets/blame/de768cf65e7e2b1a3b67854fb9e08816a5ff7050/src/websockets/sync/connection.py#L93
+# 2: "AttributeError: 'ClientConnection' object has no attribute 'recv_events_exc'. Did you mean: 'recv_events'?"
+import websockets.sync.connection # isort: split
+with contextlib.suppress(Exception):
+ # > 12.0
+ websockets.sync.connection.Connection.recv_exc = None
+ # 12.0
+ websockets.sync.connection.Connection.recv_events_exc = None
+
class WebsocketsResponseAdapter(WebSocketResponse):
- def __init__(self, wsw: websockets.sync.client.ClientConnection, url):
+ def __init__(self, ws: websockets.sync.client.ClientConnection, url):
super().__init__(
- fp=io.BytesIO(wsw.response.body or b''),
+ fp=io.BytesIO(ws.response.body or b''),
url=url,
- headers=wsw.response.headers,
- status=wsw.response.status_code,
- reason=wsw.response.reason_phrase,
+ headers=ws.response.headers,
+ status=ws.response.status_code,
+ reason=ws.response.reason_phrase,
)
- self.wsw = wsw
+ self._ws = ws
def close(self):
- self.wsw.close()
+ self._ws.close()
super().close()
def send(self, message):
# https://websockets.readthedocs.io/en/stable/reference/sync/client.html#websockets.sync.client.ClientConnection.send
try:
- return self.wsw.send(message)
+ return self._ws.send(message)
except (websockets.exceptions.WebSocketException, RuntimeError, TimeoutError) as e:
raise TransportError(cause=e) from e
except SocksProxyError as e:
@@ -69,7 +83,7 @@ class WebsocketsResponseAdapter(WebSocketResponse):
def recv(self):
# https://websockets.readthedocs.io/en/stable/reference/sync/client.html#websockets.sync.client.ClientConnection.recv
try:
- return self.wsw.recv()
+ return self._ws.recv()
except SocksProxyError as e:
raise ProxyError(cause=e) from e
except (websockets.exceptions.WebSocketException, RuntimeError, TimeoutError) as e:
@@ -112,10 +126,10 @@ class WebsocketsRH(WebSocketRequestHandler):
logging.getLogger(name).removeHandler(handler)
def _send(self, request):
- timeout = float(request.extensions.get('timeout') or self.timeout)
+ timeout = self._calculate_timeout(request)
headers = self._merge_headers(request.headers)
if 'cookie' not in headers:
- cookiejar = request.extensions.get('cookiejar') or self.cookiejar
+ cookiejar = self._get_cookiejar(request)
cookie_header = cookiejar.get_cookie_header(request.url)
if cookie_header:
headers['cookie'] = cookie_header
@@ -125,7 +139,7 @@ class WebsocketsRH(WebSocketRequestHandler):
'source_address': (self.source_address, 0) if self.source_address else None,
'timeout': timeout
}
- proxy = select_proxy(request.url, request.proxies or self.proxies or {})
+ proxy = select_proxy(request.url, self._get_proxies(request))
try:
if proxy:
socks_proxy_options = make_socks_proxy_opts(proxy)
diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py
index 39442ba..4c66ba6 100644
--- a/yt_dlp/networking/common.py
+++ b/yt_dlp/networking/common.py
@@ -256,6 +256,15 @@ class RequestHandler(abc.ABC):
def _merge_headers(self, request_headers):
return HTTPHeaderDict(self.headers, request_headers)
+ def _calculate_timeout(self, request):
+ return float(request.extensions.get('timeout') or self.timeout)
+
+ def _get_cookiejar(self, request):
+ return request.extensions.get('cookiejar') or self.cookiejar
+
+ def _get_proxies(self, request):
+ return (request.proxies or self.proxies).copy()
+
def _check_url_scheme(self, request: Request):
scheme = urllib.parse.urlparse(request.url).scheme.lower()
if self._SUPPORTED_URL_SCHEMES is not None and scheme not in self._SUPPORTED_URL_SCHEMES:
@@ -454,9 +463,10 @@ class Request:
else:
raise TypeError('headers must be a mapping')
- def update(self, url=None, data=None, headers=None, query=None):
+ def update(self, url=None, data=None, headers=None, query=None, extensions=None):
self.data = data if data is not None else self.data
self.headers.update(headers or {})
+ self.extensions.update(extensions or {})
self.url = update_url_query(url or self.url, query or {})
def copy(self):
@@ -491,7 +501,7 @@ class Response(io.IOBase):
def __init__(
self,
- fp: typing.IO,
+ fp: io.IOBase,
url: str,
headers: Mapping[str, str],
status: int = 200,
diff --git a/yt_dlp/networking/impersonate.py b/yt_dlp/networking/impersonate.py
new file mode 100644
index 0000000..ca66180
--- /dev/null
+++ b/yt_dlp/networking/impersonate.py
@@ -0,0 +1,141 @@
+from __future__ import annotations
+
+import re
+from abc import ABC
+from dataclasses import dataclass
+from typing import Any
+
+from .common import RequestHandler, register_preference
+from .exceptions import UnsupportedRequest
+from ..compat.types import NoneType
+from ..utils import classproperty, join_nonempty
+from ..utils.networking import std_headers
+
+
+@dataclass(order=True, frozen=True)
+class ImpersonateTarget:
+ """
+ A target for browser impersonation.
+
+ Parameters:
+ @param client: the client to impersonate
+ @param version: the client version to impersonate
+ @param os: the client OS to impersonate
+ @param os_version: the client OS version to impersonate
+
+    Note: None is used to indicate matching any value.
+
+ """
+ client: str | None = None
+ version: str | None = None
+ os: str | None = None
+ os_version: str | None = None
+
+ def __post_init__(self):
+ if self.version and not self.client:
+ raise ValueError('client is required if version is set')
+ if self.os_version and not self.os:
+ raise ValueError('os is required if os_version is set')
+
+ def __contains__(self, target: ImpersonateTarget):
+ if not isinstance(target, ImpersonateTarget):
+ return False
+ return (
+ (self.client is None or target.client is None or self.client == target.client)
+ and (self.version is None or target.version is None or self.version == target.version)
+ and (self.os is None or target.os is None or self.os == target.os)
+ and (self.os_version is None or target.os_version is None or self.os_version == target.os_version)
+ )
+
+ def __str__(self):
+ return f'{join_nonempty(self.client, self.version)}:{join_nonempty(self.os, self.os_version)}'.rstrip(':')
+
+ @classmethod
+ def from_str(cls, target: str):
+ mobj = re.fullmatch(r'(?:(?P<client>[^:-]+)(?:-(?P<version>[^:-]+))?)?(?::(?:(?P<os>[^:-]+)(?:-(?P<os_version>[^:-]+))?)?)?', target)
+ if not mobj:
+ raise ValueError(f'Invalid impersonate target "{target}"')
+ return cls(**mobj.groupdict())
+
+
+class ImpersonateRequestHandler(RequestHandler, ABC):
+ """
+ Base class for request handlers that support browser impersonation.
+
+ This provides a method for checking the validity of the impersonate extension,
+ which can be used in _check_extensions.
+
+    Impersonate targets consist of a client, version, os and os_version.
+ See the ImpersonateTarget class for more details.
+
+ The following may be defined:
+    - `_SUPPORTED_IMPERSONATE_TARGET_MAP`: a dict mapping supported targets to custom objects.
+ Any Request with an impersonate target not in this list will raise an UnsupportedRequest.
+ Set to None to disable this check.
+ Note: Entries are in order of preference
+
+ Parameters:
+ @param impersonate: the default impersonate target to use for requests.
+ Set to None to disable impersonation.
+ """
+ _SUPPORTED_IMPERSONATE_TARGET_MAP: dict[ImpersonateTarget, Any] = {}
+
+ def __init__(self, *, impersonate: ImpersonateTarget = None, **kwargs):
+ super().__init__(**kwargs)
+ self.impersonate = impersonate
+
+ def _check_impersonate_target(self, target: ImpersonateTarget):
+ assert isinstance(target, (ImpersonateTarget, NoneType))
+ if target is None or not self.supported_targets:
+ return
+ if not self.is_supported_target(target):
+ raise UnsupportedRequest(f'Unsupported impersonate target: {target}')
+
+ def _check_extensions(self, extensions):
+ super()._check_extensions(extensions)
+ if 'impersonate' in extensions:
+ self._check_impersonate_target(extensions.get('impersonate'))
+
+ def _validate(self, request):
+ super()._validate(request)
+ self._check_impersonate_target(self.impersonate)
+
+ def _resolve_target(self, target: ImpersonateTarget | None):
+ """Resolve a target to a supported target."""
+ if target is None:
+ return
+ for supported_target in self.supported_targets:
+ if target in supported_target:
+ if self.verbose:
+ self._logger.stdout(
+ f'{self.RH_NAME}: resolved impersonate target {target} to {supported_target}')
+ return supported_target
+
+ @classproperty
+ def supported_targets(self) -> tuple[ImpersonateTarget, ...]:
+ return tuple(self._SUPPORTED_IMPERSONATE_TARGET_MAP.keys())
+
+ def is_supported_target(self, target: ImpersonateTarget):
+ assert isinstance(target, ImpersonateTarget)
+ return self._resolve_target(target) is not None
+
+ def _get_request_target(self, request):
+ """Get the requested target for the request"""
+ return self._resolve_target(request.extensions.get('impersonate') or self.impersonate)
+
+ def _get_impersonate_headers(self, request):
+ headers = self._merge_headers(request.headers)
+ if self._get_request_target(request) is not None:
+ # remove all headers present in std_headers
+ # todo: change this to not depend on std_headers
+ for k, v in std_headers.items():
+ if headers.get(k) == v:
+ headers.pop(k)
+ return headers
+
+
+@register_preference(ImpersonateRequestHandler)
+def impersonate_preference(rh, request):
+ if request.extensions.get('impersonate') or rh.impersonate:
+ return 1000
+ return 0
diff --git a/yt_dlp/options.py b/yt_dlp/options.py
index f884727..faa1ee5 100644
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@@ -516,6 +516,18 @@ def create_parser():
help='Client-side IP address to bind to',
)
network.add_option(
+ '--impersonate',
+ metavar='CLIENT[:OS]', dest='impersonate', default=None,
+ help=(
+ 'Client to impersonate for requests. E.g. chrome, chrome-110, chrome:windows-10. '
+ 'Pass --impersonate="" to impersonate any client.'),
+ )
+ network.add_option(
+ '--list-impersonate-targets',
+ dest='list_impersonate_targets', default=False, action='store_true',
+ help='List available clients to impersonate.',
+ )
+ network.add_option(
'-4', '--force-ipv4',
action='store_const', const='0.0.0.0', dest='source_address',
help='Make all connections via IPv4',
@@ -680,6 +692,10 @@ def create_parser():
action='store_true', dest='break_on_existing', default=False,
help='Stop the download process when encountering a file that is in the archive')
selection.add_option(
+ '--no-break-on-existing',
+ action='store_false', dest='break_on_existing',
+ help='Do not stop the download process when encountering a file that is in the archive (default)')
+ selection.add_option(
'--break-on-reject',
action='store_true', dest='break_on_reject', default=False,
help=optparse.SUPPRESS_HELP)
@@ -1243,6 +1259,10 @@ def create_parser():
# TODO: Document the fields inside "progress"
'--console-title --progress-template "download-title:%(info.id)s-%(progress.eta)s"'))
verbosity.add_option(
+ '--progress-delta',
+ metavar='SECONDS', action='store', dest='progress_delta', type=float, default=0,
+ help='Time between progress output (default: 0)')
+ verbosity.add_option(
'-v', '--verbose',
action='store_true', dest='verbose', default=False,
help='Print various debugging information')
diff --git a/yt_dlp/update.py b/yt_dlp/update.py
index db50cfa..f47cbc5 100644
--- a/yt_dlp/update.py
+++ b/yt_dlp/update.py
@@ -114,7 +114,7 @@ _NON_UPDATEABLE_REASONS = {
**{variant: f'Auto-update is not supported for unpackaged {name} executable; Re-download the latest release'
for variant, name in {'win32_dir': 'Windows', 'darwin_dir': 'MacOS', 'linux_dir': 'Linux'}.items()},
'source': 'You cannot update when running from source code; Use git to pull the latest changes',
- 'unknown': 'You installed yt-dlp with a package manager or setup.py; Use that to update',
+ 'unknown': 'You installed yt-dlp from a manual build or with a package manager; Use that to update',
'other': 'You are using an unofficial build of yt-dlp; Build the executable again',
}
diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py
index 9efeb6a..e3e80f3 100644
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -5,7 +5,7 @@ import codecs
import collections
import collections.abc
import contextlib
-import datetime
+import datetime as dt
import email.header
import email.utils
import errno
@@ -50,7 +50,6 @@ from ..compat import (
compat_expanduser,
compat_HTMLParseError,
compat_os_name,
- compat_shlex_quote,
)
from ..dependencies import xattr
@@ -836,9 +835,11 @@ class Popen(subprocess.Popen):
if shell and compat_os_name == 'nt' and kwargs.get('executable') is None:
if not isinstance(args, str):
- args = ' '.join(compat_shlex_quote(a) for a in args)
+ args = shell_quote(args, shell=True)
shell = False
- args = f'{self.__comspec()} /Q /S /D /V:OFF /C "{args}"'
+ # Set variable for `cmd.exe` newline escaping (see `utils.shell_quote`)
+ env['='] = '"^\n\n"'
+ args = f'{self.__comspec()} /Q /S /D /V:OFF /E:ON /C "{args}"'
super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo)
@@ -1150,14 +1151,14 @@ def extract_timezone(date_str):
timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
if timezone is not None:
date_str = date_str[:-len(m.group('tz'))]
- timezone = datetime.timedelta(hours=timezone or 0)
+ timezone = dt.timedelta(hours=timezone or 0)
else:
date_str = date_str[:-len(m.group('tz'))]
if not m.group('sign'):
- timezone = datetime.timedelta()
+ timezone = dt.timedelta()
else:
sign = 1 if m.group('sign') == '+' else -1
- timezone = datetime.timedelta(
+ timezone = dt.timedelta(
hours=sign * int(m.group('hours')),
minutes=sign * int(m.group('minutes')))
return timezone, date_str
@@ -1176,8 +1177,8 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):
with contextlib.suppress(ValueError):
date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
- dt = datetime.datetime.strptime(date_str, date_format) - timezone
- return calendar.timegm(dt.timetuple())
+ dt_ = dt.datetime.strptime(date_str, date_format) - timezone
+ return calendar.timegm(dt_.timetuple())
def date_formats(day_first=True):
@@ -1198,12 +1199,12 @@ def unified_strdate(date_str, day_first=True):
for expression in date_formats(day_first):
with contextlib.suppress(ValueError):
- upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
+ upload_date = dt.datetime.strptime(date_str, expression).strftime('%Y%m%d')
if upload_date is None:
timetuple = email.utils.parsedate_tz(date_str)
if timetuple:
with contextlib.suppress(ValueError):
- upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
+ upload_date = dt.datetime(*timetuple[:6]).strftime('%Y%m%d')
if upload_date is not None:
return str(upload_date)
@@ -1233,8 +1234,8 @@ def unified_timestamp(date_str, day_first=True):
for expression in date_formats(day_first):
with contextlib.suppress(ValueError):
- dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
- return calendar.timegm(dt.timetuple())
+ dt_ = dt.datetime.strptime(date_str, expression) - timezone + dt.timedelta(hours=pm_delta)
+ return calendar.timegm(dt_.timetuple())
timetuple = email.utils.parsedate_tz(date_str)
if timetuple:
@@ -1272,11 +1273,11 @@ def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
if precision == 'auto':
auto_precision = True
precision = 'microsecond'
- today = datetime_round(datetime.datetime.now(datetime.timezone.utc), precision)
+ today = datetime_round(dt.datetime.now(dt.timezone.utc), precision)
if date_str in ('now', 'today'):
return today
if date_str == 'yesterday':
- return today - datetime.timedelta(days=1)
+ return today - dt.timedelta(days=1)
match = re.match(
r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
date_str)
@@ -1291,13 +1292,13 @@ def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
if unit == 'week':
unit = 'day'
time *= 7
- delta = datetime.timedelta(**{unit + 's': time})
+ delta = dt.timedelta(**{unit + 's': time})
new_date = start_time + delta
if auto_precision:
return datetime_round(new_date, unit)
return new_date
- return datetime_round(datetime.datetime.strptime(date_str, format), precision)
+ return datetime_round(dt.datetime.strptime(date_str, format), precision)
def date_from_str(date_str, format='%Y%m%d', strict=False):
@@ -1312,21 +1313,21 @@ def date_from_str(date_str, format='%Y%m%d', strict=False):
return datetime_from_str(date_str, precision='microsecond', format=format).date()
-def datetime_add_months(dt, months):
+def datetime_add_months(dt_, months):
"""Increment/Decrement a datetime object by months."""
- month = dt.month + months - 1
- year = dt.year + month // 12
+ month = dt_.month + months - 1
+ year = dt_.year + month // 12
month = month % 12 + 1
- day = min(dt.day, calendar.monthrange(year, month)[1])
- return dt.replace(year, month, day)
+ day = min(dt_.day, calendar.monthrange(year, month)[1])
+ return dt_.replace(year, month, day)
-def datetime_round(dt, precision='day'):
+def datetime_round(dt_, precision='day'):
"""
Round a datetime object's time to a specific precision
"""
if precision == 'microsecond':
- return dt
+ return dt_
unit_seconds = {
'day': 86400,
@@ -1335,8 +1336,8 @@ def datetime_round(dt, precision='day'):
'second': 1,
}
roundto = lambda x, n: ((x + n / 2) // n) * n
- timestamp = roundto(calendar.timegm(dt.timetuple()), unit_seconds[precision])
- return datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
+ timestamp = roundto(calendar.timegm(dt_.timetuple()), unit_seconds[precision])
+ return dt.datetime.fromtimestamp(timestamp, dt.timezone.utc)
def hyphenate_date(date_str):
@@ -1357,11 +1358,11 @@ class DateRange:
if start is not None:
self.start = date_from_str(start, strict=True)
else:
- self.start = datetime.datetime.min.date()
+ self.start = dt.datetime.min.date()
if end is not None:
self.end = date_from_str(end, strict=True)
else:
- self.end = datetime.datetime.max.date()
+ self.end = dt.datetime.max.date()
if self.start > self.end:
raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
@@ -1372,7 +1373,7 @@ class DateRange:
def __contains__(self, date):
"""Check if the date is in the range"""
- if not isinstance(date, datetime.date):
+ if not isinstance(date, dt.date):
date = date_from_str(date)
return self.start <= date <= self.end
@@ -1637,15 +1638,38 @@ def get_filesystem_encoding():
return encoding if encoding is not None else 'utf-8'
-def shell_quote(args):
- quoted_args = []
- encoding = get_filesystem_encoding()
- for a in args:
- if isinstance(a, bytes):
- # We may get a filename encoded with 'encodeFilename'
- a = a.decode(encoding)
- quoted_args.append(compat_shlex_quote(a))
- return ' '.join(quoted_args)
+_WINDOWS_QUOTE_TRANS = str.maketrans({'"': '\\"', '\\': '\\\\'})
+_CMD_QUOTE_TRANS = str.maketrans({
+ # Keep quotes balanced by replacing them with `""` instead of `\\"`
+ '"': '""',
+ # Requires a variable `=` containing `"^\n\n"` (set in `utils.Popen`)
+ # `=` should be unique since variables containing `=` cannot be set using cmd
+ '\n': '%=%',
+ # While we are only required to escape backslashes immediately before quotes,
+ # we escape all of them anyway for consistency
+ '\\': '\\\\',
+ # Use zero length variable replacement so `%` doesn't get expanded
+ # `cd` is always set as long as extensions are enabled (`/E:ON` in `utils.Popen`)
+ '%': '%%cd:~,%',
+})
+
+
+def shell_quote(args, *, shell=False):
+ args = list(variadic(args))
+ if any(isinstance(item, bytes) for item in args):
+ deprecation_warning('Passing bytes to utils.shell_quote is deprecated')
+ encoding = get_filesystem_encoding()
+ for index, item in enumerate(args):
+ if isinstance(item, bytes):
+ args[index] = item.decode(encoding)
+
+ if compat_os_name != 'nt':
+ return shlex.join(args)
+
+ trans = _CMD_QUOTE_TRANS if shell else _WINDOWS_QUOTE_TRANS
+ return ' '.join(
+ s if re.fullmatch(r'[\w#$*\-+./:?@\\]+', s, re.ASCII) else s.translate(trans).join('""')
+ for s in args)
def smuggle_url(url, data):
@@ -1996,12 +2020,12 @@ def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
if isinstance(timestamp, (int, float)): # unix timestamp
# Using naive datetime here can break timestamp() in Windows
# Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
- # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
+ # Also, dt.datetime.fromtimestamp breaks for negative timestamps
# Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
- datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
- + datetime.timedelta(seconds=timestamp))
+ datetime_object = (dt.datetime.fromtimestamp(0, dt.timezone.utc)
+ + dt.timedelta(seconds=timestamp))
elif isinstance(timestamp, str): # assume YYYYMMDD
- datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
+ datetime_object = dt.datetime.strptime(timestamp, '%Y%m%d')
date_format = re.sub( # Support %s on windows
r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
return datetime_object.strftime(date_format)
@@ -2849,7 +2873,7 @@ def ytdl_is_updateable():
def args_to_str(args):
# Get a short string representation for a subprocess command
- return ' '.join(compat_shlex_quote(a) for a in args)
+ return shell_quote(args)
def error_to_str(err):
@@ -4490,10 +4514,10 @@ def write_xattr(path, key, value):
def random_birthday(year_field, month_field, day_field):
- start_date = datetime.date(1950, 1, 1)
- end_date = datetime.date(1995, 12, 31)
+ start_date = dt.date(1950, 1, 1)
+ end_date = dt.date(1995, 12, 31)
offset = random.randint(0, (end_date - start_date).days)
- random_date = start_date + datetime.timedelta(offset)
+ random_date = start_date + dt.timedelta(offset)
return {
year_field: str(random_date.year),
month_field: str(random_date.month),
@@ -4672,7 +4696,7 @@ def time_seconds(**kwargs):
"""
Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
"""
- return time.time() + datetime.timedelta(**kwargs).total_seconds()
+ return time.time() + dt.timedelta(**kwargs).total_seconds()
# create a JSON Web Signature (jws) with HS256 algorithm
@@ -5415,6 +5439,17 @@ class FormatSorter:
return tuple(self._calculate_field_preference(format, field) for field in self._order)
+def filesize_from_tbr(tbr, duration):
+ """
+ @param tbr: Total bitrate in kbps (1000 bits/sec)
+ @param duration: Duration in seconds
+ @returns Filesize in bytes
+ """
+ if tbr is None or duration is None:
+ return None
+ return int(duration * tbr * (1000 / 8))
+
+
# XXX: Temporary
class _YDLLogger:
def __init__(self, ydl=None):
diff --git a/yt_dlp/utils/traversal.py b/yt_dlp/utils/traversal.py
index 8938f4c..96eb2ed 100644
--- a/yt_dlp/utils/traversal.py
+++ b/yt_dlp/utils/traversal.py
@@ -1,5 +1,6 @@
import collections.abc
import contextlib
+import http.cookies
import inspect
import itertools
import re
@@ -28,7 +29,8 @@ def traverse_obj(
Each of the provided `paths` is tested and the first producing a valid result will be returned.
The next path will also be tested if the path branched but no results could be found.
- Supported values for traversal are `Mapping`, `Iterable` and `re.Match`.
+ Supported values for traversal are `Mapping`, `Iterable`, `re.Match`,
+ `xml.etree.ElementTree` (xpath) and `http.cookies.Morsel`.
Unhelpful values (`{}`, `None`) are treated as the absence of a value and discarded.
The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.
@@ -36,8 +38,8 @@ def traverse_obj(
The keys in the path can be one of:
- `None`: Return the current object.
- `set`: Requires the only item in the set to be a type or function,
- like `{type}`/`{func}`. If a `type`, returns only values
- of this type. If a function, returns `func(obj)`.
+ like `{type}`/`{type, type, ...}`/`{func}`. If a `type`, returns only
+ values of this type. If a function, returns `func(obj)`.
- `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`.
- `slice`: Branch out and return all values in `obj[key]`.
- `Ellipsis`: Branch out and return a list of all values.
@@ -48,8 +50,10 @@ def traverse_obj(
For `Iterable`s, `key` is the index of the value.
For `re.Match`es, `key` is the group number (0 = full match)
as well as additionally any group names, if given.
- - `dict` Transform the current object and return a matching dict.
+ - `dict`: Transform the current object and return a matching dict.
Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`.
+ - `any`-builtin: Take the first matching object and return it, resetting branching.
+ - `all`-builtin: Take all matching objects and return them as a list, resetting branching.
`tuple`, `list`, and `dict` all support nested paths and branches.
@@ -102,10 +106,10 @@ def traverse_obj(
result = obj
elif isinstance(key, set):
- assert len(key) == 1, 'Set should only be used to wrap a single item'
item = next(iter(key))
- if isinstance(item, type):
- if isinstance(obj, item):
+ if len(key) > 1 or isinstance(item, type):
+ assert all(isinstance(item, type) for item in key)
+ if isinstance(obj, tuple(key)):
result = obj
else:
result = try_call(item, args=(obj,))
@@ -117,6 +121,8 @@ def traverse_obj(
elif key is ...:
branching = True
+ if isinstance(obj, http.cookies.Morsel):
+ obj = dict(obj, key=obj.key, value=obj.value)
if isinstance(obj, collections.abc.Mapping):
result = obj.values()
elif is_iterable_like(obj) or isinstance(obj, xml.etree.ElementTree.Element):
@@ -131,6 +137,8 @@ def traverse_obj(
elif callable(key):
branching = True
+ if isinstance(obj, http.cookies.Morsel):
+ obj = dict(obj, key=obj.key, value=obj.value)
if isinstance(obj, collections.abc.Mapping):
iter_obj = obj.items()
elif is_iterable_like(obj) or isinstance(obj, xml.etree.ElementTree.Element):
@@ -157,6 +165,8 @@ def traverse_obj(
} or None
elif isinstance(obj, collections.abc.Mapping):
+ if isinstance(obj, http.cookies.Morsel):
+ obj = dict(obj, key=obj.key, value=obj.value)
result = (try_call(obj.get, args=(key,)) if casesense or try_call(obj.__contains__, args=(key,)) else
next((v for k, v in obj.items() if casefold(k) == key), None))
@@ -179,7 +189,7 @@ def traverse_obj(
elif isinstance(obj, xml.etree.ElementTree.Element) and isinstance(key, str):
xpath, _, special = key.rpartition('/')
- if not special.startswith('@') and special != 'text()':
+ if not special.startswith('@') and not special.endswith('()'):
xpath = key
special = None
@@ -198,7 +208,7 @@ def traverse_obj(
return try_call(element.attrib.get, args=(special[1:],))
if special == 'text()':
return element.text
- assert False, f'apply_specials is missing case for {special!r}'
+ raise SyntaxError(f'apply_specials is missing case for {special!r}')
if xpath:
result = list(map(apply_specials, obj.iterfind(xpath)))
@@ -228,6 +238,15 @@ def traverse_obj(
if not casesense and isinstance(key, str):
key = key.casefold()
+ if key in (any, all):
+ has_branched = False
+ filtered_objs = (obj for obj in objs if obj not in (None, {}))
+ if key is any:
+ objs = (next(filtered_objs, None),)
+ else:
+ objs = (list(filtered_objs),)
+ continue
+
if __debug__ and callable(key):
# Verify function signature
inspect.signature(key).bind(None, None)
diff --git a/yt_dlp/version.py b/yt_dlp/version.py
index 68c3f00..22c2c04 100644
--- a/yt_dlp/version.py
+++ b/yt_dlp/version.py
@@ -1,8 +1,8 @@
# Autogenerated by devscripts/update-version.py
-__version__ = '2024.03.10'
+__version__ = '2024.04.09'
-RELEASE_GIT_HEAD = '615a84447e8322720be77a0e64298d7f42848693'
+RELEASE_GIT_HEAD = 'ff07792676f404ffff6ee61b5638c9dc1a33a37a'
VARIANT = None
@@ -12,4 +12,4 @@ CHANNEL = 'stable'
ORIGIN = 'yt-dlp/yt-dlp'
-_pkg_version = '2024.03.10'
+_pkg_version = '2024.04.09'