summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-08-05 09:07:34 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-08-05 09:07:34 +0000
commit2b676a78b5a70e28999bf4901184b45858136706 (patch)
treeafa969209076a47b92a87c127b67e299a6e78c76
parentReleasing progress-linux version 2024.07.16-1~progress7.99u1. (diff)
downloadyt-dlp-2b676a78b5a70e28999bf4901184b45858136706.tar.xz
yt-dlp-2b676a78b5a70e28999bf4901184b45858136706.zip
Merging upstream version 2024.07.25.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
-rw-r--r--Changelog.md13
-rw-r--r--pyproject.toml2
-rw-r--r--test/test_youtube_signature.py4
-rw-r--r--yt_dlp/extractor/abematv.py75
-rw-r--r--yt_dlp/extractor/facebook.py24
-rw-r--r--yt_dlp/extractor/mlb.py225
-rw-r--r--yt_dlp/extractor/tiktok.py19
-rw-r--r--yt_dlp/extractor/vimeo.py51
-rw-r--r--yt_dlp/extractor/youtube.py9
-rw-r--r--yt_dlp/version.py6
10 files changed, 313 insertions, 115 deletions
diff --git a/Changelog.md b/Changelog.md
index 194d75e..b2cad7d 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -4,6 +4,19 @@
# To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master
-->
+### 2024.07.25
+
+#### Extractor changes
+- **abematv**: [Adapt key retrieval to request handler framework](https://github.com/yt-dlp/yt-dlp/commit/a3bab4752a2b3d56e5a59b4e0411bb8f695c010b) ([#10491](https://github.com/yt-dlp/yt-dlp/issues/10491)) by [bashonly](https://github.com/bashonly)
+- **facebook**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/1a34a802f44a1dab8f642c79c3cc810e21541d3b) ([#10531](https://github.com/yt-dlp/yt-dlp/issues/10531)) by [bashonly](https://github.com/bashonly)
+- **mlbtv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/f0993391e6052ec8f7aacc286609564f226943b9) ([#10515](https://github.com/yt-dlp/yt-dlp/issues/10515)) by [bashonly](https://github.com/bashonly)
+- **tiktok**: [Fix and deprioritize JSON subtitles](https://github.com/yt-dlp/yt-dlp/commit/2f97779f335ac069ecccd9c7bf81abf4a83cfe7a) ([#10516](https://github.com/yt-dlp/yt-dlp/issues/10516)) by [bashonly](https://github.com/bashonly)
+- **vimeo**: [Fix chapters extraction](https://github.com/yt-dlp/yt-dlp/commit/a0a1bc3d8d8e3bb9a48a06e835815a0460e90e77) ([#10544](https://github.com/yt-dlp/yt-dlp/issues/10544)) by [bashonly](https://github.com/bashonly)
+- **youtube**: [Fix `n` function name extraction for player `3400486c`](https://github.com/yt-dlp/yt-dlp/commit/713b4cd18f00556771af8cfdd9cea6cc1a09e948) ([#10542](https://github.com/yt-dlp/yt-dlp/issues/10542)) by [bashonly](https://github.com/bashonly)
+
+#### Misc. changes
+- **build**: [Pin `setuptools` version](https://github.com/yt-dlp/yt-dlp/commit/e046db8a116b1c320d4785daadd48ea0b22a3987) ([#10493](https://github.com/yt-dlp/yt-dlp/issues/10493)) by [bashonly](https://github.com/bashonly)
+
### 2024.07.16
#### Core changes
diff --git a/pyproject.toml b/pyproject.toml
index e4c06d2..d5480e1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -66,7 +66,7 @@ build = [
"build",
"hatchling",
"pip",
- "setuptools",
+ "setuptools>=71.0.2", # 71.0.0 broke pyinstaller
"wheel",
]
dev = [
diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py
index a14bef5..ae167d1 100644
--- a/test/test_youtube_signature.py
+++ b/test/test_youtube_signature.py
@@ -171,6 +171,10 @@ _NSIG_TESTS = [
'https://www.youtube.com/s/player/b22ef6e7/player_ias.vflset/en_US/base.js',
'b6HcntHGkvBLk_FRf', 'kNPW6A7FyP2l8A',
),
+ (
+ 'https://www.youtube.com/s/player/3400486c/player_ias.vflset/en_US/base.js',
+ 'lL46g3XifCKUZn1Xfw', 'z767lhet6V2Skl',
+ ),
]
diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py
index 9471df1..2611c6f 100644
--- a/yt_dlp/extractor/abematv.py
+++ b/yt_dlp/extractor/abematv.py
@@ -9,12 +9,12 @@ import re
import struct
import time
import urllib.parse
-import urllib.request
-import urllib.response
import uuid
from .common import InfoExtractor
from ..aes import aes_ecb_decrypt
+from ..networking import RequestHandler, Response
+from ..networking.exceptions import TransportError
from ..utils import (
ExtractorError,
OnDemandPagedList,
@@ -26,37 +26,36 @@ from ..utils import (
traverse_obj,
update_url_query,
)
-from ..utils.networking import clean_proxies
-
-
-def add_opener(ydl, handler): # FIXME: Create proper API in .networking
- """Add a handler for opening URLs, like _download_webpage"""
- # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
- # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
- rh = ydl._request_director.handlers['Urllib']
- if 'abematv-license' in rh._SUPPORTED_URL_SCHEMES:
- return
- headers = ydl.params['http_headers'].copy()
- proxies = ydl.proxies.copy()
- clean_proxies(proxies, headers)
- opener = rh._get_instance(cookiejar=ydl.cookiejar, proxies=proxies)
- assert isinstance(opener, urllib.request.OpenerDirector)
- opener.add_handler(handler)
- rh._SUPPORTED_URL_SCHEMES = (*rh._SUPPORTED_URL_SCHEMES, 'abematv-license')
-
-
-class AbemaLicenseHandler(urllib.request.BaseHandler):
- handler_order = 499
- STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
- HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E'
-
- def __init__(self, ie: 'AbemaTVIE'):
- # the protocol that this should really handle is 'abematv-license://'
- # abematv_license_open is just a placeholder for development purposes
- # ref. https://github.com/python/cpython/blob/f4c03484da59049eb62a9bf7777b963e2267d187/Lib/urllib/request.py#L510
- setattr(self, 'abematv-license_open', getattr(self, 'abematv_license_open', None))
+
+
+class AbemaLicenseRH(RequestHandler):
+ _SUPPORTED_URL_SCHEMES = ('abematv-license',)
+ _SUPPORTED_PROXY_SCHEMES = None
+ _SUPPORTED_FEATURES = None
+ RH_NAME = 'abematv_license'
+
+ _STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
+ _HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E'
+
+ def __init__(self, *, ie: 'AbemaTVIE', **kwargs):
+ super().__init__(**kwargs)
self.ie = ie
+ def _send(self, request):
+ url = request.url
+ ticket = urllib.parse.urlparse(url).netloc
+
+ try:
+ response_data = self._get_videokey_from_ticket(ticket)
+ except ExtractorError as e:
+ raise TransportError(cause=e.cause) from e
+ except (IndexError, KeyError, TypeError) as e:
+ raise TransportError(cause=repr(e)) from e
+
+ return Response(
+ io.BytesIO(response_data), url,
+ headers={'Content-Length': str(len(response_data))})
+
def _get_videokey_from_ticket(self, ticket):
to_show = self.ie.get_param('verbose', False)
media_token = self.ie._get_media_token(to_show=to_show)
@@ -72,25 +71,17 @@ class AbemaLicenseHandler(urllib.request.BaseHandler):
'Content-Type': 'application/json',
})
- res = decode_base_n(license_response['k'], table=self.STRTABLE)
+ res = decode_base_n(license_response['k'], table=self._STRTABLE)
encvideokey = bytes_to_intlist(struct.pack('>QQ', res >> 64, res & 0xffffffffffffffff))
h = hmac.new(
- binascii.unhexlify(self.HKEY),
+ binascii.unhexlify(self._HKEY),
(license_response['cid'] + self.ie._DEVICE_ID).encode(),
digestmod=hashlib.sha256)
enckey = bytes_to_intlist(h.digest())
return intlist_to_bytes(aes_ecb_decrypt(encvideokey, enckey))
- def abematv_license_open(self, url):
- url = url.get_full_url() if isinstance(url, urllib.request.Request) else url
- ticket = urllib.parse.urlparse(url).netloc
- response_data = self._get_videokey_from_ticket(ticket)
- return urllib.response.addinfourl(io.BytesIO(response_data), headers={
- 'Content-Length': str(len(response_data)),
- }, url=url, code=200)
-
class AbemaTVBaseIE(InfoExtractor):
_NETRC_MACHINE = 'abematv'
@@ -139,7 +130,7 @@ class AbemaTVBaseIE(InfoExtractor):
if self._USERTOKEN:
return self._USERTOKEN
- add_opener(self._downloader, AbemaLicenseHandler(self))
+ self._downloader._request_director.add_handler(AbemaLicenseRH(ie=self, logger=None))
username, _ = self._get_login_info()
auth_cache = username and self.cache.load(self._NETRC_MACHINE, username, min_ver='2024.01.19')
diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py
index a3ca291..6aba477 100644
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@@ -571,16 +571,21 @@ class FacebookIE(InfoExtractor):
# Formats larger than ~500MB will return error 403 unless chunk size is regulated
f.setdefault('downloader_options', {})['http_chunk_size'] = 250 << 20
- def extract_relay_data(_filter):
- return self._parse_json(self._search_regex(
- rf'data-sjs>({{.*?{_filter}.*?}})</script>',
- webpage, 'replay data', default='{}'), video_id, fatal=False) or {}
+ def yield_all_relay_data(_filter):
+ for relay_data in re.findall(rf'data-sjs>({{.*?{_filter}.*?}})</script>', webpage):
+ yield self._parse_json(relay_data, video_id, fatal=False) or {}
- def extract_relay_prefetched_data(_filter):
- return traverse_obj(extract_relay_data(_filter), (
- 'require', (None, (..., ..., ..., '__bbox', 'require')),
+ def extract_relay_data(_filter):
+ return next(filter(None, yield_all_relay_data(_filter)), {})
+
+ def extract_relay_prefetched_data(_filter, target_keys=None):
+ path = 'data'
+ if target_keys is not None:
+ path = lambda k, v: k == 'data' and any(target in v for target in variadic(target_keys))
+ return traverse_obj(yield_all_relay_data(_filter), (
+ ..., 'require', (None, (..., ..., ..., '__bbox', 'require')),
lambda _, v: any(key.startswith('RelayPrefetchedStreamCache') for key in v),
- ..., ..., '__bbox', 'result', 'data', {dict}), get_all=False) or {}
+ ..., ..., '__bbox', 'result', path, {dict}), get_all=False) or {}
if not video_data:
server_js_data = self._parse_json(self._search_regex([
@@ -591,7 +596,8 @@ class FacebookIE(InfoExtractor):
if not video_data:
data = extract_relay_prefetched_data(
- r'"(?:dash_manifest|playable_url(?:_quality_hd)?)')
+ r'"(?:dash_manifest|playable_url(?:_quality_hd)?)',
+ target_keys=('video', 'event', 'nodes', 'node', 'mediaset'))
if data:
entries = []
diff --git a/yt_dlp/extractor/mlb.py b/yt_dlp/extractor/mlb.py
index 6f67602..230c218 100644
--- a/yt_dlp/extractor/mlb.py
+++ b/yt_dlp/extractor/mlb.py
@@ -1,16 +1,21 @@
+import json
import re
-import urllib.parse
+import time
import uuid
from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
from ..utils import (
+ ExtractorError,
determine_ext,
int_or_none,
join_nonempty,
+ jwt_decode_hs256,
parse_duration,
parse_iso8601,
try_get,
url_or_none,
+ urlencode_postdata,
)
from ..utils.traversal import traverse_obj
@@ -276,81 +281,213 @@ class MLBVideoIE(MLBBaseIE):
class MLBTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?mlb\.com/tv/g(?P<id>\d{6})'
_NETRC_MACHINE = 'mlb'
-
_TESTS = [{
'url': 'https://www.mlb.com/tv/g661581/vee2eff5f-a7df-4c20-bdb4-7b926fa12638',
'info_dict': {
'id': '661581',
'ext': 'mp4',
'title': '2022-07-02 - St. Louis Cardinals @ Philadelphia Phillies',
+ 'release_date': '20220702',
+ 'release_timestamp': 1656792300,
},
'params': {
'skip_download': True,
},
}]
+ _GRAPHQL_INIT_QUERY = '''\
+mutation initSession($device: InitSessionInput!, $clientType: ClientType!, $experience: ExperienceTypeInput) {
+ initSession(device: $device, clientType: $clientType, experience: $experience) {
+ deviceId
+ sessionId
+ entitlements {
+ code
+ }
+ location {
+ countryCode
+ regionName
+ zipCode
+ latitude
+ longitude
+ }
+ clientExperience
+ features
+ }
+ }'''
+ _GRAPHQL_PLAYBACK_QUERY = '''\
+mutation initPlaybackSession(
+ $adCapabilities: [AdExperienceType]
+ $mediaId: String!
+ $deviceId: String!
+ $sessionId: String!
+ $quality: PlaybackQuality
+ ) {
+ initPlaybackSession(
+ adCapabilities: $adCapabilities
+ mediaId: $mediaId
+ deviceId: $deviceId
+ sessionId: $sessionId
+ quality: $quality
+ ) {
+ playbackSessionId
+ playback {
+ url
+ token
+ expiration
+ cdn
+ }
+ }
+ }'''
+ _APP_VERSION = '7.8.2'
+ _device_id = None
+ _session_id = None
_access_token = None
+ _token_expiry = 0
+
+ @property
+ def _api_headers(self):
+ if (self._token_expiry - 120) <= time.time():
+ self.write_debug('Access token has expired; re-logging in')
+ self._perform_login(*self._get_login_info())
+ return {'Authorization': f'Bearer {self._access_token}'}
def _real_initialize(self):
if not self._access_token:
self.raise_login_required(
'All videos are only available to registered users', method='password')
- def _perform_login(self, username, password):
- data = f'grant_type=password&username={urllib.parse.quote(username)}&password={urllib.parse.quote(password)}&scope=openid offline_access&client_id=0oa3e1nutA1HLzAKG356'
- access_token = self._download_json(
- 'https://ids.mlb.com/oauth2/aus1m088yK07noBfh356/v1/token', None,
- headers={
- 'User-Agent': 'okhttp/3.12.1',
- 'Content-Type': 'application/x-www-form-urlencoded',
- }, data=data.encode())['access_token']
+ def _set_device_id(self, username):
+ if not self._device_id:
+ self._device_id = self.cache.load(
+ self._NETRC_MACHINE, 'device_ids', default={}).get(username)
+ if self._device_id:
+ return
+ self._device_id = str(uuid.uuid4())
+ self.cache.store(self._NETRC_MACHINE, 'device_ids', {username: self._device_id})
- entitlement = self._download_webpage(
- f'https://media-entitlement.mlb.com/api/v3/jwt?os=Android&appname=AtBat&did={uuid.uuid4()}', None,
- headers={
- 'User-Agent': 'okhttp/3.12.1',
- 'Authorization': f'Bearer {access_token}',
- })
+ def _perform_login(self, username, password):
+ try:
+ self._access_token = self._download_json(
+ 'https://ids.mlb.com/oauth2/aus1m088yK07noBfh356/v1/token', None,
+ 'Logging in', 'Unable to log in', headers={
+ 'User-Agent': 'okhttp/3.12.1',
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ }, data=urlencode_postdata({
+ 'grant_type': 'password',
+ 'username': username,
+ 'password': password,
+ 'scope': 'openid offline_access',
+ 'client_id': '0oa3e1nutA1HLzAKG356',
+ }))['access_token']
+ except ExtractorError as error:
+ if isinstance(error.cause, HTTPError) and error.cause.status == 400:
+ raise ExtractorError('Invalid username or password', expected=True)
+ raise
+
+ self._token_expiry = traverse_obj(self._access_token, ({jwt_decode_hs256}, 'exp', {int})) or 0
+ self._set_device_id(username)
+
+ self._session_id = self._call_api({
+ 'operationName': 'initSession',
+ 'query': self._GRAPHQL_INIT_QUERY,
+ 'variables': {
+ 'device': {
+ 'appVersion': self._APP_VERSION,
+ 'deviceFamily': 'desktop',
+ 'knownDeviceId': self._device_id,
+ 'languagePreference': 'ENGLISH',
+ 'manufacturer': '',
+ 'model': '',
+ 'os': '',
+ 'osVersion': '',
+ },
+ 'clientType': 'WEB',
+ },
+ }, None, 'session ID')['data']['initSession']['sessionId']
- data = f'grant_type=urn:ietf:params:oauth:grant-type:token-exchange&subject_token={entitlement}&subject_token_type=urn:ietf:params:oauth:token-type:jwt&platform=android-tv'
- self._access_token = self._download_json(
- 'https://us.edge.bamgrid.com/token', None,
+ def _call_api(self, data, video_id, description='GraphQL JSON', fatal=True):
+ return self._download_json(
+ 'https://media-gateway.mlb.com/graphql', video_id,
+ f'Downloading {description}', f'Unable to download {description}', fatal=fatal,
headers={
+ **self._api_headers,
'Accept': 'application/json',
- 'Authorization': 'Bearer bWxidHYmYW5kcm9pZCYxLjAuMA.6LZMbH2r--rbXcgEabaDdIslpo4RyZrlVfWZhsAgXIk',
- 'Content-Type': 'application/x-www-form-urlencoded',
- }, data=data.encode())['access_token']
+ 'Content-Type': 'application/json',
+ 'x-client-name': 'WEB',
+ 'x-client-version': self._APP_VERSION,
+ }, data=json.dumps(data, separators=(',', ':')).encode())
+
+ def _extract_formats_and_subtitles(self, broadcast, video_id):
+ feed = traverse_obj(broadcast, ('homeAway', {str.title}))
+ medium = traverse_obj(broadcast, ('type', {str}))
+ language = traverse_obj(broadcast, ('language', {str.lower}))
+ format_id = join_nonempty(feed, medium, language)
+
+ response = self._call_api({
+ 'operationName': 'initPlaybackSession',
+ 'query': self._GRAPHQL_PLAYBACK_QUERY,
+ 'variables': {
+ 'adCapabilities': ['GOOGLE_STANDALONE_AD_PODS'],
+ 'deviceId': self._device_id,
+ 'mediaId': broadcast['mediaId'],
+ 'quality': 'PLACEHOLDER',
+ 'sessionId': self._session_id,
+ },
+ }, video_id, f'{format_id} broadcast JSON', fatal=False)
+
+ playback = traverse_obj(response, ('data', 'initPlaybackSession', 'playback', {dict}))
+ m3u8_url = traverse_obj(playback, ('url', {url_or_none}))
+ token = traverse_obj(playback, ('token', {str}))
+
+ if not (m3u8_url and token):
+ errors = '; '.join(traverse_obj(response, ('errors', ..., 'message', {str})))
+ if 'not entitled' in errors:
+ raise ExtractorError(errors, expected=True)
+ elif errors: # Only warn when 'blacked out' since radio formats are available
+ self.report_warning(f'API returned errors for {format_id}: {errors}')
+ else:
+ self.report_warning(f'No formats available for {format_id} broadcast; skipping')
+ return [], {}
+
+ cdn_headers = {'x-cdn-token': token}
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
+ m3u8_url.replace(f'/{token}/', '/'), video_id, 'mp4',
+ m3u8_id=format_id, fatal=False, headers=cdn_headers)
+ for fmt in fmts:
+ fmt['http_headers'] = cdn_headers
+ fmt.setdefault('format_note', join_nonempty(feed, medium, delim=' '))
+ fmt.setdefault('language', language)
+ if fmt.get('vcodec') == 'none' and fmt['language'] == 'en':
+ fmt['source_preference'] = 10
+
+ return fmts, subs
def _real_extract(self, url):
video_id = self._match_id(url)
- airings = self._download_json(
- f'https://search-api-mlbtv.mlb.com/svc/search/v2/graphql/persisted/query/core/Airings?variables=%7B%22partnerProgramIds%22%3A%5B%22{video_id}%22%5D%2C%22applyEsniMediaRightsLabels%22%3Atrue%7D',
- video_id)['data']['Airings']
+ metadata = traverse_obj(self._download_json(
+ 'https://statsapi.mlb.com/api/v1/schedule', video_id, query={
+ 'gamePk': video_id,
+ 'hydrate': 'broadcasts(all),statusFlags',
+ }), ('dates', ..., 'games', lambda _, v: str(v['gamePk']) == video_id and v['broadcasts'], any))
+
+ broadcasts = traverse_obj(metadata, (
+ 'broadcasts', lambda _, v: v['mediaId'] and v['mediaState']['mediaStateCode'] != 'MEDIA_OFF'))
formats, subtitles = [], {}
- for airing in traverse_obj(airings, lambda _, v: v['playbackUrls'][0]['href']):
- format_id = join_nonempty('feedType', 'feedLanguage', from_dict=airing)
- m3u8_url = traverse_obj(self._download_json(
- airing['playbackUrls'][0]['href'].format(scenario='browser~csai'), video_id,
- note=f'Downloading {format_id} stream info JSON',
- errnote=f'Failed to download {format_id} stream info, skipping',
- fatal=False, headers={
- 'Authorization': self._access_token,
- 'Accept': 'application/vnd.media-service+json; version=2',
- }), ('stream', 'complete', {url_or_none}))
- if not m3u8_url:
- continue
- f, s = self._extract_m3u8_formats_and_subtitles(
- m3u8_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)
- formats.extend(f)
- self._merge_subtitles(s, target=subtitles)
+ for broadcast in broadcasts:
+ fmts, subs = self._extract_formats_and_subtitles(broadcast, video_id)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
return {
'id': video_id,
- 'title': traverse_obj(airings, (..., 'titles', 0, 'episodeName'), get_all=False),
- 'is_live': traverse_obj(airings, (..., 'mediaConfig', 'productType'), get_all=False) == 'LIVE',
+ 'title': join_nonempty(
+ traverse_obj(metadata, ('officialDate', {str})),
+ traverse_obj(metadata, ('teams', ('away', 'home'), 'team', 'name', {str}, all, {' @ '.join})),
+ delim=' - '),
+ 'is_live': traverse_obj(broadcasts, (..., 'mediaState', 'mediaStateCode', {str}, any)) == 'MEDIA_ON',
+ 'release_timestamp': traverse_obj(metadata, ('gameDate', {parse_iso8601})),
'formats': formats,
'subtitles': subtitles,
- 'http_headers': {'Authorization': f'Bearer {self._access_token}'},
}
diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py
index aa1dcec..9d823a3 100644
--- a/yt_dlp/extractor/tiktok.py
+++ b/yt_dlp/extractor/tiktok.py
@@ -23,7 +23,6 @@ from ..utils import (
mimetype2ext,
parse_qs,
qualities,
- remove_start,
srt_subtitles_timecode,
str_or_none,
traverse_obj,
@@ -254,7 +253,16 @@ class TikTokBaseIE(InfoExtractor):
def _get_subtitles(self, aweme_detail, aweme_id, user_name):
# TODO: Extract text positioning info
+
+ EXT_MAP = { # From lowest to highest preference
+ 'creator_caption': 'json',
+ 'srt': 'srt',
+ 'webvtt': 'vtt',
+ }
+ preference = qualities(tuple(EXT_MAP.values()))
+
subtitles = {}
+
# aweme/detail endpoint subs
captions_info = traverse_obj(
aweme_detail, ('interaction_stickers', ..., 'auto_video_caption_info', 'auto_captions', ...), expected_type=dict)
@@ -278,8 +286,8 @@ class TikTokBaseIE(InfoExtractor):
if not caption.get('url'):
continue
subtitles.setdefault(caption.get('lang') or 'en', []).append({
- 'ext': remove_start(caption.get('caption_format'), 'web'),
'url': caption['url'],
+ 'ext': EXT_MAP.get(caption.get('Format')),
})
# webpage subs
if not subtitles:
@@ -288,9 +296,14 @@ class TikTokBaseIE(InfoExtractor):
self._create_url(user_name, aweme_id), aweme_id, fatal=False)
for caption in traverse_obj(aweme_detail, ('video', 'subtitleInfos', lambda _, v: v['Url'])):
subtitles.setdefault(caption.get('LanguageCodeName') or 'en', []).append({
- 'ext': remove_start(caption.get('Format'), 'web'),
'url': caption['Url'],
+ 'ext': EXT_MAP.get(caption.get('Format')),
})
+
+ # Deprioritize creator_caption json since it can't be embedded or used by media players
+ for lang, subs_list in subtitles.items():
+ subtitles[lang] = sorted(subs_list, key=lambda x: preference(x['ext']))
+
return subtitles
def _parse_url_key(self, url_key):
diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py
index 18eb084..d10689c 100644
--- a/yt_dlp/extractor/vimeo.py
+++ b/yt_dlp/extractor/vimeo.py
@@ -212,16 +212,6 @@ class VimeoBaseInfoExtractor(InfoExtractor):
owner = video_data.get('owner') or {}
video_uploader_url = owner.get('url')
- duration = int_or_none(video_data.get('duration'))
- chapter_data = try_get(config, lambda x: x['embed']['chapters']) or []
- chapters = [{
- 'title': current_chapter.get('title'),
- 'start_time': current_chapter.get('timecode'),
- 'end_time': next_chapter.get('timecode'),
- } for current_chapter, next_chapter in zip(chapter_data, chapter_data[1:] + [{'timecode': duration}])]
- if chapters and chapters[0]['start_time']: # Chapters may not start from 0
- chapters[:0] = [{'title': '<Untitled>', 'start_time': 0, 'end_time': chapters[0]['start_time']}]
-
return {
'id': str_or_none(video_data.get('id')) or video_id,
'title': video_title,
@@ -229,8 +219,12 @@ class VimeoBaseInfoExtractor(InfoExtractor):
'uploader_id': video_uploader_url.split('/')[-1] if video_uploader_url else None,
'uploader_url': video_uploader_url,
'thumbnails': thumbnails,
- 'duration': duration,
- 'chapters': chapters or None,
+ 'duration': int_or_none(video_data.get('duration')),
+ 'chapters': sorted(traverse_obj(config, (
+ 'embed', 'chapters', lambda _, v: int(v['timecode']) is not None, {
+ 'title': ('title', {str}),
+ 'start_time': ('timecode', {int_or_none}),
+ })), key=lambda c: c['start_time']) or None,
'formats': formats,
'subtitles': subtitles,
'live_status': live_status,
@@ -709,6 +703,39 @@ class VimeoIE(VimeoBaseInfoExtractor):
},
},
{
+ # chapters must be sorted, see: https://github.com/yt-dlp/yt-dlp/issues/5308
+ 'url': 'https://player.vimeo.com/video/756714419',
+ 'info_dict': {
+ 'id': '756714419',
+ 'ext': 'mp4',
+ 'title': 'Dr Arielle Schwartz - Therapeutic yoga for optimum sleep',
+ 'uploader': 'Alex Howard',
+ 'uploader_id': 'user54729178',
+ 'uploader_url': 'https://vimeo.com/user54729178',
+ 'thumbnail': r're:https://i\.vimeocdn\.com/video/1520099929-[\da-f]+-d_1280',
+ 'duration': 2636,
+ 'chapters': [
+ {'start_time': 0, 'end_time': 10, 'title': '<Untitled Chapter 1>'},
+ {'start_time': 10, 'end_time': 106, 'title': 'Welcoming Dr Arielle Schwartz'},
+ {'start_time': 106, 'end_time': 305, 'title': 'What is therapeutic yoga?'},
+ {'start_time': 305, 'end_time': 594, 'title': 'Vagal toning practices'},
+ {'start_time': 594, 'end_time': 888, 'title': 'Trauma and difficulty letting go'},
+ {'start_time': 888, 'end_time': 1059, 'title': "Dr Schwartz' insomnia experience"},
+ {'start_time': 1059, 'end_time': 1471, 'title': 'A strategy for helping sleep issues'},
+ {'start_time': 1471, 'end_time': 1667, 'title': 'Yoga nidra'},
+ {'start_time': 1667, 'end_time': 2121, 'title': 'Wisdom in stillness'},
+ {'start_time': 2121, 'end_time': 2386, 'title': 'What helps us be more able to let go?'},
+ {'start_time': 2386, 'end_time': 2510, 'title': 'Practical tips to help ourselves'},
+ {'start_time': 2510, 'end_time': 2636, 'title': 'Where to find out more'},
+ ],
+ },
+ 'params': {
+ 'http_headers': {'Referer': 'https://sleepsuperconference.com'},
+ 'skip_download': 'm3u8',
+ },
+ 'expected_warnings': ['Failed to parse XML: not well-formed'],
+ },
+ {
# user playlist alias -> https://vimeo.com/258705797
'url': 'https://vimeo.com/user26785108/newspiritualguide',
'only_matching': True,
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index 53aca38..7364e8a 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -3131,7 +3131,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _extract_n_function_name(self, jscode):
funcname, idx = self._search_regex(
- r'''(?x)(?:\.get\("n"\)\)&&\(b=|b=String\.fromCharCode\(110\),c=a\.get\(b\)\)&&\(c=)
+ r'''(?x)
+ (?:
+ \.get\("n"\)\)&&\(b=|
+ (?:
+ b=String\.fromCharCode\(110\)|
+ ([a-zA-Z0-9$.]+)&&\(b="nn"\[\+\1\]
+ ),c=a\.get\(b\)\)&&\(c=
+ )
(?P<nfunc>[a-zA-Z0-9$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z0-9]\)''',
jscode, 'Initial JS player n function name', group=('nfunc', 'idx'))
if not idx:
diff --git a/yt_dlp/version.py b/yt_dlp/version.py
index db5b342..e641bf5 100644
--- a/yt_dlp/version.py
+++ b/yt_dlp/version.py
@@ -1,8 +1,8 @@
# Autogenerated by devscripts/update-version.py
-__version__ = '2024.07.16'
+__version__ = '2024.07.25'
-RELEASE_GIT_HEAD = '89a161e8c62569a662deda1c948664152efcb6b4'
+RELEASE_GIT_HEAD = 'f0993391e6052ec8f7aacc286609564f226943b9'
VARIANT = None
@@ -12,4 +12,4 @@ CHANNEL = 'stable'
ORIGIN = 'yt-dlp/yt-dlp'
-_pkg_version = '2024.07.16'
+_pkg_version = '2024.07.25'