Merging upstream version 2024.07.25.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-08-05 09:07:34 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-08-05 09:07:34 +0000
commit: 2b676a78b5a70e28999bf4901184b45858136706 (patch)
tree: afa969209076a47b92a87c127b67e299a6e78c76
parent: Releasing progress-linux version 2024.07.16-1~progress7.99u1. (diff)
download: yt-dlp-2b676a78b5a70e28999bf4901184b45858136706.tar.xz
yt-dlp-2b676a78b5a70e28999bf4901184b45858136706.zip
10 files changed, 313 insertions, 115 deletions
diff --git a/Changelog.md b/Changelog.md
index 194d75e..b2cad7d 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -4,6 +4,19 @@
 # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master
 -->
 
+### 2024.07.25
+
+#### Extractor changes
+- **abematv**: [Adapt key retrieval to request handler framework](https://github.com/yt-dlp/yt-dlp/commit/a3bab4752a2b3d56e5a59b4e0411bb8f695c010b) ([#10491](https://github.com/yt-dlp/yt-dlp/issues/10491)) by [bashonly](https://github.com/bashonly)
+- **facebook**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/1a34a802f44a1dab8f642c79c3cc810e21541d3b) ([#10531](https://github.com/yt-dlp/yt-dlp/issues/10531)) by [bashonly](https://github.com/bashonly)
+- **mlbtv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/f0993391e6052ec8f7aacc286609564f226943b9) ([#10515](https://github.com/yt-dlp/yt-dlp/issues/10515)) by [bashonly](https://github.com/bashonly)
+- **tiktok**: [Fix and deprioritize JSON subtitles](https://github.com/yt-dlp/yt-dlp/commit/2f97779f335ac069ecccd9c7bf81abf4a83cfe7a) ([#10516](https://github.com/yt-dlp/yt-dlp/issues/10516)) by [bashonly](https://github.com/bashonly)
+- **vimeo**: [Fix chapters extraction](https://github.com/yt-dlp/yt-dlp/commit/a0a1bc3d8d8e3bb9a48a06e835815a0460e90e77) ([#10544](https://github.com/yt-dlp/yt-dlp/issues/10544)) by [bashonly](https://github.com/bashonly)
+- **youtube**: [Fix `n` function name extraction for player `3400486c`](https://github.com/yt-dlp/yt-dlp/commit/713b4cd18f00556771af8cfdd9cea6cc1a09e948) ([#10542](https://github.com/yt-dlp/yt-dlp/issues/10542)) by [bashonly](https://github.com/bashonly)
+
+#### Misc. changes
+- **build**: [Pin `setuptools` version](https://github.com/yt-dlp/yt-dlp/commit/e046db8a116b1c320d4785daadd48ea0b22a3987) ([#10493](https://github.com/yt-dlp/yt-dlp/issues/10493)) by [bashonly](https://github.com/bashonly)
+
 ### 2024.07.16
 
 #### Core changes
diff --git a/pyproject.toml b/pyproject.toml
index e4c06d2..d5480e1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -66,7 +66,7 @@ build = [
     "build",
     "hatchling",
     "pip",
-    "setuptools",
+    "setuptools>=71.0.2",  # 71.0.0 broke pyinstaller
     "wheel",
 ]
 dev = [
diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py
index a14bef5..ae167d1 100644
--- a/test/test_youtube_signature.py
+++ b/test/test_youtube_signature.py
@@ -171,6 +171,10 @@ _NSIG_TESTS = [
         'https://www.youtube.com/s/player/b22ef6e7/player_ias.vflset/en_US/base.js',
         'b6HcntHGkvBLk_FRf', 'kNPW6A7FyP2l8A',
     ),
+    (
+        'https://www.youtube.com/s/player/3400486c/player_ias.vflset/en_US/base.js',
+        'lL46g3XifCKUZn1Xfw', 'z767lhet6V2Skl',
+    ),
 ]
 
 
diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py
index 9471df1..2611c6f 100644
--- a/yt_dlp/extractor/abematv.py
+++ b/yt_dlp/extractor/abematv.py
@@ -9,12 +9,12 @@ import re
 import struct
 import time
 import urllib.parse
-import urllib.request
-import urllib.response
 import uuid
 
 from .common import InfoExtractor
 from ..aes import aes_ecb_decrypt
+from ..networking import RequestHandler, Response
+from ..networking.exceptions import TransportError
 from ..utils import (
     ExtractorError,
     OnDemandPagedList,
@@ -26,37 +26,36 @@ from ..utils import (
     traverse_obj,
     update_url_query,
 )
-from ..utils.networking import clean_proxies
-
-
-def add_opener(ydl, handler):  # FIXME: Create proper API in .networking
-    """Add a handler for opening URLs, like _download_webpage"""
-    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
-    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
-    rh = ydl._request_director.handlers['Urllib']
-    if 'abematv-license' in rh._SUPPORTED_URL_SCHEMES:
-        return
-    headers = ydl.params['http_headers'].copy()
-    proxies = ydl.proxies.copy()
-    clean_proxies(proxies, headers)
-    opener = rh._get_instance(cookiejar=ydl.cookiejar, proxies=proxies)
-    assert isinstance(opener, urllib.request.OpenerDirector)
-    opener.add_handler(handler)
-    rh._SUPPORTED_URL_SCHEMES = (*rh._SUPPORTED_URL_SCHEMES, 'abematv-license')
-
-
-class AbemaLicenseHandler(urllib.request.BaseHandler):
-    handler_order = 499
-    STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
-    HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E'
-
-    def __init__(self, ie: 'AbemaTVIE'):
-        # the protocol that this should really handle is 'abematv-license://'
-        # abematv_license_open is just a placeholder for development purposes
-        # ref. https://github.com/python/cpython/blob/f4c03484da59049eb62a9bf7777b963e2267d187/Lib/urllib/request.py#L510
-        setattr(self, 'abematv-license_open', getattr(self, 'abematv_license_open', None))
+
+
+class AbemaLicenseRH(RequestHandler):
+    _SUPPORTED_URL_SCHEMES = ('abematv-license',)
+    _SUPPORTED_PROXY_SCHEMES = None
+    _SUPPORTED_FEATURES = None
+    RH_NAME = 'abematv_license'
+
+    _STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
+    _HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E'
+
+    def __init__(self, *, ie: 'AbemaTVIE', **kwargs):
+        super().__init__(**kwargs)
         self.ie = ie
 
+    def _send(self, request):
+        url = request.url
+        ticket = urllib.parse.urlparse(url).netloc
+
+        try:
+            response_data = self._get_videokey_from_ticket(ticket)
+        except ExtractorError as e:
+            raise TransportError(cause=e.cause) from e
+        except (IndexError, KeyError, TypeError) as e:
+            raise TransportError(cause=repr(e)) from e
+
+        return Response(
+            io.BytesIO(response_data), url,
+            headers={'Content-Length': str(len(response_data))})
+
     def _get_videokey_from_ticket(self, ticket):
         to_show = self.ie.get_param('verbose', False)
         media_token = self.ie._get_media_token(to_show=to_show)
@@ -72,25 +71,17 @@ class AbemaLicenseHandler(urllib.request.BaseHandler):
                 'Content-Type': 'application/json',
             })
 
-        res = decode_base_n(license_response['k'], table=self.STRTABLE)
+        res = decode_base_n(license_response['k'], table=self._STRTABLE)
         encvideokey = bytes_to_intlist(struct.pack('>QQ', res >> 64, res & 0xffffffffffffffff))
 
         h = hmac.new(
-            binascii.unhexlify(self.HKEY),
+            binascii.unhexlify(self._HKEY),
             (license_response['cid'] + self.ie._DEVICE_ID).encode(),
             digestmod=hashlib.sha256)
         enckey = bytes_to_intlist(h.digest())
 
         return intlist_to_bytes(aes_ecb_decrypt(encvideokey, enckey))
 
-    def abematv_license_open(self, url):
-        url = url.get_full_url() if isinstance(url, urllib.request.Request) else url
-        ticket = urllib.parse.urlparse(url).netloc
-        response_data = self._get_videokey_from_ticket(ticket)
-        return urllib.response.addinfourl(io.BytesIO(response_data), headers={
-            'Content-Length': str(len(response_data)),
-        }, url=url, code=200)
-
 
 class AbemaTVBaseIE(InfoExtractor):
     _NETRC_MACHINE = 'abematv'
@@ -139,7 +130,7 @@ class AbemaTVBaseIE(InfoExtractor):
         if self._USERTOKEN:
             return self._USERTOKEN
 
-        add_opener(self._downloader, AbemaLicenseHandler(self))
+        self._downloader._request_director.add_handler(AbemaLicenseRH(ie=self, logger=None))
 
         username, _ = self._get_login_info()
         auth_cache = username and self.cache.load(self._NETRC_MACHINE, username, min_ver='2024.01.19')
diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py
index a3ca291..6aba477 100644
--- a/yt_dlp/extractor/facebook.py
+++ b/yt_dlp/extractor/facebook.py
@@ -571,16 +571,21 @@ class FacebookIE(InfoExtractor):
                 # Formats larger than ~500MB will return error 403 unless chunk size is regulated
                 f.setdefault('downloader_options', {})['http_chunk_size'] = 250 << 20
 
-        def extract_relay_data(_filter):
-            return self._parse_json(self._search_regex(
-                rf'data-sjs>({{.*?{_filter}.*?}})</script>',
-                webpage, 'replay data', default='{}'), video_id, fatal=False) or {}
+        def yield_all_relay_data(_filter):
+            for relay_data in re.findall(rf'data-sjs>({{.*?{_filter}.*?}})</script>', webpage):
+                yield self._parse_json(relay_data, video_id, fatal=False) or {}
 
-        def extract_relay_prefetched_data(_filter):
-            return traverse_obj(extract_relay_data(_filter), (
-                'require', (None, (..., ..., ..., '__bbox', 'require')),
+        def extract_relay_data(_filter):
+            return next(filter(None, yield_all_relay_data(_filter)), {})
+
+        def extract_relay_prefetched_data(_filter, target_keys=None):
+            path = 'data'
+            if target_keys is not None:
+                path = lambda k, v: k == 'data' and any(target in v for target in variadic(target_keys))
+            return traverse_obj(yield_all_relay_data(_filter), (
+                ..., 'require', (None, (..., ..., ..., '__bbox', 'require')),
                 lambda _, v: any(key.startswith('RelayPrefetchedStreamCache') for key in v),
-                ..., ..., '__bbox', 'result', 'data', {dict}), get_all=False) or {}
+                ..., ..., '__bbox', 'result', path, {dict}), get_all=False) or {}
 
         if not video_data:
             server_js_data = self._parse_json(self._search_regex([
@@ -591,7 +596,8 @@ class FacebookIE(InfoExtractor):
 
         if not video_data:
             data = extract_relay_prefetched_data(
-                r'"(?:dash_manifest|playable_url(?:_quality_hd)?)')
+                r'"(?:dash_manifest|playable_url(?:_quality_hd)?)',
+                target_keys=('video', 'event', 'nodes', 'node', 'mediaset'))
             if data:
                 entries = []
 
diff --git a/yt_dlp/extractor/mlb.py b/yt_dlp/extractor/mlb.py
index 6f67602..230c218 100644
--- a/yt_dlp/extractor/mlb.py
+++ b/yt_dlp/extractor/mlb.py
@@ -1,16 +1,21 @@
+import json
 import re
-import urllib.parse
+import time
 import uuid
 
 from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
 from ..utils import (
+    ExtractorError,
     determine_ext,
     int_or_none,
     join_nonempty,
+    jwt_decode_hs256,
     parse_duration,
     parse_iso8601,
     try_get,
     url_or_none,
+    urlencode_postdata,
 )
 from ..utils.traversal import traverse_obj
 
@@ -276,81 +281,213 @@ class MLBVideoIE(MLBBaseIE):
 class MLBTVIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?mlb\.com/tv/g(?P<id>\d{6})'
     _NETRC_MACHINE = 'mlb'
-
     _TESTS = [{
         'url': 'https://www.mlb.com/tv/g661581/vee2eff5f-a7df-4c20-bdb4-7b926fa12638',
         'info_dict': {
             'id': '661581',
             'ext': 'mp4',
             'title': '2022-07-02 - St. Louis Cardinals @ Philadelphia Phillies',
+            'release_date': '20220702',
+            'release_timestamp': 1656792300,
         },
         'params': {
             'skip_download': True,
         },
     }]
+    _GRAPHQL_INIT_QUERY = '''\
+mutation initSession($device: InitSessionInput!, $clientType: ClientType!, $experience: ExperienceTypeInput) {
+    initSession(device: $device, clientType: $clientType, experience: $experience) {
+        deviceId
+        sessionId
+        entitlements {
+            code
+        }
+        location {
+            countryCode
+            regionName
+            zipCode
+            latitude
+            longitude
+        }
+        clientExperience
+        features
+    }
+  }'''
+    _GRAPHQL_PLAYBACK_QUERY = '''\
+mutation initPlaybackSession(
+        $adCapabilities: [AdExperienceType]
+        $mediaId: String!
+        $deviceId: String!
+        $sessionId: String!
+        $quality: PlaybackQuality
+    ) {
+        initPlaybackSession(
+            adCapabilities: $adCapabilities
+            mediaId: $mediaId
+            deviceId: $deviceId
+            sessionId: $sessionId
+            quality: $quality
+        ) {
+            playbackSessionId
+            playback {
+                url
+                token
+                expiration
+                cdn
+            }
+        }
+    }'''
+    _APP_VERSION = '7.8.2'
+    _device_id = None
+    _session_id = None
     _access_token = None
+    _token_expiry = 0
+
+    @property
+    def _api_headers(self):
+        if (self._token_expiry - 120) <= time.time():
+            self.write_debug('Access token has expired; re-logging in')
+            self._perform_login(*self._get_login_info())
+        return {'Authorization': f'Bearer {self._access_token}'}
 
     def _real_initialize(self):
         if not self._access_token:
             self.raise_login_required(
                 'All videos are only available to registered users', method='password')
 
-    def _perform_login(self, username, password):
-        data = f'grant_type=password&username={urllib.parse.quote(username)}&password={urllib.parse.quote(password)}&scope=openid offline_access&client_id=0oa3e1nutA1HLzAKG356'
-        access_token = self._download_json(
-            'https://ids.mlb.com/oauth2/aus1m088yK07noBfh356/v1/token', None,
-            headers={
-                'User-Agent': 'okhttp/3.12.1',
-                'Content-Type': 'application/x-www-form-urlencoded',
-            }, data=data.encode())['access_token']
+    def _set_device_id(self, username):
+        if not self._device_id:
+            self._device_id = self.cache.load(
+                self._NETRC_MACHINE, 'device_ids', default={}).get(username)
+        if self._device_id:
+            return
+        self._device_id = str(uuid.uuid4())
+        self.cache.store(self._NETRC_MACHINE, 'device_ids', {username: self._device_id})
 
-        entitlement = self._download_webpage(
-            f'https://media-entitlement.mlb.com/api/v3/jwt?os=Android&appname=AtBat&did={uuid.uuid4()}', None,
-            headers={
-                'User-Agent': 'okhttp/3.12.1',
-                'Authorization': f'Bearer {access_token}',
-            })
+    def _perform_login(self, username, password):
+        try:
+            self._access_token = self._download_json(
+                'https://ids.mlb.com/oauth2/aus1m088yK07noBfh356/v1/token', None,
+                'Logging in', 'Unable to log in', headers={
+                    'User-Agent': 'okhttp/3.12.1',
+                    'Content-Type': 'application/x-www-form-urlencoded',
+                }, data=urlencode_postdata({
+                    'grant_type': 'password',
+                    'username': username,
+                    'password': password,
+                    'scope': 'openid offline_access',
+                    'client_id': '0oa3e1nutA1HLzAKG356',
+                }))['access_token']
+        except ExtractorError as error:
+            if isinstance(error.cause, HTTPError) and error.cause.status == 400:
+                raise ExtractorError('Invalid username or password', expected=True)
+            raise
+
+        self._token_expiry = traverse_obj(self._access_token, ({jwt_decode_hs256}, 'exp', {int})) or 0
+        self._set_device_id(username)
+
+        self._session_id = self._call_api({
+            'operationName': 'initSession',
+            'query': self._GRAPHQL_INIT_QUERY,
+            'variables': {
+                'device': {
+                    'appVersion': self._APP_VERSION,
+                    'deviceFamily': 'desktop',
+                    'knownDeviceId': self._device_id,
+                    'languagePreference': 'ENGLISH',
+                    'manufacturer': '',
+                    'model': '',
+                    'os': '',
+                    'osVersion': '',
+                },
+                'clientType': 'WEB',
+            },
+        }, None, 'session ID')['data']['initSession']['sessionId']
 
-        data = f'grant_type=urn:ietf:params:oauth:grant-type:token-exchange&subject_token={entitlement}&subject_token_type=urn:ietf:params:oauth:token-type:jwt&platform=android-tv'
-        self._access_token = self._download_json(
-            'https://us.edge.bamgrid.com/token', None,
+    def _call_api(self, data, video_id, description='GraphQL JSON', fatal=True):
+        return self._download_json(
+            'https://media-gateway.mlb.com/graphql', video_id,
+            f'Downloading {description}', f'Unable to download {description}', fatal=fatal,
             headers={
+                **self._api_headers,
                 'Accept': 'application/json',
-                'Authorization': 'Bearer bWxidHYmYW5kcm9pZCYxLjAuMA.6LZMbH2r--rbXcgEabaDdIslpo4RyZrlVfWZhsAgXIk',
-                'Content-Type': 'application/x-www-form-urlencoded',
-            }, data=data.encode())['access_token']
+                'Content-Type': 'application/json',
+                'x-client-name': 'WEB',
+                'x-client-version': self._APP_VERSION,
+            }, data=json.dumps(data, separators=(',', ':')).encode())
+
+    def _extract_formats_and_subtitles(self, broadcast, video_id):
+        feed = traverse_obj(broadcast, ('homeAway', {str.title}))
+        medium = traverse_obj(broadcast, ('type', {str}))
+        language = traverse_obj(broadcast, ('language', {str.lower}))
+        format_id = join_nonempty(feed, medium, language)
+
+        response = self._call_api({
+            'operationName': 'initPlaybackSession',
+            'query': self._GRAPHQL_PLAYBACK_QUERY,
+            'variables': {
+                'adCapabilities': ['GOOGLE_STANDALONE_AD_PODS'],
+                'deviceId': self._device_id,
+                'mediaId': broadcast['mediaId'],
+                'quality': 'PLACEHOLDER',
+                'sessionId': self._session_id,
+            },
+        }, video_id, f'{format_id} broadcast JSON', fatal=False)
+
+        playback = traverse_obj(response, ('data', 'initPlaybackSession', 'playback', {dict}))
+        m3u8_url = traverse_obj(playback, ('url', {url_or_none}))
+        token = traverse_obj(playback, ('token', {str}))
+
+        if not (m3u8_url and token):
+            errors = '; '.join(traverse_obj(response, ('errors', ..., 'message', {str})))
+            if 'not entitled' in errors:
+                raise ExtractorError(errors, expected=True)
+            elif errors:  # Only warn when 'blacked out' since radio formats are available
+                self.report_warning(f'API returned errors for {format_id}: {errors}')
+            else:
+                self.report_warning(f'No formats available for {format_id} broadcast; skipping')
+            return [], {}
+
+        cdn_headers = {'x-cdn-token': token}
+        fmts, subs = self._extract_m3u8_formats_and_subtitles(
+            m3u8_url.replace(f'/{token}/', '/'), video_id, 'mp4',
+            m3u8_id=format_id, fatal=False, headers=cdn_headers)
+        for fmt in fmts:
+            fmt['http_headers'] = cdn_headers
+            fmt.setdefault('format_note', join_nonempty(feed, medium, delim=' '))
+            fmt.setdefault('language', language)
+            if fmt.get('vcodec') == 'none' and fmt['language'] == 'en':
+                fmt['source_preference'] = 10
+
+        return fmts, subs
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        airings = self._download_json(
-            f'https://search-api-mlbtv.mlb.com/svc/search/v2/graphql/persisted/query/core/Airings?variables=%7B%22partnerProgramIds%22%3A%5B%22{video_id}%22%5D%2C%22applyEsniMediaRightsLabels%22%3Atrue%7D',
-            video_id)['data']['Airings']
+        metadata = traverse_obj(self._download_json(
+            'https://statsapi.mlb.com/api/v1/schedule', video_id, query={
+                'gamePk': video_id,
+                'hydrate': 'broadcasts(all),statusFlags',
+            }), ('dates', ..., 'games', lambda _, v: str(v['gamePk']) == video_id and v['broadcasts'], any))
+
+        broadcasts = traverse_obj(metadata, (
+            'broadcasts', lambda _, v: v['mediaId'] and v['mediaState']['mediaStateCode'] != 'MEDIA_OFF'))
 
         formats, subtitles = [], {}
-        for airing in traverse_obj(airings, lambda _, v: v['playbackUrls'][0]['href']):
-            format_id = join_nonempty('feedType', 'feedLanguage', from_dict=airing)
-            m3u8_url = traverse_obj(self._download_json(
-                airing['playbackUrls'][0]['href'].format(scenario='browser~csai'), video_id,
-                note=f'Downloading {format_id} stream info JSON',
-                errnote=f'Failed to download {format_id} stream info, skipping',
-                fatal=False, headers={
-                    'Authorization': self._access_token,
-                    'Accept': 'application/vnd.media-service+json; version=2',
-                }), ('stream', 'complete', {url_or_none}))
-            if not m3u8_url:
-                continue
-            f, s = self._extract_m3u8_formats_and_subtitles(
-                m3u8_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)
-            formats.extend(f)
-            self._merge_subtitles(s, target=subtitles)
+        for broadcast in broadcasts:
+            fmts, subs = self._extract_formats_and_subtitles(broadcast, video_id)
+            formats.extend(fmts)
+            self._merge_subtitles(subs, target=subtitles)
 
         return {
             'id': video_id,
-            'title': traverse_obj(airings, (..., 'titles', 0, 'episodeName'), get_all=False),
-            'is_live': traverse_obj(airings, (..., 'mediaConfig', 'productType'), get_all=False) == 'LIVE',
+            'title': join_nonempty(
+                traverse_obj(metadata, ('officialDate', {str})),
+                traverse_obj(metadata, ('teams', ('away', 'home'), 'team', 'name', {str}, all, {' @ '.join})),
+                delim=' - '),
+            'is_live': traverse_obj(broadcasts, (..., 'mediaState', 'mediaStateCode', {str}, any)) == 'MEDIA_ON',
+            'release_timestamp': traverse_obj(metadata, ('gameDate', {parse_iso8601})),
             'formats': formats,
             'subtitles': subtitles,
-            'http_headers': {'Authorization': f'Bearer {self._access_token}'},
         }
 
 
diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py
index aa1dcec..9d823a3 100644
--- a/yt_dlp/extractor/tiktok.py
+++ b/yt_dlp/extractor/tiktok.py
@@ -23,7 +23,6 @@ from ..utils import (
     mimetype2ext,
     parse_qs,
     qualities,
-    remove_start,
     srt_subtitles_timecode,
     str_or_none,
     traverse_obj,
@@ -254,7 +253,16 @@ class TikTokBaseIE(InfoExtractor):
 
     def _get_subtitles(self, aweme_detail, aweme_id, user_name):
         # TODO: Extract text positioning info
+
+        EXT_MAP = {  # From lowest to highest preference
+            'creator_caption': 'json',
+            'srt': 'srt',
+            'webvtt': 'vtt',
+        }
+        preference = qualities(tuple(EXT_MAP.values()))
+
         subtitles = {}
+
         # aweme/detail endpoint subs
         captions_info = traverse_obj(
             aweme_detail, ('interaction_stickers', ..., 'auto_video_caption_info', 'auto_captions', ...), expected_type=dict)
@@ -278,8 +286,8 @@ class TikTokBaseIE(InfoExtractor):
                 if not caption.get('url'):
                     continue
                 subtitles.setdefault(caption.get('lang') or 'en', []).append({
-                    'ext': remove_start(caption.get('caption_format'), 'web'),
                     'url': caption['url'],
+                    'ext': EXT_MAP.get(caption.get('Format')),
                 })
         # webpage subs
         if not subtitles:
@@ -288,9 +296,14 @@ class TikTokBaseIE(InfoExtractor):
                     self._create_url(user_name, aweme_id), aweme_id, fatal=False)
             for caption in traverse_obj(aweme_detail, ('video', 'subtitleInfos', lambda _, v: v['Url'])):
                 subtitles.setdefault(caption.get('LanguageCodeName') or 'en', []).append({
-                    'ext': remove_start(caption.get('Format'), 'web'),
                     'url': caption['Url'],
+                    'ext': EXT_MAP.get(caption.get('Format')),
                 })
+
+        # Deprioritize creator_caption json since it can't be embedded or used by media players
+        for lang, subs_list in subtitles.items():
+            subtitles[lang] = sorted(subs_list, key=lambda x: preference(x['ext']))
+
         return subtitles
 
     def _parse_url_key(self, url_key):
diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py
index 18eb084..d10689c 100644
--- a/yt_dlp/extractor/vimeo.py
+++ b/yt_dlp/extractor/vimeo.py
@@ -212,16 +212,6 @@ class VimeoBaseInfoExtractor(InfoExtractor):
         owner = video_data.get('owner') or {}
         video_uploader_url = owner.get('url')
 
-        duration = int_or_none(video_data.get('duration'))
-        chapter_data = try_get(config, lambda x: x['embed']['chapters']) or []
-        chapters = [{
-            'title': current_chapter.get('title'),
-            'start_time': current_chapter.get('timecode'),
-            'end_time': next_chapter.get('timecode'),
-        } for current_chapter, next_chapter in zip(chapter_data, chapter_data[1:] + [{'timecode': duration}])]
-        if chapters and chapters[0]['start_time']:  # Chapters may not start from 0
-            chapters[:0] = [{'title': '<Untitled>', 'start_time': 0, 'end_time': chapters[0]['start_time']}]
-
         return {
             'id': str_or_none(video_data.get('id')) or video_id,
             'title': video_title,
@@ -229,8 +219,12 @@ class VimeoBaseInfoExtractor(InfoExtractor):
             'uploader_id': video_uploader_url.split('/')[-1] if video_uploader_url else None,
             'uploader_url': video_uploader_url,
             'thumbnails': thumbnails,
-            'duration': duration,
-            'chapters': chapters or None,
+            'duration': int_or_none(video_data.get('duration')),
+            'chapters': sorted(traverse_obj(config, (
+                'embed', 'chapters', lambda _, v: int(v['timecode']) is not None, {
+                    'title': ('title', {str}),
+                    'start_time': ('timecode', {int_or_none}),
+                })), key=lambda c: c['start_time']) or None,
             'formats': formats,
             'subtitles': subtitles,
             'live_status': live_status,
@@ -709,6 +703,39 @@ class VimeoIE(VimeoBaseInfoExtractor):
             },
         },
         {
+            # chapters must be sorted, see: https://github.com/yt-dlp/yt-dlp/issues/5308
+            'url': 'https://player.vimeo.com/video/756714419',
+            'info_dict': {
+                'id': '756714419',
+                'ext': 'mp4',
+                'title': 'Dr Arielle Schwartz - Therapeutic yoga for optimum sleep',
+                'uploader': 'Alex Howard',
+                'uploader_id': 'user54729178',
+                'uploader_url': 'https://vimeo.com/user54729178',
+                'thumbnail': r're:https://i\.vimeocdn\.com/video/1520099929-[\da-f]+-d_1280',
+                'duration': 2636,
+                'chapters': [
+                    {'start_time': 0, 'end_time': 10, 'title': '<Untitled Chapter 1>'},
+                    {'start_time': 10, 'end_time': 106, 'title': 'Welcoming Dr Arielle Schwartz'},
+                    {'start_time': 106, 'end_time': 305, 'title': 'What is therapeutic yoga?'},
+                    {'start_time': 305, 'end_time': 594, 'title': 'Vagal toning practices'},
+                    {'start_time': 594, 'end_time': 888, 'title': 'Trauma and difficulty letting go'},
+                    {'start_time': 888, 'end_time': 1059, 'title': "Dr Schwartz' insomnia experience"},
+                    {'start_time': 1059, 'end_time': 1471, 'title': 'A strategy for helping sleep issues'},
+                    {'start_time': 1471, 'end_time': 1667, 'title': 'Yoga nidra'},
+                    {'start_time': 1667, 'end_time': 2121, 'title': 'Wisdom in stillness'},
+                    {'start_time': 2121, 'end_time': 2386, 'title': 'What helps us be more able to let go?'},
+                    {'start_time': 2386, 'end_time': 2510, 'title': 'Practical tips to help ourselves'},
+                    {'start_time': 2510, 'end_time': 2636, 'title': 'Where to find out more'},
+                ],
+            },
+            'params': {
+                'http_headers': {'Referer': 'https://sleepsuperconference.com'},
+                'skip_download': 'm3u8',
+            },
+            'expected_warnings': ['Failed to parse XML: not well-formed'],
+        },
+        {
             # user playlist alias -> https://vimeo.com/258705797
             'url': 'https://vimeo.com/user26785108/newspiritualguide',
             'only_matching': True,
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index 53aca38..7364e8a 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -3131,7 +3131,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 
     def _extract_n_function_name(self, jscode):
         funcname, idx = self._search_regex(
-            r'''(?x)(?:\.get\("n"\)\)&&\(b=|b=String\.fromCharCode\(110\),c=a\.get\(b\)\)&&\(c=)
+            r'''(?x)
+            (?:
+                \.get\("n"\)\)&&\(b=|
+                (?:
+                    b=String\.fromCharCode\(110\)|
+                    ([a-zA-Z0-9$.]+)&&\(b="nn"\[\+\1\]
+                ),c=a\.get\(b\)\)&&\(c=
+            )
             (?P<nfunc>[a-zA-Z0-9$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z0-9]\)''',
             jscode, 'Initial JS player n function name', group=('nfunc', 'idx'))
         if not idx:
diff --git a/yt_dlp/version.py b/yt_dlp/version.py
index db5b342..e641bf5 100644
--- a/yt_dlp/version.py
+++ b/yt_dlp/version.py
@@ -1,8 +1,8 @@
 # Autogenerated by devscripts/update-version.py
 
-__version__ = '2024.07.16'
+__version__ = '2024.07.25'
 
-RELEASE_GIT_HEAD = '89a161e8c62569a662deda1c948664152efcb6b4'
+RELEASE_GIT_HEAD = 'f0993391e6052ec8f7aacc286609564f226943b9'
 
 VARIANT = None
 
@@ -12,4 +12,4 @@ CHANNEL = 'stable'
 
 ORIGIN = 'yt-dlp/yt-dlp'
 
-_pkg_version = '2024.07.16'
+_pkg_version = '2024.07.25'
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-08-05 09:07:34 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-08-05 09:07:34 +0000
commit	2b676a78b5a70e28999bf4901184b45858136706 (patch)
tree	afa969209076a47b92a87c127b67e299a6e78c76
parent	Releasing progress-linux version 2024.07.16-1~progress7.99u1. (diff)
download	yt-dlp-2b676a78b5a70e28999bf4901184b45858136706.tar.xz yt-dlp-2b676a78b5a70e28999bf4901184b45858136706.zip