diff options
Diffstat (limited to 'yt_dlp/extractor/tiktok.py')
-rw-r--r-- | yt_dlp/extractor/tiktok.py | 89 |
1 files changed, 53 insertions, 36 deletions
diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 7bcfded..c3505b1 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -5,10 +5,10 @@ import random import re import string import time +import urllib.parse import uuid from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse from ..networking import HEADRequest from ..utils import ( ExtractorError, @@ -30,6 +30,7 @@ from ..utils import ( try_call, try_get, url_or_none, + urlencode_postdata, ) @@ -43,8 +44,8 @@ class TikTokBaseIE(InfoExtractor): 'iid': None, # TikTok (KR/PH/TW/TH/VN) = trill, TikTok (rest of world) = musical_ly, Douyin = aweme 'app_name': 'musical_ly', - 'app_version': '34.1.2', - 'manifest_app_version': '2023401020', + 'app_version': '35.1.3', + 'manifest_app_version': '2023501030', # "app id": aweme = 1128, trill = 1180, musical_ly = 1233, universal = 0 'aid': '0', } @@ -114,18 +115,19 @@ class TikTokBaseIE(InfoExtractor): 'universal data', display_id, end_pattern=r'</script>', default={}), ('__DEFAULT_SCOPE__', {dict})) or {} - def _call_api_impl(self, ep, query, video_id, fatal=True, + def _call_api_impl(self, ep, video_id, query=None, data=None, headers=None, fatal=True, note='Downloading API JSON', errnote='Unable to download API page'): self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choices('0123456789abcdef', k=160))) webpage_cookies = self._get_cookies(self._WEBPAGE_HOST) if webpage_cookies.get('sid_tt'): self._set_cookie(self._API_HOSTNAME, 'sid_tt', webpage_cookies['sid_tt'].value) return self._download_json( - 'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id, + f'https://{self._API_HOSTNAME}/aweme/v1/{ep}/', video_id=video_id, fatal=fatal, note=note, errnote=errnote, headers={ 'User-Agent': self._APP_USER_AGENT, 'Accept': 'application/json', - }, query=query) + **(headers or {}), + }, query=query, data=data) def _build_api_query(self, query): return filter_dict({ @@ -138,7 +140,7 @@ class TikTokBaseIE(InfoExtractor): 'channel': 'googleplay', 'aid': self._APP_INFO['aid'], 'app_name': self._APP_INFO['app_name'], - 'version_code': ''.join((f'{int(v):02d}' for v in self._APP_INFO['app_version'].split('.'))), + 'version_code': ''.join(f'{int(v):02d}' for v in self._APP_INFO['app_version'].split('.')), 'version_name': self._APP_INFO['app_version'], 'manifest_version_code': self._APP_INFO['manifest_app_version'], 'update_version_code': self._APP_INFO['manifest_app_version'], @@ -174,7 +176,7 @@ class TikTokBaseIE(InfoExtractor): 'openudid': ''.join(random.choices('0123456789abcdef', k=16)), }) - def _call_api(self, ep, query, video_id, fatal=True, + def _call_api(self, ep, video_id, query=None, data=None, headers=None, fatal=True, note='Downloading API JSON', errnote='Unable to download API page'): if not self._APP_INFO and not self._get_next_app_info(): message = 'No working app info is available' @@ -187,9 +189,11 @@ class TikTokBaseIE(InfoExtractor): max_tries = len(self._APP_INFO_POOL) + 1 # _APP_INFO_POOL + _APP_INFO for count in itertools.count(1): self.write_debug(str(self._APP_INFO)) - real_query = self._build_api_query(query) + real_query = self._build_api_query(query or {}) try: - return self._call_api_impl(ep, real_query, video_id, fatal, note, errnote) + return self._call_api_impl( + ep, video_id, query=real_query, data=data, headers=headers, + fatal=fatal, note=note, errnote=errnote) except ExtractorError as e: if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0: message = str(e.cause or e.msg) @@ -204,17 +208,29 @@ class TikTokBaseIE(InfoExtractor): raise def _extract_aweme_app(self, aweme_id): - feed_list = self._call_api( - 'feed', {'aweme_id': aweme_id}, aweme_id, note='Downloading video feed', - errnote='Unable to download video feed').get('aweme_list') or [] - aweme_detail = next((aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id), None) + aweme_detail = traverse_obj( + self._call_api('multi/aweme/detail', aweme_id, data=urlencode_postdata({ + 'aweme_ids': f'[{aweme_id}]', + 'request_source': '0', + }), headers={'X-Argus': ''}), ('aweme_details', 0, {dict})) if not aweme_detail: - raise ExtractorError('Unable to find video in feed', video_id=aweme_id) + raise ExtractorError('Unable to extract aweme detail info', video_id=aweme_id) return self._parse_aweme_video_app(aweme_detail) def _extract_web_data_and_status(self, url, video_id, fatal=True): - webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'Mozilla/5.0'}, fatal=fatal) or '' - video_data, status = {}, None + video_data, status = {}, -1 + + res = self._download_webpage_handle(url, video_id, fatal=fatal, headers={'User-Agent': 'Mozilla/5.0'}) + if res is False: + return video_data, status + + webpage, urlh = res + if urllib.parse.urlparse(urlh.url).path == '/login': + message = 'TikTok is requiring login for access to this content' + if fatal: + self.raise_login_required(message) + self.report_warning(f'{message}. {self._login_hint()}') + return video_data, status if universal_data := self._get_universal_data(webpage, video_id): self.write_debug('Found universal data for rehydration') @@ -254,7 +270,7 @@ class TikTokBaseIE(InfoExtractor): 'ext': 'srt', 'data': '\n\n'.join( f'{i + 1}\n{srt_subtitles_timecode(line["start_time"] / 1000)} --> {srt_subtitles_timecode(line["end_time"] / 1000)}\n{line["text"]}' - for i, line in enumerate(caption_json['utterances']) if line.get('text')) + for i, line in enumerate(caption_json['utterances']) if line.get('text')), }) # feed endpoint subs if not subtitles: @@ -382,7 +398,7 @@ class TikTokBaseIE(InfoExtractor): auth_cookie = self._get_cookies(self._WEBPAGE_HOST).get('sid_tt') if auth_cookie: for f in formats: - self._set_cookie(compat_urllib_parse_urlparse(f['url']).hostname, 'sid_tt', auth_cookie.value) + self._set_cookie(urllib.parse.urlparse(f['url']).hostname, 'sid_tt', auth_cookie.value) thumbnails = [] for cover_id in ('cover', 'ai_dynamic_cover', 'animated_cover', 'ai_dynamic_cover_bak', @@ -402,7 +418,7 @@ class TikTokBaseIE(InfoExtractor): contained_music_author = traverse_obj( music_info, ('matched_song', 'author'), ('matched_pgc_sound', 'author'), 'author', expected_type=str) - is_generic_og_trackname = music_info.get('is_original_sound') and music_info.get('title') == 'original sound - %s' % music_info.get('owner_handle') + is_generic_og_trackname = music_info.get('is_original_sound') and music_info.get('title') == 'original sound - {}'.format(music_info.get('owner_handle')) if is_generic_og_trackname: music_track, music_author = contained_music_track or 'original sound', contained_music_author else: @@ -792,7 +808,7 @@ class TikTokIE(TikTokBaseIE): 'expected_warnings': ['Unable to find video in feed'], }, { # 1080p format - 'url': 'https://www.tiktok.com/@tatemcrae/video/7107337212743830830', # FIXME + 'url': 'https://www.tiktok.com/@tatemcrae/video/7107337212743830830', # FIXME: Web can only get audio 'md5': '982512017a8a917124d5a08c8ae79621', 'info_dict': { 'id': '7107337212743830830', @@ -846,7 +862,7 @@ class TikTokIE(TikTokBaseIE): }, { # Auto-captions available 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758', - 'only_matching': True + 'only_matching': True, }] def _real_extract(self, url): @@ -1026,7 +1042,8 @@ class TikTokBaseListIE(TikTokBaseIE): # XXX: Conventionally, base classes shoul for retry in self.RetryManager(): try: post_list = self._call_api( - self._API_ENDPOINT, query, display_id, note=f'Downloading video list page {page}', + self._API_ENDPOINT, display_id, query=query, + note=f'Downloading video list page {page}', errnote='Unable to download video list') except ExtractorError as e: if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0: @@ -1059,17 +1076,17 @@ class TikTokSoundIE(TikTokBaseListIE): 'url': 'https://www.tiktok.com/music/Build-a-Btch-6956990112127585029?lang=en', 'playlist_mincount': 100, 'info_dict': { - 'id': '6956990112127585029' + 'id': '6956990112127585029', }, - 'expected_warnings': ['Retrying'] + 'expected_warnings': ['Retrying'], }, { # Actual entries are less than listed video count 'url': 'https://www.tiktok.com/music/jiefei-soap-remix-7036843036118469381', 'playlist_mincount': 2182, 'info_dict': { - 'id': '7036843036118469381' + 'id': '7036843036118469381', }, - 'expected_warnings': ['Retrying'] + 'expected_warnings': ['Retrying'], }] @@ -1085,11 +1102,11 @@ class TikTokEffectIE(TikTokBaseListIE): 'info_dict': { 'id': '1258156', }, - 'expected_warnings': ['Retrying'] + 'expected_warnings': ['Retrying'], }, { # Different entries between mobile and web, depending on region 'url': 'https://www.tiktok.com/sticker/Elf-Friend-479565', - 'only_matching': True + 'only_matching': True, }] @@ -1106,16 +1123,16 @@ class TikTokTagIE(TikTokBaseListIE): 'id': '46294678', 'title': 'hello2018', }, - 'expected_warnings': ['Retrying'] + 'expected_warnings': ['Retrying'], }, { 'url': 'https://tiktok.com/tag/fypシ?is_copy_url=0&is_from_webapp=v1', - 'only_matching': True + 'only_matching': True, }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id, headers={ - 'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)' + 'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)', }) tag_id = self._html_search_regex(r'snssdk\d*://challenge/detail/(\d+)', webpage, 'tag ID') return self.playlist_result(self._entries(tag_id, display_id), tag_id, display_id) @@ -1129,17 +1146,17 @@ class TikTokCollectionIE(TikTokBaseIE): 'url': 'https://www.tiktok.com/@imanoreotwe/collection/count-test-7371330159376370462', 'info_dict': { 'id': '7371330159376370462', - 'title': 'imanoreotwe-count-test' + 'title': 'imanoreotwe-count-test', }, - 'playlist_count': 9 + 'playlist_count': 9, }, { # tests returning multiple pages of a large collection 'url': 'https://www.tiktok.com/@imanoreotwe/collection/%F0%9F%98%82-7111887189571160875', 'info_dict': { 'id': '7111887189571160875', - 'title': 'imanoreotwe-%F0%9F%98%82' + 'title': 'imanoreotwe-%F0%9F%98%82', }, - 'playlist_mincount': 100 + 'playlist_mincount': 100, }] _API_BASE_URL = 'https://www.tiktok.com/api/collection/item_list/' _PAGE_COUNT = 30 |