diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-08-05 09:06:32 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-08-05 09:06:32 +0000 |
commit | 58349d8104500263d23cb33e8650919d8a40d90a (patch) | |
tree | ecccf4e2ff9c1d1e2b08c94c38a36c52dc23b88a /yt_dlp/extractor | |
parent | Adding debian version 2024.07.01-1. (diff) | |
download | yt-dlp-58349d8104500263d23cb33e8650919d8a40d90a.tar.xz yt-dlp-58349d8104500263d23cb33e8650919d8a40d90a.zip |
Merging upstream version 2024.07.02.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'yt_dlp/extractor')
-rw-r--r-- | yt_dlp/extractor/banbye.py | 71 |
-rw-r--r-- | yt_dlp/extractor/murrtube.py | 131 |
-rw-r--r-- | yt_dlp/extractor/zaiko.py | 18 |
3 files changed, 142 insertions, 78 deletions
diff --git a/yt_dlp/extractor/banbye.py b/yt_dlp/extractor/banbye.py index d10bdf8..148a170 100644 --- a/yt_dlp/extractor/banbye.py +++ b/yt_dlp/extractor/banbye.py @@ -4,9 +4,13 @@ import urllib.parse from .common import InfoExtractor from ..utils import ( InAdvancePagedList, + determine_ext, format_field, + int_or_none, + join_nonempty, traverse_obj, unified_timestamp, + url_or_none, ) @@ -30,6 +34,7 @@ class BanByeBaseIE(InfoExtractor): class BanByeIE(BanByeBaseIE): _VALID_URL = r'https?://(?:www\.)?banbye\.com/(?:en/)?watch/(?P<id>[\w-]+)' _TESTS = [{ + # ['src']['mp4']['levels'] direct mp4 urls only 'url': 'https://banbye.com/watch/v_ytfmvkVYLE8T', 'md5': '2f4ea15c5ca259a73d909b2cfd558eb5', 'info_dict': { @@ -58,6 +63,7 @@ class BanByeIE(BanByeBaseIE): }, 'playlist_mincount': 9, }, { + # ['src']['mp4']['levels'] direct mp4 urls only 'url': 'https://banbye.com/watch/v_kb6_o1Kyq-CD', 'info_dict': { 'id': 'v_kb6_o1Kyq-CD', @@ -77,6 +83,48 @@ class BanByeIE(BanByeBaseIE): 'view_count': int, 'comment_count': int, }, + }, { + # ['src']['hls']['levels'] variant m3u8 urls only; master m3u8 is 404 + 'url': 'https://banbye.com/watch/v_a_gPFuC9LoW5', + 'info_dict': { + 'id': 'v_a_gPFuC9LoW5', + 'ext': 'mp4', + 'title': 'md5:183524056bebdfa245fd6d214f63c0fe', + 'description': 'md5:943ac87287ca98d28d8b8797719827c6', + 'uploader': 'wRealu24', + 'channel_id': 'ch_wrealu24', + 'channel_url': 'https://banbye.com/channel/ch_wrealu24', + 'upload_date': '20231113', + 'timestamp': 1699874062, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'thumbnail': 'https://cdn.banbye.com/video/v_a_gPFuC9LoW5/96.webp', + 'tags': ['jaszczur', 'sejm', 'lewica', 'polska', 'ukrainizacja', 'pierwszeposiedzeniesejmu'], + }, + 'expected_warnings': ['Failed to download m3u8'], + }, { + # ['src']['hls']['masterPlaylist'] m3u8 only + 'url': 'https://banbye.com/watch/v_B0rsKWsr-aaa', + 'info_dict': { + 'id': 'v_B0rsKWsr-aaa', + 'ext': 'mp4', + 'title': 
'md5:00b254164b82101b3f9e5326037447ed', + 'description': 'md5:3fd8b48aa81954ba024bc60f5de6e167', + 'uploader': 'PSTV Piotr Szlachtowicz ', + 'channel_id': 'ch_KV9EVObkB9wB', + 'channel_url': 'https://banbye.com/channel/ch_KV9EVObkB9wB', + 'upload_date': '20240629', + 'timestamp': 1719646816, + 'duration': 2377, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'thumbnail': 'https://cdn.banbye.com/video/v_B0rsKWsr-aaa/96.webp', + 'tags': ['Biden', 'Trump', 'Wybory', 'USA'], + }, }] def _real_extract(self, url): @@ -91,11 +139,24 @@ class BanByeIE(BanByeBaseIE): 'id': f'{quality}p', 'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.webp', } for quality in [48, 96, 144, 240, 512, 1080]] - formats = [{ - 'format_id': f'http-{quality}p', - 'quality': quality, - 'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.mp4', - } for quality in data['quality']] + + formats = [] + url_data = self._download_json(f'{self._API_BASE}/videos/{video_id}/url', video_id, data=b'') + if master_url := traverse_obj(url_data, ('src', 'hls', 'masterPlaylist', {url_or_none})): + formats = self._extract_m3u8_formats(master_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + + for format_id, format_url in traverse_obj(url_data, ( + 'src', ('mp4', 'hls'), 'levels', {dict.items}, lambda _, v: url_or_none(v[1]))): + ext = determine_ext(format_url) + is_hls = ext == 'm3u8' + formats.append({ + 'url': format_url, + 'ext': 'mp4' if is_hls else ext, + 'format_id': join_nonempty(is_hls and 'hls', format_id), + 'protocol': 'm3u8_native' if is_hls else 'https', + 'height': int_or_none(format_id), + }) + self._remove_duplicate_formats(formats) return { 'id': video_id, diff --git a/yt_dlp/extractor/murrtube.py b/yt_dlp/extractor/murrtube.py index 3b39a1b..9067b87 100644 --- a/yt_dlp/extractor/murrtube.py +++ b/yt_dlp/extractor/murrtube.py @@ -5,114 +5,111 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, OnDemandPagedList, - 
determine_ext, - int_or_none, - try_get, + clean_html, + extract_attributes, + get_element_by_class, + get_element_html_by_id, + parse_count, + remove_end, + update_url, + urlencode_postdata, ) class MurrtubeIE(InfoExtractor): - _WORKING = False _VALID_URL = r'''(?x) (?: murrtube:| - https?://murrtube\.net/videos/(?P<slug>[a-z0-9\-]+)\- + https?://murrtube\.net/(?:v/|videos/(?P<slug>[a-z0-9-]+?)-) ) - (?P<id>[a-f0-9]{8}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{12}) + (?P<id>[A-Z0-9]{4}|[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}) ''' - _TEST = { + _TESTS = [{ 'url': 'https://murrtube.net/videos/inferno-x-skyler-148b6f2a-fdcc-4902-affe-9c0f41aaaca0', - 'md5': '169f494812d9a90914b42978e73aa690', + 'md5': '70380878a77e8565d4aea7f68b8bbb35', 'info_dict': { - 'id': '148b6f2a-fdcc-4902-affe-9c0f41aaaca0', + 'id': 'ca885d8456b95de529b6723b158032e11115d', 'ext': 'mp4', 'title': 'Inferno X Skyler', 'description': 'Humping a very good slutty sheppy (roomate)', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 284, 'uploader': 'Inferno Wolf', 'age_limit': 18, + 'thumbnail': 'https://storage.murrtube.net/murrtube-production/ekbs3zcfvuynnqfx72nn2tkokvsd', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + }, + }, { + 'url': 'https://murrtube.net/v/0J2Q', + 'md5': '31262f6ac56f0ca75e5a54a0f3fefcb6', + 'info_dict': { + 'id': '8442998c52134968d9caa36e473e1a6bac6ca', + 'ext': 'mp4', + 'uploader': 'Hayel', + 'title': 'Who\'s in charge now?', + 'description': 'md5:795791e97e5b0f1805ea84573f02a997', + 'age_limit': 18, + 'thumbnail': 'https://storage.murrtube.net/murrtube-production/fb1ojjwiucufp34ya6hxu5vfqi5s', 'comment_count': int, 'view_count': int, 'like_count': int, - 'tags': ['hump', 'breed', 'Fursuit', 'murrsuit', 'bareback'], }, - } + }] - def _download_gql(self, video_id, op, note=None, fatal=True): - result = self._download_json( - 'https://murrtube.net/graphql', - video_id, note, data=json.dumps(op).encode(), fatal=fatal, - 
headers={'Content-Type': 'application/json'}) - return result['data'] + def _extract_count(self, name, html): + return parse_count(self._search_regex( + rf'([\d,]+)\s+<span[^>]*>{name}</span>', html, name, default=None)) + + def _real_initialize(self): + homepage = self._download_webpage( + 'https://murrtube.net', None, note='Getting session token') + self._request_webpage( + 'https://murrtube.net/accept_age_check', None, 'Setting age cookie', + data=urlencode_postdata(self._hidden_inputs(homepage))) def _real_extract(self, url): video_id = self._match_id(url) - data = self._download_gql(video_id, { - 'operationName': 'Medium', - 'variables': { - 'id': video_id, - }, - 'query': '''\ -query Medium($id: ID!) { - medium(id: $id) { - title - description - key - duration - commentsCount - likesCount - viewsCount - thumbnailKey - tagList - user { - name - __typename - } - __typename - } -}'''}) - meta = data['medium'] - - storage_url = 'https://storage.murrtube.net/murrtube/' - format_url = storage_url + meta.get('key', '') - thumbnail = storage_url + meta.get('thumbnailKey', '') - - if determine_ext(format_url) == 'm3u8': - formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', fatal=False) - else: - formats = [{'url': format_url}] + if video_id.startswith('murrtube:'): + raise ExtractorError('Support for murrtube: prefix URLs is broken') + video_page = self._download_webpage(url, video_id) + video_attrs = extract_attributes(get_element_html_by_id('video', video_page)) + playlist = update_url(video_attrs['data-url'], query=None) + video_id = self._search_regex(r'/([\da-f]+)/index.m3u8', playlist, 'video id') return { 'id': video_id, - 'title': meta.get('title'), - 'description': meta.get('description'), - 'formats': formats, - 'thumbnail': thumbnail, - 'duration': int_or_none(meta.get('duration')), - 'uploader': try_get(meta, lambda x: x['user']['name']), - 'view_count': meta.get('viewsCount'), - 'like_count': 
meta.get('likesCount'), - 'comment_count': meta.get('commentsCount'), - 'tags': meta.get('tagList'), + 'title': remove_end(self._og_search_title(video_page), ' - Murrtube'), 'age_limit': 18, + 'formats': self._extract_m3u8_formats(playlist, video_id, 'mp4'), + 'description': self._og_search_description(video_page), + 'thumbnail': update_url(self._og_search_thumbnail(video_page, default=''), query=None) or None, + 'uploader': clean_html(get_element_by_class('pl-1 is-size-6 has-text-lighter', video_page)), + 'view_count': self._extract_count('Views', video_page), + 'like_count': self._extract_count('Likes', video_page), + 'comment_count': self._extract_count('Comments', video_page), } -class MurrtubeUserIE(MurrtubeIE): # XXX: Do not subclass from concrete IE +class MurrtubeUserIE(InfoExtractor): _WORKING = False IE_DESC = 'Murrtube user profile' _VALID_URL = r'https?://murrtube\.net/(?P<id>[^/]+)$' - _TEST = { + _TESTS = [{ 'url': 'https://murrtube.net/stormy', 'info_dict': { 'id': 'stormy', }, 'playlist_mincount': 27, - } + }] _PAGE_SIZE = 10 + def _download_gql(self, video_id, op, note=None, fatal=True): + result = self._download_json( + 'https://murrtube.net/graphql', + video_id, note, data=json.dumps(op).encode(), fatal=fatal, + headers={'Content-Type': 'application/json'}) + return result['data'] + def _fetch_page(self, username, user_id, page): data = self._download_gql(username, { 'operationName': 'Media', diff --git a/yt_dlp/extractor/zaiko.py b/yt_dlp/extractor/zaiko.py index c8c4ec0..4563b7b 100644 --- a/yt_dlp/extractor/zaiko.py +++ b/yt_dlp/extractor/zaiko.py @@ -66,7 +66,9 @@ class ZaikoIE(ZaikoBaseIE): stream_meta['stream-access']['video_source'], video_id, 'Downloading player page', headers={'referer': 'https://zaiko.io/'}) player_meta = self._parse_vue_element_attr('player', player_page, video_id) - status = traverse_obj(player_meta, ('initial_event_info', 'status', {str})) + initial_event_info = traverse_obj(player_meta, ('initial_event_info', 
{dict})) or {} + + status = traverse_obj(initial_event_info, ('status', {str})) live_status, msg, expected = { 'vod': ('was_live', 'No VOD stream URL was found', False), 'archiving': ('post_live', 'Event VOD is still being processed', True), @@ -80,14 +82,20 @@ class ZaikoIE(ZaikoBaseIE): 'cancelled': ('not_live', 'Event has been cancelled', True), }.get(status) or ('not_live', f'Unknown event status "{status}"', False) - stream_url = traverse_obj(player_meta, ('initial_event_info', 'endpoint', {url_or_none})) + if traverse_obj(initial_event_info, ('is_jwt_protected', {bool})): + stream_url = self._download_json( + initial_event_info['jwt_token_url'], video_id, 'Downloading JWT-protected stream URL', + 'Failed to download JWT-protected stream URL')['playback_url'] + else: + stream_url = traverse_obj(initial_event_info, ('endpoint', {url_or_none})) + formats = self._extract_m3u8_formats( stream_url, video_id, live=True, fatal=False) if stream_url else [] if not formats: self.raise_no_formats(msg, expected=expected) thumbnail_urls = [ - traverse_obj(player_meta, ('initial_event_info', 'poster_url')), + traverse_obj(initial_event_info, ('poster_url', {url_or_none})), self._og_search_thumbnail(self._download_webpage( f'https://zaiko.io/event/{video_id}', video_id, 'Downloading event page', fatal=False) or ''), ] @@ -103,9 +111,7 @@ class ZaikoIE(ZaikoBaseIE): 'release_timestamp': ('stream', 'start', 'timestamp', {int_or_none}), 'categories': ('event', 'genres', ..., {lambda x: x or None}), }), - **traverse_obj(player_meta, ('initial_event_info', { - 'alt_title': ('title', {str}), - })), + 'alt_title': traverse_obj(initial_event_info, ('title', {str})), 'thumbnails': [{'url': url, 'id': url_basename(url)} for url in thumbnail_urls if url_or_none(url)], } |