yt_dlp/extractor/pladform.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    determine_ext,
    int_or_none,
    parse_qs,
    qualities,
    xpath_text,
)


class PladformIE(InfoExtractor):
    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            (?:
                                out\.pladform\.ru/player|
                                static\.pladform\.ru/player\.swf
                            )
                            \?.*\bvideoid=|
                            video\.pladform\.ru/catalog/video/videoid/
                        )
                        (?P<id>\d+)
                    '''
    _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1']
    _TESTS = [{
        'url': 'http://out.pladform.ru/player?pl=18079&type=html5&videoid=100231282',
        'info_dict': {
            'id': '6216d548e755edae6e8280667d774791',
            'ext': 'mp4',
            'timestamp': 1406117012,
            'title': 'Гарик Мартиросян и Гарик Харламов - Кастинг на концерт ко Дню милиции',
            'age_limit': 0,
            'upload_date': '20140723',
            'thumbnail': str,
            'view_count': int,
            'description': str,
            'uploader_id': '12082',
            'uploader': 'Comedy Club',
            'duration': 367,
        },
        'expected_warnings': ['HTTP Error 404: Not Found']
    }, {
        'url': 'https://out.pladform.ru/player?pl=64471&videoid=3777899&vk_puid15=0&vk_puid34=0',
        'md5': '53362fac3a27352da20fa2803cc5cd6f',
        'info_dict': {
            'id': '3777899',
            'ext': 'mp4',
            'title': 'СТУДИЯ СОЮЗ • Шоу Студия Союз, 24 выпуск (01.02.2018) Нурлан Сабуров и Слава Комиссаренко',
            'description': 'md5:05140e8bf1b7e2d46e7ba140be57fd95',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 3190,
        },
    }, {
        'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0',
        'only_matching': True,
    }, {
        'url': 'http://video.pladform.ru/catalog/video/videoid/100183293/vkcid/0',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        qs = parse_qs(url)
        pl = qs.get('pl', ['1'])[0]

        video = self._download_xml(
            'http://out.pladform.ru/getVideo', video_id, query={
                'pl': pl,
                'videoid': video_id,
            }, fatal=False)

        def fail(text):
            raise ExtractorError(
                '%s returned error: %s' % (self.IE_NAME, text),
                expected=True)

        if not video:
            targetUrl = self._request_webpage(url, video_id, note='Resolving final URL').url
            if targetUrl == url:
                raise ExtractorError('Can\'t parse page')
            return self.url_result(targetUrl)

        if video.tag == 'error':
            fail(video.text)

        quality = qualities(('ld', 'sd', 'hd'))

        formats = []
        for src in video.findall('./src'):
            if src is None:
                continue
            format_url = src.text
            if not format_url:
                continue
            if src.get('type') == 'hls' or determine_ext(format_url) == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False))
            else:
                formats.append({
                    'url': src.text,
                    'format_id': src.get('quality'),
                    'quality': quality(src.get('quality')),
                })

        if not formats:
            error = xpath_text(video, './cap', 'error', default=None)
            if error:
                fail(error)

        webpage = self._download_webpage(
            'http://video.pladform.ru/catalog/video/videoid/%s' % video_id,
            video_id)

        title = self._og_search_title(webpage, fatal=False) or xpath_text(
            video, './/title', 'title', fatal=True)
        description = self._search_regex(
            r'</h3>\s*<p>([^<]+)</p>', webpage, 'description', fatal=False)
        thumbnail = self._og_search_thumbnail(webpage) or xpath_text(
            video, './/cover', 'cover')

        duration = int_or_none(xpath_text(video, './/time', 'duration'))
        age_limit = int_or_none(xpath_text(video, './/age18', 'age limit'))

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'age_limit': age_limit,
            'formats': formats,
        }