summaryrefslogtreecommitdiffstats
path: root/yt_dlp/postprocessor/metadataparser.py
blob: 1d6054294eaaa4d80374950e099c83f191631768 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import re

from .common import PostProcessor
from ..utils import Namespace, filter_dict, function_with_repr


class MetadataParserPP(PostProcessor):
    def __init__(self, downloader, actions):
        super().__init__(downloader)
        self._actions = []
        for f in actions:
            action, *args = f
            assert action in self.Actions
            self._actions.append(action(self, *args))

    @classmethod
    def validate_action(cls, action, *data):
        """Each action can be:
                (Actions.INTERPRET, from, to) OR
                (Actions.REPLACE, field, search, replace)
        """
        if action not in cls.Actions:
            raise ValueError(f'{action!r} is not a valid action')
        action(cls, *data)  # So this can raise error to validate

    @staticmethod
    def field_to_template(tmpl):
        if re.match(r'[a-zA-Z_]+$', tmpl):
            return f'%({tmpl})s'

        from ..YoutubeDL import YoutubeDL
        err = YoutubeDL.validate_outtmpl(tmpl)
        if err:
            raise err
        return tmpl

    @staticmethod
    def format_to_regex(fmt):
        r"""
        Converts a string like
           '%(title)s - %(artist)s'
        to a regex like
           '(?P<title>.+)\ \-\ (?P<artist>.+)'
        """
        if not re.search(r'%\(\w+\)s', fmt):
            return fmt
        lastpos = 0
        regex = ''
        # replace %(..)s with regex group and escape other string parts
        for match in re.finditer(r'%\((\w+)\)s', fmt):
            regex += re.escape(fmt[lastpos:match.start()])
            regex += rf'(?P<{match.group(1)}>.+)'
            lastpos = match.end()
        if lastpos < len(fmt):
            regex += re.escape(fmt[lastpos:])
        return regex

    def run(self, info):
        for f in self._actions:
            f(info)
        return [], info

    @function_with_repr
    def interpretter(self, inp, out):
        def f(info):
            data_to_parse = self._downloader.evaluate_outtmpl(template, info)
            self.write_debug(f'Searching for {out_re.pattern!r} in {template!r}')
            match = out_re.search(data_to_parse)
            if match is None:
                self.to_screen(f'Could not interpret {inp!r} as {out!r}')
                return
            for attribute, value in filter_dict(match.groupdict()).items():
                info[attribute] = value
                self.to_screen(f'Parsed {attribute} from {template!r}: {value!r}')

        template = self.field_to_template(inp)
        out_re = re.compile(self.format_to_regex(out))
        return f

    @function_with_repr
    def replacer(self, field, search, replace):
        def f(info):
            val = info.get(field)
            if val is None:
                self.to_screen(f'Video does not have a {field}')
                return
            elif not isinstance(val, str):
                self.report_warning(f'Cannot replace in field {field} since it is a {type(val).__name__}')
                return
            self.write_debug(f'Replacing all {search!r} in {field} with {replace!r}')
            info[field], n = search_re.subn(replace, val)
            if n:
                self.to_screen(f'Changed {field} to: {info[field]}')
            else:
                self.to_screen(f'Did not find {search!r} in {field}')

        search_re = re.compile(search)
        return f

    Actions = Namespace(INTERPRET=interpretter, REPLACE=replacer)


class MetadataFromFieldPP(MetadataParserPP):
    @classmethod
    def to_action(cls, f):
        match = re.match(r'(?s)(?P<in>.*?)(?<!\\):(?P<out>.+)$', f)
        if match is None:
            raise ValueError(f'it should be FROM:TO, not {f!r}')
        return (
            cls.Actions.INTERPRET,
            match.group('in').replace('\\:', ':'),
            match.group('out'),
        )

    def __init__(self, downloader, formats):
        super().__init__(downloader, [self.to_action(f) for f in formats])


# Deprecated
class MetadataFromTitlePP(MetadataParserPP):
    def __init__(self, downloader, titleformat):
        super().__init__(downloader, [(self.Actions.INTERPRET, 'title', titleformat)])
        self.deprecation_warning(
            'yt_dlp.postprocessor.MetadataFromTitlePP is deprecated '
            'and may be removed in a future version. Use yt_dlp.postprocessor.MetadataFromFieldPP instead')