# doc/misc/parsegrammar.py

############################################################################
# Copyright (C) Internet Systems Consortium, Inc. ("ISC")
#
# SPDX-License-Identifier: MPL-2.0
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, you can obtain one at https://mozilla.org/MPL/2.0/.
#
# See the COPYRIGHT file distributed with this work for additional
# information regarding copyright ownership.
############################################################################

"""
Read ISC config grammar description produced by "cfg_test --grammar",
transform it into JSON, and print it to stdout.

Beware: This parser is pretty dumb and heavily depends on cfg_test output
format. See parse_mapbody() for more details.

Maps are recursively parsed into sub-dicts, all other elements (lists etc.)
are left intact and returned as one string.

An output example for the named.conf grammar follows, showing three variants.
Keys "_flags" and "_id" are present only if non-empty. Key "_grammar" denotes
an end node, key "_mapbody" denotes a nested map.

{
    "acl": {
        "_flags": [
            "may occur multiple times"
        ],
        "_grammar": "<string> { <address_match_element>; ... }"
    },
    "http": {
        "_flags": [
            "may occur multiple times"
        ],
        "_id": "<string>",
        "_mapbody": {
            "endpoints": {
                "_grammar": "{ <quoted_string>; ... }"
            },
            "streams-per-connection": {
                "_grammar": "<integer>"
            }
        }
    },
    "options": {
        "_mapbody": {
            "rate-limit": {
                "_mapbody": {
                    "all-per-second": {
                        "_grammar": "<integer>"
                    }
                }
            }
        }
    }
}
"""
import fileinput
import json
import re

# Flag strings recognized in trailing comments. parse_flags() looks for each
# of them as a substring of the comment text and reports matches in this order.
FLAGS = [
    "may occur multiple times",
    "obsolete",
    "deprecated",
    "experimental",
    "test only",
]

# Pattern a map key must match in full (it is applied with fullmatch() in
# parse_mapbody()): ASCII letters, digits and hyphens only.
KEY_REGEX = re.compile("[a-zA-Z0-9-]+")


def split_comments(line):
    """Split a line at the start of its comment.

    Supports only #, //, and /* comments which end at the end of line.
    It does NOT handle:
    - quoted strings
    - /* comments which do not end at line boundary
    - multiple /* comments on a single line

    Args:
        line: one line of input text, with or without its trailing newline.

    Returns:
        Tuple (noncomment, comment). Joining both parts gives back the
        original line; neither part is stripped. The comment part is an
        empty string when the line contains no comment delimiter.

    Raises:
        AssertionError: the line contains a double quote.
        NotImplementedError: the comment is opened by /* and either does not
            end at the end of the line or is followed by another /*.
    """
    if '"' in line:
        # Explicit raise instead of an assert statement, so that the check is
        # not stripped when Python runs with -O.
        raise AssertionError('lines with " are not supported')

    # Position of the first occurrence of every delimiter present in the line.
    positions = {}
    for delimiter in ("#", "//", "/*"):
        idx = line.find(delimiter)
        if idx != -1:
            positions[delimiter] = idx

    # The comment starts at the earliest delimiter, if there is one.
    data_end_idx = min(positions.values(), default=len(line))

    # Sanity-check /* only when it is the delimiter which opens the comment.
    # A /* located inside a preceding # or // comment is plain comment text
    # and must not trigger the "unsupported" errors below.
    if positions.get("/*") == data_end_idx:
        if not line.rstrip().endswith("*/"):
            raise NotImplementedError(
                "unsupported /* comment, does not end at the end of line", line
            )
        if "/*" in line[data_end_idx + 1 :]:
            raise NotImplementedError(
                "unsupported line with multiple /* comments", line
            )

    noncomment = line[:data_end_idx]
    comment = line[data_end_idx:]
    return noncomment, comment


def parse_line(filein):
    """Iterate over input lines and yield their data and comment parts.

    Every line is split by split_comments() and both parts are stripped of
    surrounding whitespace. Lines whose data part is empty after stripping
    (blank or comment-only lines) are skipped.

    Yields:
        Tuple (non-comment text, comment text) for each remaining line.
    """
    for raw in filein:
        data, remark = (part.strip() for part in split_comments(raw))
        if data:
            yield data, remark


def parse_flags(comments):
    """Return the known flags which occur in the comment text.

    Flags are matched as plain substrings, so they must be spelled exactly
    the way cfg_test prints them. The result follows the order of FLAGS.
    """
    return [known for known in FLAGS if known in comments]


def parse_mapbody(filein):
    """Parse body of a "map" in ISC config format.

    Input lines can be only:
    - whitespace & comments only -> ignore
    - <keyword> <anything>; -> store <anything> as "_grammar" for this keyword
    - <keyword> <anything> { -> parse sub-map and store (optional) <anything> as "_id",
                                producing nested dict under "_mapbody"
    Also store known strings found at the end of line in "_flags".
    For a nested map the flags are taken from its closing }; line.

    Returns:
    - tuple (map dict, map flags) when }; line is reached
    - map dict when we run out of lines without the closing };

    Raises:
    - NotImplementedError when a keyword contains unexpected characters
    - ValueError when a nested map is not terminated by };
    - AssertionError when a statement ends with neither ; nor {, or when
      the input ends without the closing }; and without any statement
    """
    thismap, flags, closed = _parse_map_entries(filein)
    if closed:
        return thismap, flags

    # Ran out of lines: legitimate only at the end of the top-level map-body.
    # Explicit raise instead of an assert statement so that the check is not
    # stripped when Python runs with -O.
    if not thismap:
        raise AssertionError("no statements found in the input")
    return thismap


def _parse_map_entries(filein):
    """Parse map entries until a closing }; line or the end of input.

    Returns:
    - tuple (map dict, flags, closed); "closed" is True when a }; line ended
      the map and "flags" then holds the flags found on that line. At the
      end of input "flags" is an empty list and "closed" is False.
    """
    thismap = {}
    for line, comment in parse_line(filein):
        flags = parse_flags(comment)
        if line == "};":  # end of a nested map
            return thismap, flags, True

        # first word - a map key name
        # beware: some statements do not have parameters, e.g. "null;"
        key = line.split()[0].rstrip(";")
        # map key sanity check
        if not KEY_REGEX.fullmatch(key):
            raise NotImplementedError("suspicious keyword detected", line)

        # omit keyword from the grammar
        grammar = line[len(key) :].strip()
        # also skip final ; or {
        grammar = grammar[:-1].strip()

        entry = {}
        thismap[key] = entry
        if line.endswith("{"):
            # nested map, recurse, but keep "extra identifiers" if any
            subkeys, flags, closed = _parse_map_entries(filein)
            # The missing }; is reported through the explicit "closed"
            # indicator. Detecting it by unpacking the returned value is not
            # reliable: a dict with exactly two keys unpacks without error.
            if not closed:
                raise ValueError("unfinished nested map, missing }; detected")
            if flags:
                entry["_flags"] = flags
            if grammar:
                # for lines which look like "view <name> {" store "<name>"
                entry["_id"] = grammar
            entry["_mapbody"] = subkeys
        else:
            if not line.endswith(";"):
                raise AssertionError("statement does not end with ; or {", line)
            if flags:
                entry["_flags"] = flags
            entry["_grammar"] = grammar

    return thismap, [], False


def main():
    """Parse grammar from stdin or from files named on the command line.

    The parsed result is printed to stdout as JSON indented by four spaces.
    """
    with fileinput.input() as source:
        parsed = parse_mapbody(source)
    rendered = json.dumps(parsed, indent=4)
    print(rendered)


# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()