third_party/rust/jsparagus/js_parser/parse_esgrammar.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545

"""Parse a grammar written in ECMArkup."""

from __future__ import annotations
# mypy: no-implicit-optional

import os
import collections
from typing import Dict, Iterable, Optional, Tuple

from jsparagus import parse_pgen, gen, grammar, extension, types
from jsparagus.lexer import LexicalGrammar
from jsparagus.ordered import OrderedSet, OrderedFrozenSet


ESGrammarLexer = LexicalGrammar(
    # the operators and keywords:
    "[ ] { } , ~ + ? <! = == != => ( ) @ < > ' ; "
    "but empty here lookahead no not of one or returns through Some None impl for let",

    NL="\n",

    # any number of colons together
    EQ=r':+',

    # terminals of the ES grammar, quoted with backticks
    T=r'`[^` \n]+`|```',

    # also terminals, denoting control characters
    CHR=r'<[A-Z ]+>|U\+[0-9A-f]{4}',

    # nonterminals/types that will be followed by parameters
    NTCALL=r'[A-Za-z]\w*(?=[\[<])',

    # nonterminals (also, boolean parameters and type names)
    NT=r'[A-Za-z]\w*',

    # nonterminals wrapped in vertical bars for no apparent reason
    NTALT=r'\|[A-Z]\w+\|',

    # the spec also gives a few productions names
    PRODID=r'#[A-Za-z]\w*',

    # prose not wrapped in square brackets
    # To avoid conflict with the `>` token, this is recognized only after a space.
    PROSE=r'(?<= )>[^\n]*',

    # prose wrapped in square brackets
    WPROSE=r'\[>[^]]*\]',

    # expression denoting a matched terminal or nonterminal
    MATCH_REF=r'\$(?:0|[1-9][0-9]*)',

    # the spec also gives a few productions names
    RUSTCOMMENT=r'//.*\n',
)


ESGrammarParser = gen.compile(
    parse_pgen.load_grammar(
        os.path.join(os.path.dirname(__file__), "esgrammar.pgen")))


SIGIL_FALSE = '~'
SIGIL_TRUE = '+'

# Abbreviations for single-character terminals, used in the lexical grammar.
ECMASCRIPT_CODE_POINTS = {
    # From <https://tc39.es/ecma262/#table-31>
    '<ZWNJ>': grammar.Literal('\u200c'),
    '<ZWJ>': grammar.Literal('\u200d'),
    '<ZWNBSP>': grammar.Literal('\ufeff'),

    # From <https://tc39.es/ecma262/#table-32>
    '<TAB>': grammar.Literal('\t'),
    '<VT>': grammar.Literal('\u000b'),
    '<FF>': grammar.Literal('\u000c'),
    '<SP>': grammar.Literal(' '),
    '<NBSP>': grammar.Literal('\u00a0'),
    # <ZWNBSP> already defined above
    '<USP>': grammar.UnicodeCategory('Zs'),

    # From <https://tc39.es/ecma262/#table-33>
    '<LF>': grammar.Literal('\u000a'),
    '<CR>': grammar.Literal('\u000d'),
    '<LS>': grammar.Literal('\u2028'),
    '<PS>': grammar.Literal('\u2028'),
}


class ESGrammarBuilder:
    def __init__(self, terminal_names):
        # Names of terminals that are written as nonterminals in the grammar.
        # For example, "BooleanLiteral" is a terminal name when parsing the
        # syntactic grammar.
        if terminal_names is None:
            terminal_names = frozenset()
        self.terminal_names = frozenset(terminal_names)
        self.reset()

    def reset(self):
        self.lexer = None
        # This is how full-parsing and lazy-parsing are implemented, using
        # different traits.
        #
        # This field contains the Rust's trait used for calling the method.
        # When a CallMethod is generated, it is assumed to be a function of
        # this trait. The trait is used by the Rust backend to generate
        # multiple backends which are implementing different set of traits.
        # Having the trait on the function call is useful as a way to filter
        # functions calls at code-generation time.
        #
        # This field is updated by the `rust_param_impl`, which is used in
        # grammar extensions, and visited before producing any CallMethod.
        self.method_trait = "AstBuilder"

    def rust_edsl(self, impl, grammar):
        return extension.GrammarExtension(impl, grammar, self.lexer.filename)

    def rust_param_impl(self, trait, for_type, param):
        self.method_trait = trait
        return extension.ImplFor(param, trait, for_type)

    def rust_impl(self, trait, impl_type):
        return self.rust_param_impl(trait, impl_type, [])

    def rust_nt_def(self, lhs, rhs_line):
        # Right now, only focus on the syntactic grammar, and assume that all
        # rules are patching existing grammar production by adding code.
        return extension.ExtPatch(self.nt_def(None, lhs, ':', [rhs_line]))

    def rust_rhs_line(self, symbols):
        return self.rhs_line(None, symbols, None, None)

    def rust_expr(self, expr):
        assert isinstance(expr, grammar.CallMethod)
        return expr

    def empty(self):
        return []

    def single(self, x):
        return [x]

    def append(self, x, y):
        return x + [y]

    def concat(self, x, y):
        return x + y

    def blank_line(self):
        return []

    def nt_def_to_list(self, nt_def):
        return [nt_def]

    def to_production(self, lhs, i, rhs, is_sole_production):
        """Wrap a list of grammar symbols `rhs` in a Production object."""
        body, reducer, condition = rhs
        if reducer is None:
            reducer = self.default_reducer(lhs, i, body, is_sole_production)
        return grammar.Production(body, reducer, condition=condition)

    def default_reducer(self, lhs, i, body, is_sole_production):
        assert isinstance(lhs, grammar.Nt)
        nt_name = lhs.name

        nargs = sum(1 for e in body if grammar.is_concrete_element(e))
        if is_sole_production:
            method_name = nt_name
        else:
            method_name = '{} {}'.format(nt_name, i)
        return self.expr_call(method_name, tuple(range(nargs)), None)

    def needs_asi(self, lhs, p):
        """True if p is a production in which ASI can happen."""
        # The purpose of the fake ForLexicalDeclaration production is to have a
        # copy of LexicalDeclaration that does not trigger ASI.
        #
        # Two productions have body == [";"] -- one for EmptyStatement and one
        # for ClassMember. Neither should trigger ASI.
        #
        # The only other semicolons that should not trigger ASI are the ones in
        # `for` statement productions, which happen to be exactly those
        # semicolons that are not at the end of a production.
        return (not (isinstance(lhs, grammar.Nt)
                     and lhs.name == 'ForLexicalDeclaration')
                and len(p.body) > 1
                and p.body[-1] == ';')

    def apply_asi(self, p, reducer_was_autogenerated):
        """Return two rules based on p, so that ASI can be applied."""
        assert isinstance(p.reducer, grammar.CallMethod)

        if reducer_was_autogenerated:
            # Don't pass the semicolon to the method.
            reducer = self.expr_call(p.reducer.method,
                                     p.reducer.args[:-1],
                                     None)
        else:
            reducer = p.reducer

        # Except for do-while loops, check at runtime that ASI occurs only at
        # the end of a line.
        if (len(p.body) == 7
                and p.body[0] == 'do'
                and p.body[2] == 'while'
                and p.body[3] == '('
                and p.body[5] == ')'
                and p.body[6] == ';'):
            code = "do_while_asi"
        else:
            code = "asi"

        return [
            # The preferred production, with the semicolon in.
            p.copy_with(body=p.body[:],
                        reducer=reducer),
            # The fallback production, performing ASI.
            p.copy_with(body=p.body[:-1] + [grammar.ErrorSymbol(code)],
                        reducer=reducer),
        ]

    def expand_lexical_rhs(self, rhs):
        body, reducer, condition = rhs
        out = []
        for e in body:
            if isinstance(e, str):
                # The terminal symbols of the lexical grammar are characters, so
                # add each character of this string as a separate element.
                out += [grammar.Literal(ch) for ch in e]
            else:
                out.append(e)
        return [out, reducer, condition]

    def nt_def(self, nt_type, lhs, eq, rhs_list):
        has_sole_production = (len(rhs_list) == 1)
        production_list = []
        for i, rhs in enumerate(rhs_list):
            if eq == ':':
                # Syntactic grammar. A hack is needed for ASI.
                reducer_was_autogenerated = rhs[1] is None
                p = self.to_production(lhs, i, rhs, has_sole_production)
                if self.needs_asi(lhs, p):
                    production_list += self.apply_asi(p, reducer_was_autogenerated)
                else:
                    production_list.append(p)
            elif eq == '::':
                # Lexical grammar. A hack is needed to replace multicharacter
                # terminals like `!==` into sequences of character terminals.
                rhs = self.expand_lexical_rhs(rhs)
                p = self.to_production(lhs, i, rhs, has_sole_production)
                production_list.append(p)
        return (lhs.name, eq, grammar.NtDef(lhs.args, production_list, nt_type))

    def nt_def_one_of(self, nt_type, nt_lhs, eq, terminals):
        return self.nt_def(nt_type, nt_lhs, eq, [([t], None, None) for t in terminals])

    def nt_lhs_no_params(self, name):
        return grammar.Nt(name, ())

    def nt_lhs_with_params(self, name, params):
        return grammar.Nt(name, tuple(params))

    def simple_type(self, name):
        return types.Type(name)

    def lifetime_type(self, name):
        return types.Lifetime(name)

    def parameterized_type(self, name, args):
        return types.Type(name, tuple(args))

    def t_list_line(self, terminals):
        return terminals

    def terminal(self, t):
        assert t[0] == "`"
        assert t[-1] == "`"
        return t[1:-1]

    def terminal_chr(self, chr):
        raise ValueError("FAILED: %r" % chr)

    def rhs_line(self, ifdef, rhs, reducer, _prodid):
        return (rhs, reducer, ifdef)

    def rhs_line_prose(self, prose):
        return ([prose], None, None)

    def empty_rhs(self):
        return []

    def expr_match_ref(self, token):
        assert token.startswith('$')
        return int(token[1:])

    def expr_call(self, method, args, fallible):
        # NOTE: Currently "AstBuilder" functions are made fallible using the
        # fallible_methods taken from some Rust code which extract this
        # information to produce a JSON file.
        if self.method_trait == "AstBuilder":
            fallible = None
        return grammar.CallMethod(method, args or (), types.Type(self.method_trait),
                                  fallible is not None)

    def expr_some(self, expr):
        return grammar.Some(expr)

    def expr_none(self):
        return None

    def ifdef(self, value, nt):
        return nt, value

    def optional(self, nt):
        return grammar.Optional(nt)

    def but_not(self, nt, exclusion):
        _, exclusion = exclusion
        return grammar.Exclude(nt, [exclusion])
        # return ('-', nt, exclusion)

    def but_not_one_of(self, nt, exclusion_list):
        exclusion_list = [exclusion for _, exclusion in exclusion_list]
        return grammar.Exclude(nt, exclusion_list)
        # return ('-', nt, exclusion_list)

    def no_line_terminator_here(self, lt):
        if lt not in ('LineTerminator', '|LineTerminator|'):
            raise ValueError("unrecognized directive " + repr("[no " + lt + " here]"))
        return grammar.NoLineTerminatorHere

    def nonterminal(self, name):
        if name in self.terminal_names:
            return name
        return grammar.Nt(name, ())

    def nonterminal_apply(self, name, args):
        if name in self.terminal_names:
            raise ValueError("parameters applied to terminal {!r}".format(name))
        if len(set(k for k, expr in args)) != len(args):
            raise ValueError("parameter passed multiple times")
        return grammar.Nt(name, tuple(args))

    def arg_expr(self, sigil, argname):
        if sigil == '?':
            return (argname, grammar.Var(argname))
        else:
            return (argname, sigil)

    def sigil_false(self):
        return False

    def sigil_true(self):
        return True

    def exclusion_terminal(self, t):
        return ("t", t)

    def exclusion_nonterminal(self, nt):
        return ("nt", nt)

    def exclusion_chr_range(self, c1, c2):
        return ("range", c1, c2)

    def la_eq(self, t):
        return grammar.LookaheadRule(OrderedFrozenSet([t]), True)

    def la_ne(self, t):
        return grammar.LookaheadRule(OrderedFrozenSet([t]), False)

    def la_not_in_nonterminal(self, nt):
        return grammar.LookaheadRule(OrderedFrozenSet([nt]), False)

    def la_not_in_set(self, lookahead_exclusions):
        if all(len(excl) == 1 for excl in lookahead_exclusions):
            return grammar.LookaheadRule(
                OrderedFrozenSet(excl[0] for excl in lookahead_exclusions),
                False)
        raise ValueError("unsupported: lookahead > 1 token, {!r}"
                         .format(lookahead_exclusions))

    def chr(self, t):
        assert t[0] == "<" or t[0] == 'U'
        if t[0] == "<":
            assert t[-1] == ">"
            if t not in ECMASCRIPT_CODE_POINTS:
                raise ValueError("unrecognized character abbreviation {!r}".format(t))
            return ECMASCRIPT_CODE_POINTS[t]
        else:
            assert t[1] == "+"
            return grammar.Literal(chr(int(t[2:], base=16)))


def finish_grammar(nt_defs, goals, variable_terminals, synthetic_terminals,
                   single_grammar=True, extensions=[]):
    nt_grammars = {}
    for nt_name, eq, _ in nt_defs:
        if nt_name in nt_grammars:
            raise ValueError(
                "duplicate definitions for nonterminal {!r}"
                .format(nt_name))
        nt_grammars[nt_name] = eq

    # Figure out which grammar we were trying to get (":" for syntactic,
    # "::" for lexical) based on the goal symbols.
    goals = list(goals)
    if len(goals) == 0:
        raise ValueError("no goal nonterminals specified")
    if single_grammar:
        selected_grammars = set(nt_grammars[goal] for goal in goals)
        assert len(selected_grammars) != 0
        if len(selected_grammars) > 1:
            raise ValueError(
                "all goal nonterminals must be part of the same grammar; "
                "got {!r} (matching these grammars: {!r})"
                .format(set(goals), set(selected_grammars)))
        [selected_grammar] = selected_grammars

    terminal_set = set()

    def hack_production(p):
        for i, e in enumerate(p.body):
            if isinstance(e, str) and e[:1] == "`":
                if len(e) < 3 or e[-1:] != "`":
                    raise ValueError(
                        "Unrecognized grammar symbol: {!r} (in {!r})"
                        .format(e, p))
                p[i] = token = e[1:-1]
                terminal_set.add(token)

    nonterminals = {}
    for nt_name, eq, rhs_list_or_lambda in nt_defs:
        if single_grammar and eq != selected_grammar:
            continue

        if isinstance(rhs_list_or_lambda, grammar.NtDef):
            nonterminals[nt_name] = rhs_list_or_lambda
        else:
            rhs_list = rhs_list_or_lambda
            for p in rhs_list:
                if not isinstance(p, grammar.Production):
                    raise ValueError(
                        "invalid grammar: ifdef in non-function-call context")
                hack_production(p)
            if nt_name in nonterminals:
                raise ValueError(
                    "unsupported: multiple definitions for nt " + nt_name)
            nonterminals[nt_name] = rhs_list

    for t in terminal_set:
        if t in nonterminals:
            raise ValueError(
                "grammar contains both a terminal `{}` and nonterminal {}"
                .format(t, t))

    # Add execution modes to generate the various functions needed to handle
    # syntax parsing and full parsing execution modes.
    exec_modes = collections.defaultdict(OrderedSet)
    noop_parser = types.Type("ParserTrait", (types.Lifetime("alloc"), types.UnitType))
    token_parser = types.Type("ParserTrait", (
        types.Lifetime("alloc"), types.Type("StackValue", (types.Lifetime("alloc"),))))
    ast_builder = types.Type("AstBuilderDelegate", (types.Lifetime("alloc"),))

    # Full parsing takes token as input and build an AST.
    exec_modes["full_actions"].extend([token_parser, ast_builder])

    # Syntax parsing takes token as input but skip building the AST.
    # TODO: The syntax parser is commented out for now, as we need something to
    # be produced when we cannot call the AstBuilder for producing the values.

    # No-op parsing is used for the simulator, which is so far used for
    # querying whether we can end the incremental input and lookup if a state
    # can accept some kind of tokens.
    exec_modes["noop_actions"].add(noop_parser)

    # Extensions are using an equivalent of Rust types to define the kind of
    # parsers to be used, this map is used to convert these type names to the
    # various execution modes.
    full_parser = types.Type("FullParser")
    syntax_parser = types.Type("SyntaxParser")
    noop_parser = types.Type("NoopParser")
    type_to_modes = {
        noop_parser: ["noop_actions", "full_actions"],
        syntax_parser: ["full_actions"],
        full_parser: ["full_actions"],
    }

    result = grammar.Grammar(
        nonterminals,
        goal_nts=goals,
        variable_terminals=variable_terminals,
        synthetic_terminals=synthetic_terminals,
        exec_modes=exec_modes,
        type_to_modes=type_to_modes)
    result.patch(extensions)
    return result


def parse_esgrammar(
        text: str,
        *,
        filename: Optional[str] = None,
        extensions: Iterable[Tuple[os.PathLike, int, str]] = (),
        goals: Optional[Iterable[str]] = None,
        terminal_names: Iterable[str] = (),
        synthetic_terminals: Optional[Dict[str, OrderedSet[str]]] = None,
        single_grammar: bool = True
) -> grammar.Grammar:
    if not text.endswith("\n\n"):
        # Horrible hack: add a blank line at the end of the document so that
        # the esgrammar grammar can use newlines as delimiters. :-P
        text += "\n"

    terminal_names = frozenset(terminal_names)
    if synthetic_terminals is None:
        synthetic_terminals = {}

    builder = ESGrammarBuilder(terminal_names)
    parser = ESGrammarParser(builder=builder, goal="grammar")
    lexer = ESGrammarLexer(parser, filename=filename)
    lexer.write(text)
    nt_defs = lexer.close()
    grammar_extensions = []
    for ext_filename, start_lineno, content in extensions:
        builder.reset()
        parser = ESGrammarParser(builder=builder, goal="rust_edsl")
        lexer = ESGrammarLexer(parser, filename=ext_filename)
        builder.lexer = lexer
        lexer.start_lineno = start_lineno
        lexer.write(content)
        result = lexer.close()
        grammar_extensions.append(result)

    if goals is None:
        # Default to the first nonterminal in the input.
        goals = [nt_defs[0][0]]

    return finish_grammar(
        nt_defs,
        goals=goals,
        variable_terminals=terminal_names - frozenset(synthetic_terminals),
        synthetic_terminals=synthetic_terminals,
        single_grammar=single_grammar,
        extensions=grammar_extensions)