diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-28 14:29:10 +0000 |
commit | 2aa4a82499d4becd2284cdb482213d541b8804dd (patch) | |
tree | b80bf8bf13c3766139fbacc530efd0dd9d54394c /third_party/rust/jsparagus/js_parser/parse_esgrammar.py | |
parent | Initial commit. (diff) | |
download | firefox-2aa4a82499d4becd2284cdb482213d541b8804dd.tar.xz firefox-2aa4a82499d4becd2284cdb482213d541b8804dd.zip |
Adding upstream version 86.0.1.upstream/86.0.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/jsparagus/js_parser/parse_esgrammar.py')
-rw-r--r-- | third_party/rust/jsparagus/js_parser/parse_esgrammar.py | 545 |
1 files changed, 545 insertions, 0 deletions
diff --git a/third_party/rust/jsparagus/js_parser/parse_esgrammar.py b/third_party/rust/jsparagus/js_parser/parse_esgrammar.py new file mode 100644 index 0000000000..efcb640406 --- /dev/null +++ b/third_party/rust/jsparagus/js_parser/parse_esgrammar.py @@ -0,0 +1,545 @@ +"""Parse a grammar written in ECMArkup.""" + +from __future__ import annotations +# mypy: no-implicit-optional + +import os +import collections +from typing import Dict, Iterable, Optional, Tuple + +from jsparagus import parse_pgen, gen, grammar, extension, types +from jsparagus.lexer import LexicalGrammar +from jsparagus.ordered import OrderedSet, OrderedFrozenSet + + +ESGrammarLexer = LexicalGrammar( + # the operators and keywords: + "[ ] { } , ~ + ? <! = == != => ( ) @ < > ' ; " + "but empty here lookahead no not of one or returns through Some None impl for let", + + NL="\n", + + # any number of colons together + EQ=r':+', + + # terminals of the ES grammar, quoted with backticks + T=r'`[^` \n]+`|```', + + # also terminals, denoting control characters + CHR=r'<[A-Z ]+>|U\+[0-9A-f]{4}', + + # nonterminals/types that will be followed by parameters + NTCALL=r'[A-Za-z]\w*(?=[\[<])', + + # nonterminals (also, boolean parameters and type names) + NT=r'[A-Za-z]\w*', + + # nonterminals wrapped in vertical bars for no apparent reason + NTALT=r'\|[A-Z]\w+\|', + + # the spec also gives a few productions names + PRODID=r'#[A-Za-z]\w*', + + # prose not wrapped in square brackets + # To avoid conflict with the `>` token, this is recognized only after a space. + PROSE=r'(?<= )>[^\n]*', + + # prose wrapped in square brackets + WPROSE=r'\[>[^]]*\]', + + # expression denoting a matched terminal or nonterminal + MATCH_REF=r'\$(?:0|[1-9][0-9]*)', + + # the spec also gives a few productions names + RUSTCOMMENT=r'//.*\n', +) + + +ESGrammarParser = gen.compile( + parse_pgen.load_grammar( + os.path.join(os.path.dirname(__file__), "esgrammar.pgen"))) + + +SIGIL_FALSE = '~' +SIGIL_TRUE = '+' + +# Abbreviations for single-character terminals, used in the lexical grammar. +ECMASCRIPT_CODE_POINTS = { + # From <https://tc39.es/ecma262/#table-31> + '<ZWNJ>': grammar.Literal('\u200c'), + '<ZWJ>': grammar.Literal('\u200d'), + '<ZWNBSP>': grammar.Literal('\ufeff'), + + # From <https://tc39.es/ecma262/#table-32> + '<TAB>': grammar.Literal('\t'), + '<VT>': grammar.Literal('\u000b'), + '<FF>': grammar.Literal('\u000c'), + '<SP>': grammar.Literal(' '), + '<NBSP>': grammar.Literal('\u00a0'), + # <ZWNBSP> already defined above + '<USP>': grammar.UnicodeCategory('Zs'), + + # From <https://tc39.es/ecma262/#table-33> + '<LF>': grammar.Literal('\u000a'), + '<CR>': grammar.Literal('\u000d'), + '<LS>': grammar.Literal('\u2028'), + '<PS>': grammar.Literal('\u2028'), +} + + +class ESGrammarBuilder: + def __init__(self, terminal_names): + # Names of terminals that are written as nonterminals in the grammar. + # For example, "BooleanLiteral" is a terminal name when parsing the + # syntactic grammar. + if terminal_names is None: + terminal_names = frozenset() + self.terminal_names = frozenset(terminal_names) + self.reset() + + def reset(self): + self.lexer = None + # This is how full-parsing and lazy-parsing are implemented, using + # different traits. + # + # This field contains the Rust's trait used for calling the method. + # When a CallMethod is generated, it is assumed to be a function of + # this trait. The trait is used by the Rust backend to generate + # multiple backends which are implementing different set of traits. + # Having the trait on the function call is useful as a way to filter + # functions calls at code-generation time. + # + # This field is updated by the `rust_param_impl`, which is used in + # grammar extensions, and visited before producing any CallMethod. + self.method_trait = "AstBuilder" + + def rust_edsl(self, impl, grammar): + return extension.GrammarExtension(impl, grammar, self.lexer.filename) + + def rust_param_impl(self, trait, for_type, param): + self.method_trait = trait + return extension.ImplFor(param, trait, for_type) + + def rust_impl(self, trait, impl_type): + return self.rust_param_impl(trait, impl_type, []) + + def rust_nt_def(self, lhs, rhs_line): + # Right now, only focus on the syntactic grammar, and assume that all + # rules are patching existing grammar production by adding code. + return extension.ExtPatch(self.nt_def(None, lhs, ':', [rhs_line])) + + def rust_rhs_line(self, symbols): + return self.rhs_line(None, symbols, None, None) + + def rust_expr(self, expr): + assert isinstance(expr, grammar.CallMethod) + return expr + + def empty(self): + return [] + + def single(self, x): + return [x] + + def append(self, x, y): + return x + [y] + + def concat(self, x, y): + return x + y + + def blank_line(self): + return [] + + def nt_def_to_list(self, nt_def): + return [nt_def] + + def to_production(self, lhs, i, rhs, is_sole_production): + """Wrap a list of grammar symbols `rhs` in a Production object.""" + body, reducer, condition = rhs + if reducer is None: + reducer = self.default_reducer(lhs, i, body, is_sole_production) + return grammar.Production(body, reducer, condition=condition) + + def default_reducer(self, lhs, i, body, is_sole_production): + assert isinstance(lhs, grammar.Nt) + nt_name = lhs.name + + nargs = sum(1 for e in body if grammar.is_concrete_element(e)) + if is_sole_production: + method_name = nt_name + else: + method_name = '{} {}'.format(nt_name, i) + return self.expr_call(method_name, tuple(range(nargs)), None) + + def needs_asi(self, lhs, p): + """True if p is a production in which ASI can happen.""" + # The purpose of the fake ForLexicalDeclaration production is to have a + # copy of LexicalDeclaration that does not trigger ASI. + # + # Two productions have body == [";"] -- one for EmptyStatement and one + # for ClassMember. Neither should trigger ASI. + # + # The only other semicolons that should not trigger ASI are the ones in + # `for` statement productions, which happen to be exactly those + # semicolons that are not at the end of a production. + return (not (isinstance(lhs, grammar.Nt) + and lhs.name == 'ForLexicalDeclaration') + and len(p.body) > 1 + and p.body[-1] == ';') + + def apply_asi(self, p, reducer_was_autogenerated): + """Return two rules based on p, so that ASI can be applied.""" + assert isinstance(p.reducer, grammar.CallMethod) + + if reducer_was_autogenerated: + # Don't pass the semicolon to the method. + reducer = self.expr_call(p.reducer.method, + p.reducer.args[:-1], + None) + else: + reducer = p.reducer + + # Except for do-while loops, check at runtime that ASI occurs only at + # the end of a line. + if (len(p.body) == 7 + and p.body[0] == 'do' + and p.body[2] == 'while' + and p.body[3] == '(' + and p.body[5] == ')' + and p.body[6] == ';'): + code = "do_while_asi" + else: + code = "asi" + + return [ + # The preferred production, with the semicolon in. + p.copy_with(body=p.body[:], + reducer=reducer), + # The fallback production, performing ASI. + p.copy_with(body=p.body[:-1] + [grammar.ErrorSymbol(code)], + reducer=reducer), + ] + + def expand_lexical_rhs(self, rhs): + body, reducer, condition = rhs + out = [] + for e in body: + if isinstance(e, str): + # The terminal symbols of the lexical grammar are characters, so + # add each character of this string as a separate element. + out += [grammar.Literal(ch) for ch in e] + else: + out.append(e) + return [out, reducer, condition] + + def nt_def(self, nt_type, lhs, eq, rhs_list): + has_sole_production = (len(rhs_list) == 1) + production_list = [] + for i, rhs in enumerate(rhs_list): + if eq == ':': + # Syntactic grammar. A hack is needed for ASI. + reducer_was_autogenerated = rhs[1] is None + p = self.to_production(lhs, i, rhs, has_sole_production) + if self.needs_asi(lhs, p): + production_list += self.apply_asi(p, reducer_was_autogenerated) + else: + production_list.append(p) + elif eq == '::': + # Lexical grammar. A hack is needed to replace multicharacter + # terminals like `!==` into sequences of character terminals. + rhs = self.expand_lexical_rhs(rhs) + p = self.to_production(lhs, i, rhs, has_sole_production) + production_list.append(p) + return (lhs.name, eq, grammar.NtDef(lhs.args, production_list, nt_type)) + + def nt_def_one_of(self, nt_type, nt_lhs, eq, terminals): + return self.nt_def(nt_type, nt_lhs, eq, [([t], None, None) for t in terminals]) + + def nt_lhs_no_params(self, name): + return grammar.Nt(name, ()) + + def nt_lhs_with_params(self, name, params): + return grammar.Nt(name, tuple(params)) + + def simple_type(self, name): + return types.Type(name) + + def lifetime_type(self, name): + return types.Lifetime(name) + + def parameterized_type(self, name, args): + return types.Type(name, tuple(args)) + + def t_list_line(self, terminals): + return terminals + + def terminal(self, t): + assert t[0] == "`" + assert t[-1] == "`" + return t[1:-1] + + def terminal_chr(self, chr): + raise ValueError("FAILED: %r" % chr) + + def rhs_line(self, ifdef, rhs, reducer, _prodid): + return (rhs, reducer, ifdef) + + def rhs_line_prose(self, prose): + return ([prose], None, None) + + def empty_rhs(self): + return [] + + def expr_match_ref(self, token): + assert token.startswith('$') + return int(token[1:]) + + def expr_call(self, method, args, fallible): + # NOTE: Currently "AstBuilder" functions are made fallible using the + # fallible_methods taken from some Rust code which extract this + # information to produce a JSON file. + if self.method_trait == "AstBuilder": + fallible = None + return grammar.CallMethod(method, args or (), types.Type(self.method_trait), + fallible is not None) + + def expr_some(self, expr): + return grammar.Some(expr) + + def expr_none(self): + return None + + def ifdef(self, value, nt): + return nt, value + + def optional(self, nt): + return grammar.Optional(nt) + + def but_not(self, nt, exclusion): + _, exclusion = exclusion + return grammar.Exclude(nt, [exclusion]) + # return ('-', nt, exclusion) + + def but_not_one_of(self, nt, exclusion_list): + exclusion_list = [exclusion for _, exclusion in exclusion_list] + return grammar.Exclude(nt, exclusion_list) + # return ('-', nt, exclusion_list) + + def no_line_terminator_here(self, lt): + if lt not in ('LineTerminator', '|LineTerminator|'): + raise ValueError("unrecognized directive " + repr("[no " + lt + " here]")) + return grammar.NoLineTerminatorHere + + def nonterminal(self, name): + if name in self.terminal_names: + return name + return grammar.Nt(name, ()) + + def nonterminal_apply(self, name, args): + if name in self.terminal_names: + raise ValueError("parameters applied to terminal {!r}".format(name)) + if len(set(k for k, expr in args)) != len(args): + raise ValueError("parameter passed multiple times") + return grammar.Nt(name, tuple(args)) + + def arg_expr(self, sigil, argname): + if sigil == '?': + return (argname, grammar.Var(argname)) + else: + return (argname, sigil) + + def sigil_false(self): + return False + + def sigil_true(self): + return True + + def exclusion_terminal(self, t): + return ("t", t) + + def exclusion_nonterminal(self, nt): + return ("nt", nt) + + def exclusion_chr_range(self, c1, c2): + return ("range", c1, c2) + + def la_eq(self, t): + return grammar.LookaheadRule(OrderedFrozenSet([t]), True) + + def la_ne(self, t): + return grammar.LookaheadRule(OrderedFrozenSet([t]), False) + + def la_not_in_nonterminal(self, nt): + return grammar.LookaheadRule(OrderedFrozenSet([nt]), False) + + def la_not_in_set(self, lookahead_exclusions): + if all(len(excl) == 1 for excl in lookahead_exclusions): + return grammar.LookaheadRule( + OrderedFrozenSet(excl[0] for excl in lookahead_exclusions), + False) + raise ValueError("unsupported: lookahead > 1 token, {!r}" + .format(lookahead_exclusions)) + + def chr(self, t): + assert t[0] == "<" or t[0] == 'U' + if t[0] == "<": + assert t[-1] == ">" + if t not in ECMASCRIPT_CODE_POINTS: + raise ValueError("unrecognized character abbreviation {!r}".format(t)) + return ECMASCRIPT_CODE_POINTS[t] + else: + assert t[1] == "+" + return grammar.Literal(chr(int(t[2:], base=16))) + + +def finish_grammar(nt_defs, goals, variable_terminals, synthetic_terminals, + single_grammar=True, extensions=[]): + nt_grammars = {} + for nt_name, eq, _ in nt_defs: + if nt_name in nt_grammars: + raise ValueError( + "duplicate definitions for nonterminal {!r}" + .format(nt_name)) + nt_grammars[nt_name] = eq + + # Figure out which grammar we were trying to get (":" for syntactic, + # "::" for lexical) based on the goal symbols. + goals = list(goals) + if len(goals) == 0: + raise ValueError("no goal nonterminals specified") + if single_grammar: + selected_grammars = set(nt_grammars[goal] for goal in goals) + assert len(selected_grammars) != 0 + if len(selected_grammars) > 1: + raise ValueError( + "all goal nonterminals must be part of the same grammar; " + "got {!r} (matching these grammars: {!r})" + .format(set(goals), set(selected_grammars))) + [selected_grammar] = selected_grammars + + terminal_set = set() + + def hack_production(p): + for i, e in enumerate(p.body): + if isinstance(e, str) and e[:1] == "`": + if len(e) < 3 or e[-1:] != "`": + raise ValueError( + "Unrecognized grammar symbol: {!r} (in {!r})" + .format(e, p)) + p[i] = token = e[1:-1] + terminal_set.add(token) + + nonterminals = {} + for nt_name, eq, rhs_list_or_lambda in nt_defs: + if single_grammar and eq != selected_grammar: + continue + + if isinstance(rhs_list_or_lambda, grammar.NtDef): + nonterminals[nt_name] = rhs_list_or_lambda + else: + rhs_list = rhs_list_or_lambda + for p in rhs_list: + if not isinstance(p, grammar.Production): + raise ValueError( + "invalid grammar: ifdef in non-function-call context") + hack_production(p) + if nt_name in nonterminals: + raise ValueError( + "unsupported: multiple definitions for nt " + nt_name) + nonterminals[nt_name] = rhs_list + + for t in terminal_set: + if t in nonterminals: + raise ValueError( + "grammar contains both a terminal `{}` and nonterminal {}" + .format(t, t)) + + # Add execution modes to generate the various functions needed to handle + # syntax parsing and full parsing execution modes. + exec_modes = collections.defaultdict(OrderedSet) + noop_parser = types.Type("ParserTrait", (types.Lifetime("alloc"), types.UnitType)) + token_parser = types.Type("ParserTrait", ( + types.Lifetime("alloc"), types.Type("StackValue", (types.Lifetime("alloc"),)))) + ast_builder = types.Type("AstBuilderDelegate", (types.Lifetime("alloc"),)) + + # Full parsing takes token as input and build an AST. + exec_modes["full_actions"].extend([token_parser, ast_builder]) + + # Syntax parsing takes token as input but skip building the AST. + # TODO: The syntax parser is commented out for now, as we need something to + # be produced when we cannot call the AstBuilder for producing the values. + + # No-op parsing is used for the simulator, which is so far used for + # querying whether we can end the incremental input and lookup if a state + # can accept some kind of tokens. + exec_modes["noop_actions"].add(noop_parser) + + # Extensions are using an equivalent of Rust types to define the kind of + # parsers to be used, this map is used to convert these type names to the + # various execution modes. + full_parser = types.Type("FullParser") + syntax_parser = types.Type("SyntaxParser") + noop_parser = types.Type("NoopParser") + type_to_modes = { + noop_parser: ["noop_actions", "full_actions"], + syntax_parser: ["full_actions"], + full_parser: ["full_actions"], + } + + result = grammar.Grammar( + nonterminals, + goal_nts=goals, + variable_terminals=variable_terminals, + synthetic_terminals=synthetic_terminals, + exec_modes=exec_modes, + type_to_modes=type_to_modes) + result.patch(extensions) + return result + + +def parse_esgrammar( + text: str, + *, + filename: Optional[str] = None, + extensions: Iterable[Tuple[os.PathLike, int, str]] = (), + goals: Optional[Iterable[str]] = None, + terminal_names: Iterable[str] = (), + synthetic_terminals: Optional[Dict[str, OrderedSet[str]]] = None, + single_grammar: bool = True +) -> grammar.Grammar: + if not text.endswith("\n\n"): + # Horrible hack: add a blank line at the end of the document so that + # the esgrammar grammar can use newlines as delimiters. :-P + text += "\n" + + terminal_names = frozenset(terminal_names) + if synthetic_terminals is None: + synthetic_terminals = {} + + builder = ESGrammarBuilder(terminal_names) + parser = ESGrammarParser(builder=builder, goal="grammar") + lexer = ESGrammarLexer(parser, filename=filename) + lexer.write(text) + nt_defs = lexer.close() + grammar_extensions = [] + for ext_filename, start_lineno, content in extensions: + builder.reset() + parser = ESGrammarParser(builder=builder, goal="rust_edsl") + lexer = ESGrammarLexer(parser, filename=ext_filename) + builder.lexer = lexer + lexer.start_lineno = start_lineno + lexer.write(content) + result = lexer.close() + grammar_extensions.append(result) + + if goals is None: + # Default to the first nonterminal in the input. + goals = [nt_defs[0][0]] + + return finish_grammar( + nt_defs, + goals=goals, + variable_terminals=terminal_names - frozenset(synthetic_terminals), + synthetic_terminals=synthetic_terminals, + single_grammar=single_grammar, + extensions=grammar_extensions) |