summaryrefslogtreecommitdiffstats
path: root/third_party/rust/jsparagus/js_parser/parse_esgrammar.py
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-28 14:29:10 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-28 14:29:10 +0000
commit2aa4a82499d4becd2284cdb482213d541b8804dd (patch)
treeb80bf8bf13c3766139fbacc530efd0dd9d54394c /third_party/rust/jsparagus/js_parser/parse_esgrammar.py
parentInitial commit. (diff)
downloadfirefox-2aa4a82499d4becd2284cdb482213d541b8804dd.tar.xz
firefox-2aa4a82499d4becd2284cdb482213d541b8804dd.zip
Adding upstream version 86.0.1.upstream/86.0.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/jsparagus/js_parser/parse_esgrammar.py')
-rw-r--r--third_party/rust/jsparagus/js_parser/parse_esgrammar.py545
1 files changed, 545 insertions, 0 deletions
diff --git a/third_party/rust/jsparagus/js_parser/parse_esgrammar.py b/third_party/rust/jsparagus/js_parser/parse_esgrammar.py
new file mode 100644
index 0000000000..efcb640406
--- /dev/null
+++ b/third_party/rust/jsparagus/js_parser/parse_esgrammar.py
@@ -0,0 +1,545 @@
+"""Parse a grammar written in ECMArkup."""
+
+from __future__ import annotations
+# mypy: no-implicit-optional
+
+import os
+import collections
+from typing import Dict, Iterable, Optional, Tuple
+
+from jsparagus import parse_pgen, gen, grammar, extension, types
+from jsparagus.lexer import LexicalGrammar
+from jsparagus.ordered import OrderedSet, OrderedFrozenSet
+
+
+ESGrammarLexer = LexicalGrammar(
+ # the operators and keywords:
+ "[ ] { } , ~ + ? <! = == != => ( ) @ < > ' ; "
+ "but empty here lookahead no not of one or returns through Some None impl for let",
+
+ NL="\n",
+
+ # any number of colons together
+ EQ=r':+',
+
+ # terminals of the ES grammar, quoted with backticks
+ T=r'`[^` \n]+`|```',
+
+ # also terminals, denoting control characters
+ CHR=r'<[A-Z ]+>|U\+[0-9A-f]{4}',
+
+ # nonterminals/types that will be followed by parameters
+ NTCALL=r'[A-Za-z]\w*(?=[\[<])',
+
+ # nonterminals (also, boolean parameters and type names)
+ NT=r'[A-Za-z]\w*',
+
+ # nonterminals wrapped in vertical bars for no apparent reason
+ NTALT=r'\|[A-Z]\w+\|',
+
+ # the spec also gives a few productions names
+ PRODID=r'#[A-Za-z]\w*',
+
+ # prose not wrapped in square brackets
+ # To avoid conflict with the `>` token, this is recognized only after a space.
+ PROSE=r'(?<= )>[^\n]*',
+
+ # prose wrapped in square brackets
+ WPROSE=r'\[>[^]]*\]',
+
+ # expression denoting a matched terminal or nonterminal
+ MATCH_REF=r'\$(?:0|[1-9][0-9]*)',
+
+ # the spec also gives a few productions names
+ RUSTCOMMENT=r'//.*\n',
+)
+
+
+ESGrammarParser = gen.compile(
+ parse_pgen.load_grammar(
+ os.path.join(os.path.dirname(__file__), "esgrammar.pgen")))
+
+
+SIGIL_FALSE = '~'
+SIGIL_TRUE = '+'
+
+# Abbreviations for single-character terminals, used in the lexical grammar.
+ECMASCRIPT_CODE_POINTS = {
+ # From <https://tc39.es/ecma262/#table-31>
+ '<ZWNJ>': grammar.Literal('\u200c'),
+ '<ZWJ>': grammar.Literal('\u200d'),
+ '<ZWNBSP>': grammar.Literal('\ufeff'),
+
+ # From <https://tc39.es/ecma262/#table-32>
+ '<TAB>': grammar.Literal('\t'),
+ '<VT>': grammar.Literal('\u000b'),
+ '<FF>': grammar.Literal('\u000c'),
+ '<SP>': grammar.Literal(' '),
+ '<NBSP>': grammar.Literal('\u00a0'),
+ # <ZWNBSP> already defined above
+ '<USP>': grammar.UnicodeCategory('Zs'),
+
+ # From <https://tc39.es/ecma262/#table-33>
+ '<LF>': grammar.Literal('\u000a'),
+ '<CR>': grammar.Literal('\u000d'),
+ '<LS>': grammar.Literal('\u2028'),
+ '<PS>': grammar.Literal('\u2028'),
+}
+
+
+class ESGrammarBuilder:
+ def __init__(self, terminal_names):
+ # Names of terminals that are written as nonterminals in the grammar.
+ # For example, "BooleanLiteral" is a terminal name when parsing the
+ # syntactic grammar.
+ if terminal_names is None:
+ terminal_names = frozenset()
+ self.terminal_names = frozenset(terminal_names)
+ self.reset()
+
+ def reset(self):
+ self.lexer = None
+ # This is how full-parsing and lazy-parsing are implemented, using
+ # different traits.
+ #
+ # This field contains the Rust's trait used for calling the method.
+ # When a CallMethod is generated, it is assumed to be a function of
+ # this trait. The trait is used by the Rust backend to generate
+ # multiple backends which are implementing different set of traits.
+ # Having the trait on the function call is useful as a way to filter
+ # functions calls at code-generation time.
+ #
+ # This field is updated by the `rust_param_impl`, which is used in
+ # grammar extensions, and visited before producing any CallMethod.
+ self.method_trait = "AstBuilder"
+
+ def rust_edsl(self, impl, grammar):
+ return extension.GrammarExtension(impl, grammar, self.lexer.filename)
+
+ def rust_param_impl(self, trait, for_type, param):
+ self.method_trait = trait
+ return extension.ImplFor(param, trait, for_type)
+
+ def rust_impl(self, trait, impl_type):
+ return self.rust_param_impl(trait, impl_type, [])
+
+ def rust_nt_def(self, lhs, rhs_line):
+ # Right now, only focus on the syntactic grammar, and assume that all
+ # rules are patching existing grammar production by adding code.
+ return extension.ExtPatch(self.nt_def(None, lhs, ':', [rhs_line]))
+
+ def rust_rhs_line(self, symbols):
+ return self.rhs_line(None, symbols, None, None)
+
+ def rust_expr(self, expr):
+ assert isinstance(expr, grammar.CallMethod)
+ return expr
+
+ def empty(self):
+ return []
+
+ def single(self, x):
+ return [x]
+
+ def append(self, x, y):
+ return x + [y]
+
+ def concat(self, x, y):
+ return x + y
+
+ def blank_line(self):
+ return []
+
+ def nt_def_to_list(self, nt_def):
+ return [nt_def]
+
+ def to_production(self, lhs, i, rhs, is_sole_production):
+ """Wrap a list of grammar symbols `rhs` in a Production object."""
+ body, reducer, condition = rhs
+ if reducer is None:
+ reducer = self.default_reducer(lhs, i, body, is_sole_production)
+ return grammar.Production(body, reducer, condition=condition)
+
+ def default_reducer(self, lhs, i, body, is_sole_production):
+ assert isinstance(lhs, grammar.Nt)
+ nt_name = lhs.name
+
+ nargs = sum(1 for e in body if grammar.is_concrete_element(e))
+ if is_sole_production:
+ method_name = nt_name
+ else:
+ method_name = '{} {}'.format(nt_name, i)
+ return self.expr_call(method_name, tuple(range(nargs)), None)
+
+ def needs_asi(self, lhs, p):
+ """True if p is a production in which ASI can happen."""
+ # The purpose of the fake ForLexicalDeclaration production is to have a
+ # copy of LexicalDeclaration that does not trigger ASI.
+ #
+ # Two productions have body == [";"] -- one for EmptyStatement and one
+ # for ClassMember. Neither should trigger ASI.
+ #
+ # The only other semicolons that should not trigger ASI are the ones in
+ # `for` statement productions, which happen to be exactly those
+ # semicolons that are not at the end of a production.
+ return (not (isinstance(lhs, grammar.Nt)
+ and lhs.name == 'ForLexicalDeclaration')
+ and len(p.body) > 1
+ and p.body[-1] == ';')
+
+ def apply_asi(self, p, reducer_was_autogenerated):
+ """Return two rules based on p, so that ASI can be applied."""
+ assert isinstance(p.reducer, grammar.CallMethod)
+
+ if reducer_was_autogenerated:
+ # Don't pass the semicolon to the method.
+ reducer = self.expr_call(p.reducer.method,
+ p.reducer.args[:-1],
+ None)
+ else:
+ reducer = p.reducer
+
+ # Except for do-while loops, check at runtime that ASI occurs only at
+ # the end of a line.
+ if (len(p.body) == 7
+ and p.body[0] == 'do'
+ and p.body[2] == 'while'
+ and p.body[3] == '('
+ and p.body[5] == ')'
+ and p.body[6] == ';'):
+ code = "do_while_asi"
+ else:
+ code = "asi"
+
+ return [
+ # The preferred production, with the semicolon in.
+ p.copy_with(body=p.body[:],
+ reducer=reducer),
+ # The fallback production, performing ASI.
+ p.copy_with(body=p.body[:-1] + [grammar.ErrorSymbol(code)],
+ reducer=reducer),
+ ]
+
+ def expand_lexical_rhs(self, rhs):
+ body, reducer, condition = rhs
+ out = []
+ for e in body:
+ if isinstance(e, str):
+ # The terminal symbols of the lexical grammar are characters, so
+ # add each character of this string as a separate element.
+ out += [grammar.Literal(ch) for ch in e]
+ else:
+ out.append(e)
+ return [out, reducer, condition]
+
+ def nt_def(self, nt_type, lhs, eq, rhs_list):
+ has_sole_production = (len(rhs_list) == 1)
+ production_list = []
+ for i, rhs in enumerate(rhs_list):
+ if eq == ':':
+ # Syntactic grammar. A hack is needed for ASI.
+ reducer_was_autogenerated = rhs[1] is None
+ p = self.to_production(lhs, i, rhs, has_sole_production)
+ if self.needs_asi(lhs, p):
+ production_list += self.apply_asi(p, reducer_was_autogenerated)
+ else:
+ production_list.append(p)
+ elif eq == '::':
+ # Lexical grammar. A hack is needed to replace multicharacter
+ # terminals like `!==` into sequences of character terminals.
+ rhs = self.expand_lexical_rhs(rhs)
+ p = self.to_production(lhs, i, rhs, has_sole_production)
+ production_list.append(p)
+ return (lhs.name, eq, grammar.NtDef(lhs.args, production_list, nt_type))
+
+ def nt_def_one_of(self, nt_type, nt_lhs, eq, terminals):
+ return self.nt_def(nt_type, nt_lhs, eq, [([t], None, None) for t in terminals])
+
+ def nt_lhs_no_params(self, name):
+ return grammar.Nt(name, ())
+
+ def nt_lhs_with_params(self, name, params):
+ return grammar.Nt(name, tuple(params))
+
+ def simple_type(self, name):
+ return types.Type(name)
+
+ def lifetime_type(self, name):
+ return types.Lifetime(name)
+
+ def parameterized_type(self, name, args):
+ return types.Type(name, tuple(args))
+
+ def t_list_line(self, terminals):
+ return terminals
+
+ def terminal(self, t):
+ assert t[0] == "`"
+ assert t[-1] == "`"
+ return t[1:-1]
+
+ def terminal_chr(self, chr):
+ raise ValueError("FAILED: %r" % chr)
+
+ def rhs_line(self, ifdef, rhs, reducer, _prodid):
+ return (rhs, reducer, ifdef)
+
+ def rhs_line_prose(self, prose):
+ return ([prose], None, None)
+
+ def empty_rhs(self):
+ return []
+
+ def expr_match_ref(self, token):
+ assert token.startswith('$')
+ return int(token[1:])
+
+ def expr_call(self, method, args, fallible):
+ # NOTE: Currently "AstBuilder" functions are made fallible using the
+ # fallible_methods taken from some Rust code which extract this
+ # information to produce a JSON file.
+ if self.method_trait == "AstBuilder":
+ fallible = None
+ return grammar.CallMethod(method, args or (), types.Type(self.method_trait),
+ fallible is not None)
+
+ def expr_some(self, expr):
+ return grammar.Some(expr)
+
+ def expr_none(self):
+ return None
+
+ def ifdef(self, value, nt):
+ return nt, value
+
+ def optional(self, nt):
+ return grammar.Optional(nt)
+
+ def but_not(self, nt, exclusion):
+ _, exclusion = exclusion
+ return grammar.Exclude(nt, [exclusion])
+ # return ('-', nt, exclusion)
+
+ def but_not_one_of(self, nt, exclusion_list):
+ exclusion_list = [exclusion for _, exclusion in exclusion_list]
+ return grammar.Exclude(nt, exclusion_list)
+ # return ('-', nt, exclusion_list)
+
+ def no_line_terminator_here(self, lt):
+ if lt not in ('LineTerminator', '|LineTerminator|'):
+ raise ValueError("unrecognized directive " + repr("[no " + lt + " here]"))
+ return grammar.NoLineTerminatorHere
+
+ def nonterminal(self, name):
+ if name in self.terminal_names:
+ return name
+ return grammar.Nt(name, ())
+
+ def nonterminal_apply(self, name, args):
+ if name in self.terminal_names:
+ raise ValueError("parameters applied to terminal {!r}".format(name))
+ if len(set(k for k, expr in args)) != len(args):
+ raise ValueError("parameter passed multiple times")
+ return grammar.Nt(name, tuple(args))
+
+ def arg_expr(self, sigil, argname):
+ if sigil == '?':
+ return (argname, grammar.Var(argname))
+ else:
+ return (argname, sigil)
+
+ def sigil_false(self):
+ return False
+
+ def sigil_true(self):
+ return True
+
+ def exclusion_terminal(self, t):
+ return ("t", t)
+
+ def exclusion_nonterminal(self, nt):
+ return ("nt", nt)
+
+ def exclusion_chr_range(self, c1, c2):
+ return ("range", c1, c2)
+
+ def la_eq(self, t):
+ return grammar.LookaheadRule(OrderedFrozenSet([t]), True)
+
+ def la_ne(self, t):
+ return grammar.LookaheadRule(OrderedFrozenSet([t]), False)
+
+ def la_not_in_nonterminal(self, nt):
+ return grammar.LookaheadRule(OrderedFrozenSet([nt]), False)
+
+ def la_not_in_set(self, lookahead_exclusions):
+ if all(len(excl) == 1 for excl in lookahead_exclusions):
+ return grammar.LookaheadRule(
+ OrderedFrozenSet(excl[0] for excl in lookahead_exclusions),
+ False)
+ raise ValueError("unsupported: lookahead > 1 token, {!r}"
+ .format(lookahead_exclusions))
+
+ def chr(self, t):
+ assert t[0] == "<" or t[0] == 'U'
+ if t[0] == "<":
+ assert t[-1] == ">"
+ if t not in ECMASCRIPT_CODE_POINTS:
+ raise ValueError("unrecognized character abbreviation {!r}".format(t))
+ return ECMASCRIPT_CODE_POINTS[t]
+ else:
+ assert t[1] == "+"
+ return grammar.Literal(chr(int(t[2:], base=16)))
+
+
+def finish_grammar(nt_defs, goals, variable_terminals, synthetic_terminals,
+ single_grammar=True, extensions=[]):
+ nt_grammars = {}
+ for nt_name, eq, _ in nt_defs:
+ if nt_name in nt_grammars:
+ raise ValueError(
+ "duplicate definitions for nonterminal {!r}"
+ .format(nt_name))
+ nt_grammars[nt_name] = eq
+
+ # Figure out which grammar we were trying to get (":" for syntactic,
+ # "::" for lexical) based on the goal symbols.
+ goals = list(goals)
+ if len(goals) == 0:
+ raise ValueError("no goal nonterminals specified")
+ if single_grammar:
+ selected_grammars = set(nt_grammars[goal] for goal in goals)
+ assert len(selected_grammars) != 0
+ if len(selected_grammars) > 1:
+ raise ValueError(
+ "all goal nonterminals must be part of the same grammar; "
+ "got {!r} (matching these grammars: {!r})"
+ .format(set(goals), set(selected_grammars)))
+ [selected_grammar] = selected_grammars
+
+ terminal_set = set()
+
+ def hack_production(p):
+ for i, e in enumerate(p.body):
+ if isinstance(e, str) and e[:1] == "`":
+ if len(e) < 3 or e[-1:] != "`":
+ raise ValueError(
+ "Unrecognized grammar symbol: {!r} (in {!r})"
+ .format(e, p))
+ p[i] = token = e[1:-1]
+ terminal_set.add(token)
+
+ nonterminals = {}
+ for nt_name, eq, rhs_list_or_lambda in nt_defs:
+ if single_grammar and eq != selected_grammar:
+ continue
+
+ if isinstance(rhs_list_or_lambda, grammar.NtDef):
+ nonterminals[nt_name] = rhs_list_or_lambda
+ else:
+ rhs_list = rhs_list_or_lambda
+ for p in rhs_list:
+ if not isinstance(p, grammar.Production):
+ raise ValueError(
+ "invalid grammar: ifdef in non-function-call context")
+ hack_production(p)
+ if nt_name in nonterminals:
+ raise ValueError(
+ "unsupported: multiple definitions for nt " + nt_name)
+ nonterminals[nt_name] = rhs_list
+
+ for t in terminal_set:
+ if t in nonterminals:
+ raise ValueError(
+ "grammar contains both a terminal `{}` and nonterminal {}"
+ .format(t, t))
+
+ # Add execution modes to generate the various functions needed to handle
+ # syntax parsing and full parsing execution modes.
+ exec_modes = collections.defaultdict(OrderedSet)
+ noop_parser = types.Type("ParserTrait", (types.Lifetime("alloc"), types.UnitType))
+ token_parser = types.Type("ParserTrait", (
+ types.Lifetime("alloc"), types.Type("StackValue", (types.Lifetime("alloc"),))))
+ ast_builder = types.Type("AstBuilderDelegate", (types.Lifetime("alloc"),))
+
+ # Full parsing takes token as input and build an AST.
+ exec_modes["full_actions"].extend([token_parser, ast_builder])
+
+ # Syntax parsing takes token as input but skip building the AST.
+ # TODO: The syntax parser is commented out for now, as we need something to
+ # be produced when we cannot call the AstBuilder for producing the values.
+
+ # No-op parsing is used for the simulator, which is so far used for
+ # querying whether we can end the incremental input and lookup if a state
+ # can accept some kind of tokens.
+ exec_modes["noop_actions"].add(noop_parser)
+
+ # Extensions are using an equivalent of Rust types to define the kind of
+ # parsers to be used, this map is used to convert these type names to the
+ # various execution modes.
+ full_parser = types.Type("FullParser")
+ syntax_parser = types.Type("SyntaxParser")
+ noop_parser = types.Type("NoopParser")
+ type_to_modes = {
+ noop_parser: ["noop_actions", "full_actions"],
+ syntax_parser: ["full_actions"],
+ full_parser: ["full_actions"],
+ }
+
+ result = grammar.Grammar(
+ nonterminals,
+ goal_nts=goals,
+ variable_terminals=variable_terminals,
+ synthetic_terminals=synthetic_terminals,
+ exec_modes=exec_modes,
+ type_to_modes=type_to_modes)
+ result.patch(extensions)
+ return result
+
+
+def parse_esgrammar(
+ text: str,
+ *,
+ filename: Optional[str] = None,
+ extensions: Iterable[Tuple[os.PathLike, int, str]] = (),
+ goals: Optional[Iterable[str]] = None,
+ terminal_names: Iterable[str] = (),
+ synthetic_terminals: Optional[Dict[str, OrderedSet[str]]] = None,
+ single_grammar: bool = True
+) -> grammar.Grammar:
+ if not text.endswith("\n\n"):
+ # Horrible hack: add a blank line at the end of the document so that
+ # the esgrammar grammar can use newlines as delimiters. :-P
+ text += "\n"
+
+ terminal_names = frozenset(terminal_names)
+ if synthetic_terminals is None:
+ synthetic_terminals = {}
+
+ builder = ESGrammarBuilder(terminal_names)
+ parser = ESGrammarParser(builder=builder, goal="grammar")
+ lexer = ESGrammarLexer(parser, filename=filename)
+ lexer.write(text)
+ nt_defs = lexer.close()
+ grammar_extensions = []
+ for ext_filename, start_lineno, content in extensions:
+ builder.reset()
+ parser = ESGrammarParser(builder=builder, goal="rust_edsl")
+ lexer = ESGrammarLexer(parser, filename=ext_filename)
+ builder.lexer = lexer
+ lexer.start_lineno = start_lineno
+ lexer.write(content)
+ result = lexer.close()
+ grammar_extensions.append(result)
+
+ if goals is None:
+ # Default to the first nonterminal in the input.
+ goals = [nt_defs[0][0]]
+
+ return finish_grammar(
+ nt_defs,
+ goals=goals,
+ variable_terminals=terminal_names - frozenset(synthetic_terminals),
+ synthetic_terminals=synthetic_terminals,
+ single_grammar=single_grammar,
+ extensions=grammar_extensions)