"""Parse a grammar written in ECMArkup.""" from __future__ import annotations # mypy: no-implicit-optional import os import collections from typing import Dict, Iterable, Optional, Tuple from jsparagus import parse_pgen, gen, grammar, extension, types from jsparagus.lexer import LexicalGrammar from jsparagus.ordered import OrderedSet, OrderedFrozenSet ESGrammarLexer = LexicalGrammar( # the operators and keywords: "[ ] { } , ~ + ? ( ) @ < > ' ; " "but empty here lookahead no not of one or returns through Some None impl for let", NL="\n", # any number of colons together EQ=r':+', # terminals of the ES grammar, quoted with backticks T=r'`[^` \n]+`|```', # also terminals, denoting control characters CHR=r'<[A-Z ]+>|U\+[0-9A-f]{4}', # nonterminals/types that will be followed by parameters NTCALL=r'[A-Za-z]\w*(?=[\[<])', # nonterminals (also, boolean parameters and type names) NT=r'[A-Za-z]\w*', # nonterminals wrapped in vertical bars for no apparent reason NTALT=r'\|[A-Z]\w+\|', # the spec also gives a few productions names PRODID=r'#[A-Za-z]\w*', # prose not wrapped in square brackets # To avoid conflict with the `>` token, this is recognized only after a space. PROSE=r'(?<= )>[^\n]*', # prose wrapped in square brackets WPROSE=r'\[>[^]]*\]', # expression denoting a matched terminal or nonterminal MATCH_REF=r'\$(?:0|[1-9][0-9]*)', # the spec also gives a few productions names RUSTCOMMENT=r'//.*\n', ) ESGrammarParser = gen.compile( parse_pgen.load_grammar( os.path.join(os.path.dirname(__file__), "esgrammar.pgen"))) SIGIL_FALSE = '~' SIGIL_TRUE = '+' # Abbreviations for single-character terminals, used in the lexical grammar. ECMASCRIPT_CODE_POINTS = { # From '': grammar.Literal('\u200c'), '': grammar.Literal('\u200d'), '': grammar.Literal('\ufeff'), # From '': grammar.Literal('\t'), '': grammar.Literal('\u000b'), '': grammar.Literal('\u000c'), '': grammar.Literal(' '), '': grammar.Literal('\u00a0'), # already defined above '': grammar.UnicodeCategory('Zs'), # From '': grammar.Literal('\u000a'), '': grammar.Literal('\u000d'), '': grammar.Literal('\u2028'), '': grammar.Literal('\u2028'), } class ESGrammarBuilder: def __init__(self, terminal_names): # Names of terminals that are written as nonterminals in the grammar. # For example, "BooleanLiteral" is a terminal name when parsing the # syntactic grammar. if terminal_names is None: terminal_names = frozenset() self.terminal_names = frozenset(terminal_names) self.reset() def reset(self): self.lexer = None # This is how full-parsing and lazy-parsing are implemented, using # different traits. # # This field contains the Rust's trait used for calling the method. # When a CallMethod is generated, it is assumed to be a function of # this trait. The trait is used by the Rust backend to generate # multiple backends which are implementing different set of traits. # Having the trait on the function call is useful as a way to filter # functions calls at code-generation time. # # This field is updated by the `rust_param_impl`, which is used in # grammar extensions, and visited before producing any CallMethod. 
class ESGrammarBuilder:
    def __init__(self, terminal_names):
        # Names of terminals that are written as nonterminals in the grammar.
        # For example, "BooleanLiteral" is a terminal name when parsing the
        # syntactic grammar.
        if terminal_names is None:
            terminal_names = frozenset()
        self.terminal_names = frozenset(terminal_names)
        self.reset()

    def reset(self):
        self.lexer = None
        # This is how full-parsing and lazy-parsing are implemented, using
        # different traits.
        #
        # This field contains the Rust trait used for calling the method.
        # When a CallMethod is generated, it is assumed to be a function of
        # this trait. The trait is used by the Rust backend to generate
        # multiple backends implementing different sets of traits. Having
        # the trait on the function call is useful as a way to filter
        # function calls at code-generation time.
        #
        # This field is updated by `rust_param_impl`, which is used in
        # grammar extensions, and is visited before producing any CallMethod.
        self.method_trait = "AstBuilder"

    def rust_edsl(self, impl, grammar):
        return extension.GrammarExtension(impl, grammar, self.lexer.filename)

    def rust_param_impl(self, trait, for_type, param):
        self.method_trait = trait
        return extension.ImplFor(param, trait, for_type)

    def rust_impl(self, trait, impl_type):
        return self.rust_param_impl(trait, impl_type, [])

    def rust_nt_def(self, lhs, rhs_line):
        # Right now, focus only on the syntactic grammar, and assume that all
        # rules patch existing grammar productions by adding code.
        return extension.ExtPatch(self.nt_def(None, lhs, ':', [rhs_line]))

    def rust_rhs_line(self, symbols):
        return self.rhs_line(None, symbols, None, None)

    def rust_expr(self, expr):
        assert isinstance(expr, grammar.CallMethod)
        return expr

    def empty(self):
        return []

    def single(self, x):
        return [x]

    def append(self, x, y):
        return x + [y]

    def concat(self, x, y):
        return x + y

    def blank_line(self):
        return []

    def nt_def_to_list(self, nt_def):
        return [nt_def]

    def to_production(self, lhs, i, rhs, is_sole_production):
        """Wrap a list of grammar symbols `rhs` in a Production object."""
        body, reducer, condition = rhs
        if reducer is None:
            reducer = self.default_reducer(lhs, i, body, is_sole_production)
        return grammar.Production(body, reducer, condition=condition)

    def default_reducer(self, lhs, i, body, is_sole_production):
        assert isinstance(lhs, grammar.Nt)
        nt_name = lhs.name
        nargs = sum(1 for e in body if grammar.is_concrete_element(e))
        if is_sole_production:
            method_name = nt_name
        else:
            method_name = '{} {}'.format(nt_name, i)
        return self.expr_call(method_name, tuple(range(nargs)), None)

    def needs_asi(self, lhs, p):
        """True if `p` is a production in which ASI can happen."""
        # The purpose of the fake ForLexicalDeclaration production is to have
        # a copy of LexicalDeclaration that does not trigger ASI.
        #
        # Two productions have body == [";"] -- one for EmptyStatement and
        # one for ClassMember. Neither should trigger ASI.
        #
        # The only other semicolons that should not trigger ASI are the ones
        # in `for` statement productions, which happen to be exactly those
        # semicolons that are not at the end of a production.
        return (not (isinstance(lhs, grammar.Nt)
                     and lhs.name == 'ForLexicalDeclaration')
                and len(p.body) > 1
                and p.body[-1] == ';')

    def apply_asi(self, p, reducer_was_autogenerated):
        """Return two rules based on `p`, so that ASI can be applied."""
        assert isinstance(p.reducer, grammar.CallMethod)

        if reducer_was_autogenerated:
            # Don't pass the semicolon to the method.
            reducer = self.expr_call(p.reducer.method,
                                     p.reducer.args[:-1],
                                     None)
        else:
            reducer = p.reducer

        # Except for do-while loops, check at runtime that ASI occurs only at
        # the end of a line.
        if (len(p.body) == 7
                and p.body[0] == 'do'
                and p.body[2] == 'while'
                and p.body[3] == '('
                and p.body[5] == ')'
                and p.body[6] == ';'):
            code = "do_while_asi"
        else:
            code = "asi"

        return [
            # The preferred production, with the semicolon in.
            p.copy_with(body=p.body[:], reducer=reducer),
            # The fallback production, performing ASI.
            p.copy_with(body=p.body[:-1] + [grammar.ErrorSymbol(code)],
                        reducer=reducer),
        ]
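
    # Illustrative example, inferred from the code above rather than taken
    # from the original source: given a production like
    #
    #     ExpressionStatement : Expression `;`
    #
    # apply_asi() yields two productions: the preferred one, which actually
    # matches the `;`, and a fallback whose body ends with ErrorSymbol("asi"),
    # letting the parser recover by inserting a virtual semicolon at a line
    # break. Do-while statements get ErrorSymbol("do_while_asi") instead,
    # because there ASI is legal even without a line break.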
    def expand_lexical_rhs(self, rhs):
        body, reducer, condition = rhs
        out = []
        for e in body:
            if isinstance(e, str):
                # The terminal symbols of the lexical grammar are characters,
                # so add each character of this string as a separate element.
                out += [grammar.Literal(ch) for ch in e]
            else:
                out.append(e)
        return [out, reducer, condition]

    def nt_def(self, nt_type, lhs, eq, rhs_list):
        has_sole_production = (len(rhs_list) == 1)
        production_list = []
        for i, rhs in enumerate(rhs_list):
            if eq == ':':
                # Syntactic grammar. A hack is needed for ASI.
                reducer_was_autogenerated = rhs[1] is None
                p = self.to_production(lhs, i, rhs, has_sole_production)
                if self.needs_asi(lhs, p):
                    production_list += self.apply_asi(
                        p, reducer_was_autogenerated)
                else:
                    production_list.append(p)
            elif eq == '::':
                # Lexical grammar. A hack is needed to expand multicharacter
                # terminals like `!==` into sequences of character terminals.
                rhs = self.expand_lexical_rhs(rhs)
                p = self.to_production(lhs, i, rhs, has_sole_production)
                production_list.append(p)
        return (lhs.name, eq,
                grammar.NtDef(lhs.args, production_list, nt_type))

    def nt_def_one_of(self, nt_type, nt_lhs, eq, terminals):
        return self.nt_def(nt_type, nt_lhs, eq,
                           [([t], None, None) for t in terminals])

    def nt_lhs_no_params(self, name):
        return grammar.Nt(name, ())

    def nt_lhs_with_params(self, name, params):
        return grammar.Nt(name, tuple(params))

    def simple_type(self, name):
        return types.Type(name)

    def lifetime_type(self, name):
        return types.Lifetime(name)

    def parameterized_type(self, name, args):
        return types.Type(name, tuple(args))

    def t_list_line(self, terminals):
        return terminals

    def terminal(self, t):
        assert t[0] == "`"
        assert t[-1] == "`"
        return t[1:-1]

    def terminal_chr(self, chr):
        raise ValueError("FAILED: %r" % chr)

    def rhs_line(self, ifdef, rhs, reducer, _prodid):
        return (rhs, reducer, ifdef)

    def rhs_line_prose(self, prose):
        return ([prose], None, None)

    def empty_rhs(self):
        return []

    def expr_match_ref(self, token):
        assert token.startswith('$')
        return int(token[1:])
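
    # Illustrative example (an assumption about the esgrammar reducer syntax,
    # not from the original source): in a reducer expression, `$0` and `$1`
    # refer to matched symbols, so a reducer written as
    #
    #     method($0, $1)
    #
    # would reach expr_call below as method="method", args=(0, 1).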
    def expr_call(self, method, args, fallible):
        # NOTE: Currently, "AstBuilder" functions are made fallible using the
        # fallible_methods list, which is extracted from the Rust AstBuilder
        # code into a JSON file.
        if self.method_trait == "AstBuilder":
            fallible = None
        return grammar.CallMethod(method, args or (),
                                  types.Type(self.method_trait),
                                  fallible is not None)

    def expr_some(self, expr):
        return grammar.Some(expr)

    def expr_none(self):
        return None

    def ifdef(self, value, nt):
        return nt, value

    def optional(self, nt):
        return grammar.Optional(nt)

    def but_not(self, nt, exclusion):
        _, exclusion = exclusion
        return grammar.Exclude(nt, [exclusion])
        # return ('-', nt, exclusion)

    def but_not_one_of(self, nt, exclusion_list):
        exclusion_list = [exclusion for _, exclusion in exclusion_list]
        return grammar.Exclude(nt, exclusion_list)
        # return ('-', nt, exclusion_list)

    def no_line_terminator_here(self, lt):
        if lt not in ('LineTerminator', '|LineTerminator|'):
            raise ValueError("unrecognized directive "
                             + repr("[no " + lt + " here]"))
        return grammar.NoLineTerminatorHere

    def nonterminal(self, name):
        if name in self.terminal_names:
            return name
        return grammar.Nt(name, ())

    def nonterminal_apply(self, name, args):
        if name in self.terminal_names:
            raise ValueError(
                "parameters applied to terminal {!r}".format(name))
        if len(set(k for k, expr in args)) != len(args):
            raise ValueError("parameter passed multiple times")
        return grammar.Nt(name, tuple(args))

    def arg_expr(self, sigil, argname):
        if sigil == '?':
            return (argname, grammar.Var(argname))
        else:
            return (argname, sigil)

    def sigil_false(self):
        return False

    def sigil_true(self):
        return True

    def exclusion_terminal(self, t):
        return ("t", t)

    def exclusion_nonterminal(self, nt):
        return ("nt", nt)

    def exclusion_chr_range(self, c1, c2):
        return ("range", c1, c2)

    def la_eq(self, t):
        return grammar.LookaheadRule(OrderedFrozenSet([t]), True)

    def la_ne(self, t):
        return grammar.LookaheadRule(OrderedFrozenSet([t]), False)

    def la_not_in_nonterminal(self, nt):
        return grammar.LookaheadRule(OrderedFrozenSet([nt]), False)

    def la_not_in_set(self, lookahead_exclusions):
        if all(len(excl) == 1 for excl in lookahead_exclusions):
            return grammar.LookaheadRule(
                OrderedFrozenSet(excl[0] for excl in lookahead_exclusions),
                False)
        raise ValueError("unsupported: lookahead > 1 token, {!r}"
                         .format(lookahead_exclusions))

    def chr(self, t):
        assert t[0] == "<" or t[0] == 'U'
        if t[0] == "<":
            assert t[-1] == ">"
            if t not in ECMASCRIPT_CODE_POINTS:
                raise ValueError(
                    "unrecognized character abbreviation {!r}".format(t))
            return ECMASCRIPT_CODE_POINTS[t]
        else:
            assert t[1] == "+"
            return grammar.Literal(chr(int(t[2:], base=16)))
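
# Sketch of the data flow, added for orientation (inferred from nt_def above
# and the unpacking in finish_grammar below): each nonterminal definition the
# builder produces is a tuple
#
#     (name, eq, grammar.NtDef(args, production_list, nt_type))
#
# where `eq` is ":" for the syntactic grammar and "::" for the lexical
# grammar. finish_grammar() consumes a list of these tuples.
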
def finish_grammar(nt_defs, goals, variable_terminals, synthetic_terminals,
                   single_grammar=True, extensions=[]):
    nt_grammars = {}
    for nt_name, eq, _ in nt_defs:
        if nt_name in nt_grammars:
            raise ValueError(
                "duplicate definitions for nonterminal {!r}"
                .format(nt_name))
        nt_grammars[nt_name] = eq

    # Figure out which grammar we were trying to get (":" for syntactic,
    # "::" for lexical) based on the goal symbols.
    goals = list(goals)
    if len(goals) == 0:
        raise ValueError("no goal nonterminals specified")
    if single_grammar:
        selected_grammars = set(nt_grammars[goal] for goal in goals)
        assert len(selected_grammars) != 0
        if len(selected_grammars) > 1:
            raise ValueError(
                "all goal nonterminals must be part of the same grammar; "
                "got {!r} (matching these grammars: {!r})"
                .format(set(goals), set(selected_grammars)))
        [selected_grammar] = selected_grammars

    terminal_set = set()

    def hack_production(p):
        for i, e in enumerate(p.body):
            if isinstance(e, str) and e[:1] == "`":
                if len(e) < 3 or e[-1:] != "`":
                    raise ValueError(
                        "Unrecognized grammar symbol: {!r} (in {!r})"
                        .format(e, p))
                p.body[i] = token = e[1:-1]
                terminal_set.add(token)

    nonterminals = {}
    for nt_name, eq, rhs_list_or_lambda in nt_defs:
        if single_grammar and eq != selected_grammar:
            continue

        if isinstance(rhs_list_or_lambda, grammar.NtDef):
            nonterminals[nt_name] = rhs_list_or_lambda
        else:
            rhs_list = rhs_list_or_lambda
            for p in rhs_list:
                if not isinstance(p, grammar.Production):
                    raise ValueError(
                        "invalid grammar: ifdef in non-function-call context")
                hack_production(p)
            if nt_name in nonterminals:
                raise ValueError(
                    "unsupported: multiple definitions for nt " + nt_name)
            nonterminals[nt_name] = rhs_list

    for t in terminal_set:
        if t in nonterminals:
            raise ValueError(
                "grammar contains both a terminal `{}` and a nonterminal {}"
                .format(t, t))

    # Add execution modes to generate the various functions needed to handle
    # the syntax-parsing and full-parsing execution modes.
    exec_modes = collections.defaultdict(OrderedSet)
    noop_parser = types.Type("ParserTrait",
                             (types.Lifetime("alloc"), types.UnitType))
    token_parser = types.Type("ParserTrait", (
        types.Lifetime("alloc"),
        types.Type("StackValue", (types.Lifetime("alloc"),))))
    ast_builder = types.Type("AstBuilderDelegate",
                             (types.Lifetime("alloc"),))

    # Full parsing takes tokens as input and builds an AST.
    exec_modes["full_actions"].extend([token_parser, ast_builder])
    # Syntax parsing takes tokens as input but skips building the AST.
    # TODO: The syntax parser is commented out for now, as we need something
    # to be produced when we cannot call the AstBuilder to produce the
    # values.

    # No-op parsing is used for the simulator, which is so far used for
    # querying whether we can end the incremental input, and for looking up
    # whether a state can accept some kind of token.
    exec_modes["noop_actions"].add(noop_parser)

    # Extensions use an equivalent of Rust types to define the kind of
    # parsers to be used; this map converts those type names to the
    # corresponding execution modes.
    full_parser = types.Type("FullParser")
    syntax_parser = types.Type("SyntaxParser")
    noop_parser = types.Type("NoopParser")
    type_to_modes = {
        noop_parser: ["noop_actions", "full_actions"],
        syntax_parser: ["full_actions"],
        full_parser: ["full_actions"],
    }

    result = grammar.Grammar(
        nonterminals,
        goal_nts=goals,
        variable_terminals=variable_terminals,
        synthetic_terminals=synthetic_terminals,
        exec_modes=exec_modes,
        type_to_modes=type_to_modes)
    result.patch(extensions)
    return result
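
# Note, added for orientation and inferred from the extension loop below: each
# entry of `extensions` passed to parse_esgrammar() is a
# (filename, start_lineno, content) triple, where `content` is Rust-flavored
# EDSL text parsed with the "rust_edsl" goal instead of the default "grammar"
# goal.
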
def parse_esgrammar(
        text: str,
        *,
        filename: Optional[str] = None,
        extensions: Iterable[Tuple[os.PathLike, int, str]] = (),
        goals: Optional[Iterable[str]] = None,
        terminal_names: Iterable[str] = (),
        synthetic_terminals: Optional[Dict[str, OrderedSet[str]]] = None,
        single_grammar: bool = True
) -> grammar.Grammar:
    if not text.endswith("\n\n"):
        # Horrible hack: add a blank line at the end of the document so that
        # the esgrammar grammar can use newlines as delimiters. :-P
        text += "\n"

    terminal_names = frozenset(terminal_names)
    if synthetic_terminals is None:
        synthetic_terminals = {}

    builder = ESGrammarBuilder(terminal_names)
    parser = ESGrammarParser(builder=builder, goal="grammar")
    lexer = ESGrammarLexer(parser, filename=filename)
    lexer.write(text)
    nt_defs = lexer.close()

    grammar_extensions = []
    for ext_filename, start_lineno, content in extensions:
        builder.reset()
        parser = ESGrammarParser(builder=builder, goal="rust_edsl")
        lexer = ESGrammarLexer(parser, filename=ext_filename)
        builder.lexer = lexer
        lexer.start_lineno = start_lineno
        lexer.write(content)
        result = lexer.close()
        grammar_extensions.append(result)

    if goals is None:
        # Default to the first nonterminal in the input.
        goals = [nt_defs[0][0]]

    return finish_grammar(
        nt_defs,
        goals=goals,
        variable_terminals=terminal_names - frozenset(synthetic_terminals),
        synthetic_terminals=synthetic_terminals,
        single_grammar=single_grammar,
        extensions=grammar_extensions)
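
def _example_parse_usage():
    # Hedged usage sketch, not part of the original module. The grammar text
    # and goal name here are hypothetical, and this assumes the bundled
    # esgrammar.pgen accepts a plain lexical rule of this shape.
    text = ("BooleanLiteral ::\n"
            "  `true`\n"
            "  `false`\n")
    # `goals` selects the lexical grammar ("::") via this nonterminal.
    return parse_esgrammar(text, goals=["BooleanLiteral"])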