diff options
Diffstat (limited to 'third_party/rust/jsparagus/jsparagus/grammar.py')
-rw-r--r-- | third_party/rust/jsparagus/jsparagus/grammar.py | 1248 |
1 files changed, 1248 insertions, 0 deletions
""" Data structures for representing grammars. """

from __future__ import annotations
# mypy: disallow-untyped-defs, disallow-incomplete-defs, disallow-untyped-calls

import copy
import typing
import dataclasses
from dataclasses import dataclass
from .ordered import OrderedSet, OrderedFrozenSet
from . import types


# *** What is a grammar? ******************************************************
#
# A grammar is a dictionary mapping nonterminal names to lists of right-hand
# sides. Each right-hand side (also called a "production") is a list whose
# elements can include terminals, nonterminals, Optional elements, LookaheadRules,
# and NoLineTerminatorHere.
#
# The most common elements are terminals and nonterminals, so a grammar usually
# looks something like this:
def example_grammar() -> Grammar:
    """Return a tiny arithmetic-expression Grammar.

    Used in documentation and tests as the canonical example of the
    "list-of-lists-of-strings" input format that Grammar.__init__ accepts.
    """
    rules: typing.Dict[typing.Union[str, InitNt, Nt], LenientNtDef] = {
        'expr': [
            ['term'],
            ['expr', '+', 'term'],
            ['expr', '-', 'term'],
        ],
        'term': [
            ['unary'],
            ['term', '*', 'unary'],
            ['term', '/', 'unary'],
        ],
        'unary': [
            ['prim'],
            ['-', 'unary'],
        ],
        'prim': [
            ['NUM'],
            ['VAR'],
            ['(', 'expr', ')'],
        ],
    }

    # The goal nonterminals are the nonterminals we're actually interested in
    # parsing. Here we want to parse expressions; all the other nonterminals
    # are interesting only as the building blocks of expressions.
    #
    # Variable terminals are terminal symbols that can have several different
    # values, like a VAR token that could be any identifier, or a NUM token
    # that could be any number.
    return Grammar(rules, goal_nts=['expr'], variable_terminals=['NUM', 'VAR'])


# A condition on a production: (parameter_name, required_boolean_value).
Condition = typing.Tuple[str, bool]


# A production consists of a left side, an optional condition, a right side,
# and a reducer. A `Production` object includes everything except the left
# side. Incorporating reducers lets us transform a grammar while preserving
# behavior.
#
# The production `expr ::= term` is represented by
# `Production(["term"], 0)`.
#
# The production `expr ::= expr + term => add($0, $2)` is represented by
# `Production(["expr", "+", "term"], CallMethod("add", (0, 2))`.
#
@dataclass
class Production:
    __slots__ = ['body', 'reducer', 'condition']

    # The sequence of grammar symbols (terminals, nonterminals, lookahead
    # rules, ...) on the right-hand side of the production.
    body: typing.List[Element]
    # What to do when this production is matched; see "Reduce expressions".
    reducer: ReduceExprOrAccept
    # If present, the production only applies when the named nonterminal
    # parameter has the given boolean value.
    condition: typing.Optional[Condition]

    def __init__(self,
                 body: typing.List[Element],
                 reducer: ReduceExprOrAccept,
                 *,
                 condition: typing.Optional[Condition] = None):
        self.body = body
        self.reducer = reducer
        self.condition = condition

    def __repr__(self) -> str:
        if self.condition is None:
            return "Production({!r}, reducer={!r})".format(self.body, self.reducer)
        else:
            return ("Production({!r}, reducer={!r}, condition={!r})"
                    .format(self.body, self.reducer, self.condition))

    def copy_with(self, **kwargs: typing.Any) -> Production:
        """Return a copy of this Production with the given fields replaced."""
        return dataclasses.replace(self, **kwargs)


# *** Reduce expressions ******************************************************
#
# Reduce expressions ("reducers" for short) are a little language used to
# specify what happens when a production is matched. A reduce expression is
# one of these:
#
# * An integer evaluates to one of the syntactic components of the
#   production. For example, if the production we're reducing is
#   `sum ::= sum + term`, the integer `0` evaluates the `sum` to the left of
#   the plus sign, and `2` means the `term` on the right. (The integer
#   `1` would refer to the `+` token itself, but that's not super useful.)
#
#   These integers are not quite indexes into `production.body`, because
#   LookaheadRule, ErrorSymbol, and NoLineTerminatorHere elements don't
#   count: in the production `stmt ::= [lookahead != "let"] expr ";"`, `0` is
#   the expr, and `1` is the semicolon token. See `is_concrete_element(e)`.
#
# * CallMethod objects pass values to a builder method and return the result.
#   The `args` are nested reduce expressions.
#
# * None is an expression used as a placeholder when an optional symbol is
#   omitted.
#
# * Some(expr) is used when an optional symbol is found and parsed.
#   In Python, this just expands to the same thing as `expr`, but in Rust
#   this expands to a use of `Option::Some()`.
#
# In addition, the special reducer 'accept' means stop parsing. This is
# used only in productions for init nonterminals, created automatically by
# Grammar.__init__(). It's not a reduce expression, so it can't be nested.


@dataclass(frozen=True)
class CallMethod:
    """Express a method call, and give it a given set of arguments. A trait is
    added as the parser should implement this trait to call this method."""

    # Name of the builder method to call.
    method: str
    # Nested reduce expressions producing the arguments.
    args: typing.Tuple[ReduceExpr, ...]
    # Trait the parser must implement to provide `method`.
    trait: types.Type = types.Type("AstBuilder")
    # If True, the call may fail (rendered with a trailing `?` in Rust).
    fallible: bool = False


@dataclass(frozen=True)
class Some:
    """Wrapper marking an optional symbol that was present (Rust Option::Some)."""
    inner: ReduceExpr


def expr_to_str(expr: ReduceExprOrAccept) -> str:
    """Render a reduce expression as a human-readable string (used in dumps).

    Raises ValueError for anything that is not a reduce expression or 'accept'.
    """
    if isinstance(expr, int):
        return "${}".format(expr)
    elif isinstance(expr, CallMethod):
        # FIX: replaced the `cond and a or b` idiom with a conditional
        # expression; the old form is fragile (breaks if the "true" value is
        # falsy) and non-idiomatic. Behavior is unchanged here.
        return "{}::{}({}){}".format(
            expr.trait, expr.method,
            ', '.join(expr_to_str(arg) for arg in expr.args),
            '?' if expr.fallible else '')
    elif expr is None:
        return "None"
    elif isinstance(expr, Some):
        return "Some({})".format(expr_to_str(expr.inner))
    elif expr == "accept":
        return "<accept>"
    else:
        raise ValueError("unrecognized expression: {!r}".format(expr))


# Type parameter for Grammar.intern().
Internable = typing.TypeVar("Internable")

# Maps a synthetic terminal's name to the set of concrete terminals it stands for.
SyntheticTerminalsDict = typing.Dict[str, OrderedFrozenSet[str]]


class Grammar:
    """A collection of productions.

    * self.variable_terminals - OrderedFrozenSet[str] - Terminals that carry
      data, like (in JS) numeric literals and RegExps.

    * self.terminals - OrderedFrozenSet[str] - All terminals used in the
      language, including those in self.variable_terminals and
      self.synthetic_terminals.

    * self.synthetic_terminals - {str: OrderedFrozenSet[str]} - Maps terminals
      to sets of terminals. An entry `name: set` in this dictionary means
      that `name` is shorthand for "one of the terminals in `set`".

    * self.nonterminals - {LenientNt: NtDef} - Keys are either (str|InitNt), early
      in the pipeline, or Nt objects later on. Values are the NtDef objects
      that contain the actual Productions.

    * self.methods - {str: MethodType} - Type information for methods.

    * self.init_nts - [InitNt or Nt] - The list of all elements of
      self.nonterminals.keys() that are init nonterminals.

    * self.exec_modes - DefaultDict{str, OrderedSet[Type]} or None - ?

    * self.type_to_modes - {Type: [str]} or None - ?

    * self._cache - {Any: Any} - Cache of immutable objects used by
      Grammar.intern().
    """

    variable_terminals: OrderedFrozenSet[str]
    terminals: OrderedFrozenSet[typing.Union[str, End]]
    synthetic_terminals: SyntheticTerminalsDict
    nonterminals: typing.Dict[LenientNt, NtDef]
    methods: typing.Dict[str, types.MethodType]
    init_nts: typing.List[typing.Union[Nt, InitNt]]
    exec_modes: typing.Optional[typing.DefaultDict[str, OrderedSet[types.Type]]]
    type_to_modes: typing.Optional[typing.Mapping[types.Type, typing.List[str]]]
    _cache: typing.Dict[typing.Any, typing.Any]

    def __init__(
            self,
            nonterminals: typing.Mapping[LenientNt, LenientNtDef],
            *,
            goal_nts: typing.Optional[typing.Iterable[LenientNt]] = None,
            variable_terminals: typing.Iterable[str] = (),
            synthetic_terminals: typing.Optional[SyntheticTerminalsDict] = None,
            method_types: typing.Optional[typing.Dict[str, types.MethodType]] = None,
            exec_modes: typing.Optional[typing.DefaultDict[str, OrderedSet[types.Type]]] = None,
            type_to_modes: typing.Optional[typing.Mapping[types.Type, typing.List[str]]] = None):

        # This constructor supports passing in a sort of jumbled blob of
        # strings, lists, and actual objects, and normalizes it all to a more
        # typeful structure. Being able to interpret simple
        # "list-of-lists-of-strings" input is super useful for tests.
        #
        # We don't check here that the grammar is LR, that it's cycle-free, or
        # any other nice properties.

        # Copy/infer the arguments.
        my_nonterminals: typing.Dict[LenientNt, LenientNtDef] = dict(nonterminals.items())
        if goal_nts is None:
            # Default to the first nonterminal in the dictionary.
            my_goal_nts = []
            for name in my_nonterminals:
                my_goal_nts.append(name)
                break
        else:
            my_goal_nts = list(goal_nts)
        self.variable_terminals = OrderedFrozenSet(variable_terminals)
        if synthetic_terminals is None:
            synthetic_terminals = {}
        else:
            synthetic_terminals = {
                name: OrderedFrozenSet(set)
                for name, set in synthetic_terminals.items()
            }
        for synthetic_key, values in synthetic_terminals.items():
            if synthetic_key in my_nonterminals:
                raise ValueError(
                    "invalid grammar: {!r} is both a terminal and a nonterminal"
                    .format(synthetic_key))
            for t in values:
                if t in my_nonterminals:
                    raise ValueError(
                        "invalid grammar: {!r} is both ".format(t)
                        + "a representation of a synthetic terminal and a nonterminal")
                if t in synthetic_terminals:
                    # Nested synthetic terminals. Throw, for now. (This
                    # should pretty much just work, except expand_terminal
                    # doesn't support it; and we don't check for cycles
                    # where a synthetic terminal includes itself directly
                    # or indirectly.)
                    raise ValueError(
                        "unsupported: synthetic terminals can't include other "
                        "synthetic terminals; {!r} includes {!r}"
                        .format(synthetic_key, t))
        # NOTE(review): the validated dictionary computed above is deliberately(?)
        # discarded here; `self.synthetic_terminals` always ends up empty even
        # when `synthetic_terminals` was passed in. The local dict is still used
        # below to build per-synthetic-terminal NtDefs. Confirm this is intended.
        # self.synthetic_terminals = synthetic_terminals
        self.synthetic_terminals = {}

        # NOTE(review): next(iter(...)) raises StopIteration if the
        # nonterminals mapping is empty — an empty grammar is not supported.
        keys_are_nt = isinstance(next(iter(my_nonterminals)), Nt)
        key_type: typing.Union[typing.Type, typing.Tuple[typing.Type, ...]]
        key_type = Nt if keys_are_nt else (str, InitNt)

        self._cache = {}

        # Gather some information just by looking at keys (without examining
        # every production).
        #
        # str_to_nt maps the name of each non-parameterized
        # nonterminal to `Nt(name)`, a cache.
        str_to_nt: typing.Dict[typing.Union[str, InitNt], Nt] = {}
        # nt_params lists the names of each nonterminal's parameters (empty
        # tuple for non-parameterized nts).
        nt_params: typing.Dict[typing.Union[str, InitNt], typing.Tuple[str, ...]] = {}
        for key in my_nonterminals:
            if not isinstance(key, key_type):
                raise ValueError(
                    "invalid grammar: conflicting key types in nonterminals dict - "
                    "expected either all str or all Nt, got {!r}"
                    .format(key.__class__.__name__))
            nt_name: typing.Union[str, InitNt]
            param_names: typing.Tuple[str, ...]
            if keys_are_nt:
                assert isinstance(key, Nt)
                nt_name = key.name
                param_names = tuple(name for name, value in key.args)
            else:
                assert isinstance(key, (str, InitNt))
                nt_name = key
                param_names = ()
            my_nt = my_nonterminals[key]
            if isinstance(my_nt, NtDef):
                # An explicit NtDef declares the parameter list authoritatively.
                param_names = tuple(my_nt.params)
            if nt_name not in nt_params:
                nt_params[nt_name] = param_names
            else:
                if nt_params[nt_name] != param_names:
                    raise ValueError(
                        "conflicting parameter name lists for nt {!r}: "
                        "both {!r} and {!r}"
                        .format(nt_name, nt_params[nt_name], param_names))
            if param_names == () and nt_name not in str_to_nt:
                str_to_nt[nt_name] = self.intern(Nt(nt_name))

        # Validate, desugar, and copy the grammar. As a side effect, calling
        # validate_element on every element of the grammar populates
        # all_terminals.
        all_terminals: OrderedSet[typing.Union[str, End]] = OrderedSet(self.variable_terminals)
        all_terminals.add(End())

        def note_terminal(t: str) -> None:
            """Add t (and all representations of it, if synthetic) to all_terminals."""
            if t not in all_terminals:
                all_terminals.add(t)
                if t in self.synthetic_terminals:
                    for k in self.synthetic_terminals[t]:
                        note_terminal(k)

        # Note: i and j are normally list indexes, but they are sometimes the
        # special string '?'. It's OK because they are used only in error
        # messages.
        def validate_element(
                nt: LenientNt,
                i: typing.Union[int, str],
                j: typing.Union[int, str],
                e: Element,
                context_params: typing.Tuple[str, ...]
        ) -> Element:
            """Check one element of a production body and return its canonical,
            interned form (strings naming nonterminals become Nt objects)."""
            if isinstance(e, str):
                if e in nt_params:
                    if nt_params[e] != ():
                        raise ValueError(
                            "invalid grammar: missing parameters for {!r} "
                            "in production `grammar[{!r}][{}][{}].inner`: {!r}"
                            .format(e, nt, i, j, nt_params[e]))
                    return str_to_nt[e]
                else:
                    note_terminal(e)
                    return e
            elif isinstance(e, Optional):
                if not isinstance(e.inner, (str, Nt)):
                    raise TypeError(
                        "invalid grammar: unrecognized element "
                        "in production `grammar[{!r}][{}][{}].inner`: {!r}"
                        .format(nt, i, j, e.inner))
                inner = validate_element(nt, i, j, e.inner, context_params)
                return self.intern(Optional(inner))
            elif isinstance(e, Literal):
                if not isinstance(e.text, str):
                    raise TypeError(
                        "invalid grammar: unrecognized element "
                        "in production `grammar[{!r}][{}][{}].text`: {!r}"
                        .format(nt, i, j, e.text))
                return self.intern(e)
            elif isinstance(e, UnicodeCategory):
                if not isinstance(e.cat_prefix, str):
                    raise TypeError(
                        "invalid grammar: unrecognized element "
                        "in production `grammar[{!r}][{}][{}].cat_prefix`: {!r}"
                        .format(nt, i, j, e.cat_prefix))
                return self.intern(e)
            elif isinstance(e, Exclude):
                if not isinstance(e.inner, (str, Nt)):
                    raise TypeError(
                        "invalid grammar: unrecognized element "
                        "in production `grammar[{!r}][{}][{}].inner`: {!r}"
                        .format(nt, i, j, e.inner))
                exclusion_list = []
                for value in e.exclusion_list:
                    if not isinstance(value, (str, Nt)):
                        raise TypeError(
                            "invalid grammar: unrecognized element "
                            "in production `grammar[{!r}][{}][{}].exclusion_list`: {!r}"
                            .format(nt, i, j, value))
                    value = validate_element(nt, i, j, value, context_params)
                    exclusion_list.append(value)
                inner = validate_element(nt, i, j, e.inner, context_params)
                return self.intern(Exclude(inner, tuple(exclusion_list)))
            elif isinstance(e, Nt):
                # Either the application or the original parameterized
                # production must be present in the dictionary.
                if e not in my_nonterminals and e.name not in my_nonterminals:
                    raise ValueError(
                        "invalid grammar: unrecognized nonterminal "
                        "in production `grammar[{!r}][{}][{}]`: {!r}"
                        .format(nt, i, j, e.name))
                args = tuple(pair[0] for pair in e.args)
                if e.name in nt_params and args != nt_params[e.name]:
                    raise ValueError(
                        "invalid grammar: wrong arguments passed to {!r} "
                        "in production `grammar[{!r}][{}][{}]`: "
                        "passed {!r}, expected {!r}"
                        .format(e.name, nt, i, j,
                                args, nt_params[e.name]))
                for param_name, arg_expr in e.args:
                    if isinstance(arg_expr, Var):
                        if arg_expr.name not in context_params:
                            raise ValueError(
                                "invalid grammar: undefined variable {!r} "
                                "in production `grammar[{!r}][{}][{}]`"
                                .format(arg_expr.name, nt, i, j))
                return self.intern(e)
            elif isinstance(e, (LookaheadRule, End, ErrorSymbol)):
                return self.intern(e)
            elif e is NoLineTerminatorHere:
                return e
            elif isinstance(e, CallMethod):
                return self.intern(e)
            else:
                raise TypeError(
                    "invalid grammar: unrecognized element in production "
                    "`grammar[{!r}][{}][{}]`: {!r}"
                    .format(nt, i, j, e))
            assert False, "unreachable"

        def check_reduce_expr(
                nt: LenientNt,
                i: int,
                rhs: Production,
                expr: ReduceExprOrAccept) -> None:
            """Recursively validate a reduce expression against production rhs."""
            if isinstance(expr, int):
                concrete_len = sum(1 for e in rhs.body
                                   if is_concrete_element(e))
                if not (0 <= expr < concrete_len):
                    # NOTE(review): the format arguments here look swapped
                    # relative to the message text ("production" is formatted
                    # with `nt`, "grammar[...]" with `rhs.body`) — error
                    # message only, but worth confirming upstream.
                    raise ValueError(
                        "invalid grammar: element number {} out of range for "
                        "production {!r} in grammar[{!r}][{}].reducer ({!r})"
                        .format(expr, nt, rhs.body, i, rhs.reducer))
            elif isinstance(expr, CallMethod):
                if not isinstance(expr.method, str):
                    raise TypeError(
                        "invalid grammar: method names must be strings, "
                        "not {!r}, in grammar[{!r}[{}].reducer"
                        .format(expr.method, nt, i))
                if not expr.method.isidentifier():
                    # Allow the special form "identifier N" (identifier
                    # followed by a space and a production number).
                    name, space, pn = expr.method.partition(' ')
                    if space == ' ' and name.isidentifier() and pn.isdigit():
                        pass
                    else:
                        raise ValueError(
                            "invalid grammar: invalid method name {!r} "
                            "(not an identifier), in grammar[{!r}[{}].reducer"
                            .format(expr.method, nt, i))
                for arg_expr in expr.args:
                    check_reduce_expr(nt, i, rhs, arg_expr)
            elif expr is None:
                pass
            elif isinstance(expr, Some):
                check_reduce_expr(nt, i, rhs, expr.inner)
            else:
                raise TypeError(
                    "invalid grammar: unrecognized reduce expression {!r} "
                    "in grammar[{!r}][{}].reducer"
                    .format(expr, nt, i))

        def copy_rhs(
                nt: LenientNt,
                i: int,
                sole_production: bool,
                rhs: LenientProduction,
                context_params: typing.Tuple[str, ...]) -> Production:
            """Normalize one right-hand side to a validated Production."""
            if isinstance(rhs, list):
                # Bare list, no reducer. Desugar to a Production, inferring a
                # reasonable default reducer.
                nargs = sum(1 for e in rhs if is_concrete_element(e))
                reducer: ReduceExpr
                if len(rhs) == 1 and nargs == 1:
                    reducer = 0  # don't call a method, just propagate the value
                else:
                    # Call a method named after the production. If the
                    # nonterminal has exactly one production, there's no need
                    # to include the production index `i` to the method name.
                    if sole_production:
                        method = str(nt)
                    else:
                        method = '{}_{}'.format(nt, i)
                    reducer = CallMethod(method, tuple(range(nargs)))
                rhs = Production(rhs, reducer)

            if not isinstance(rhs, Production):
                raise TypeError(
                    "invalid grammar: grammar[{!r}][{}] should be "
                    "a Production or list of grammar symbols, not {!r}"
                    .format(nt, i, rhs))

            if rhs.condition is not None:
                param, value = rhs.condition
                if param not in context_params:
                    raise TypeError(
                        "invalid grammar: undefined parameter {!r} "
                        "in conditional for grammar[{!r}][{}]"
                        .format(param, nt, i))
            if rhs.reducer != 'accept':
                check_reduce_expr(nt, i, rhs, rhs.reducer)
            assert isinstance(rhs.body, list)
            return rhs.copy_with(body=[
                validate_element(nt, i, j, e, context_params)
                for j, e in enumerate(rhs.body)
            ])

        def copy_nt_def(
                nt: LenientNt,
                nt_def: typing.Union[NtDef, typing.List[LenientProduction]],
        ) -> NtDef:
            """Normalize a nonterminal definition (NtDef or bare list) to NtDef."""
            rhs_list: typing.Sequence[LenientProduction]
            if isinstance(nt_def, NtDef):
                for i, param in enumerate(nt_def.params):
                    if not isinstance(param, str):
                        raise TypeError(
                            "invalid grammar: parameter {} of {} should be "
                            "a string, not {!r}"
                            .format(i + 1, nt, param))
                params = nt_def.params
                rhs_list = nt_def.rhs_list
                ty = nt_def.type
            else:
                params = ()
                rhs_list = nt_def
                ty = None

            if not isinstance(rhs_list, list):
                raise TypeError(
                    "invalid grammar: grammar[{!r}] should be either a "
                    "list of right-hand sides or NtDef, not {!r}"
                    .format(nt, type(rhs_list).__name__))

            sole_production = len(rhs_list) == 1
            productions = [copy_rhs(nt, i, sole_production, rhs, params)
                           for i, rhs in enumerate(rhs_list)]
            return NtDef(params, productions, ty)

        def check_nt_key(nt: LenientNt) -> None:
            """Validate a nonterminals-dict key (str, Nt, or InitNt)."""
            if isinstance(nt, str):
                if not nt.isidentifier():
                    raise ValueError(
                        "invalid grammar: nonterminal names must be identifiers, not {!r}"
                        .format(nt))
                if nt in self.variable_terminals or nt in self.synthetic_terminals:
                    raise TypeError(
                        "invalid grammar: {!r} is both a nonterminal and a variable terminal"
                        .format(nt))
            elif isinstance(nt, Nt):
                assert keys_are_nt  # checked earlier
                if not (isinstance(nt.name, (str, InitNt))
                        and isinstance(nt.args, tuple)):
                    raise TypeError(
                        "invalid grammar: expected str or Nt(name=str, "
                        "args=tuple) keys in nonterminals dict, got {!r}"
                        .format(nt))
                check_nt_key(nt.name)
                for pair in nt.args:
                    if (not isinstance(pair, tuple)
                            or len(pair) != 2
                            or not isinstance(pair[0], str)
                            or not isinstance(pair[1], bool)):
                        raise TypeError(
                            "invalid grammar: expected tuple((str, bool)) args, got {!r}"
                            .format(nt))
            elif isinstance(nt, InitNt):
                # Users don't include init nonterminals when initially creating
                # a Grammar. They are automatically added below. But if this
                # Grammar is being created by hacking on a previous Grammar, it
                # will already have them.
                if not isinstance(nt.goal, Nt):
                    raise TypeError(
                        "invalid grammar: InitNt.goal should be a nonterminal, "
                        "got {!r}"
                        .format(nt))
                # nt.goal is a "use", not a "def". Check it like a use.
                # Bogus question marks appear in error messages :-|
                validate_element(nt, '?', '?', nt.goal, ())
                if nt.goal not in my_goal_nts:
                    raise TypeError(
                        "invalid grammar: nonterminal referenced by InitNt "
                        "is not in the list of goals: {!r}"
                        .format(nt))
            else:
                raise TypeError(
                    "invalid grammar: expected string keys in nonterminals dict, got {!r}"
                    .format(nt))

        def validate_nt(
                nt: LenientNt,
                nt_def: LenientNtDef
        ) -> typing.Tuple[LenientNt, NtDef]:
            """Validate one (key, definition) pair; return the normalized pair."""
            check_nt_key(nt)
            if isinstance(nt, InitNt):
                # Check the form of init productions. Initially these look like
                # [[goal]], but after the pipeline goes to work, they can be
                # [[Optional(goal)]] or [[], [goal]].
                if not isinstance(nt_def, NtDef):
                    raise TypeError(
                        "invalid grammar: key {!r} must map to "
                        "value of type NtDef, not {!r}"
                        .format(nt, nt_def))
                rhs_list = nt_def.rhs_list
                g = nt.goal
                if (rhs_list != [Production([g], 0),
                                 Production([Nt(nt, ()), End()], 'accept')]
                        and rhs_list != [Production([Optional(g)], 0),
                                         Production([Nt(nt, ()), End()], 'accept')]
                        and rhs_list != [Production([End()], 'accept'),
                                         Production([g, End()], 'accept'),
                                         Production([Nt(nt, ()), End()], 'accept')]):
                    raise ValueError(
                        "invalid grammar: grammar[{!r}] is not one of "
                        "the expected forms: got {!r}"
                        .format(nt, rhs_list))

            return nt, copy_nt_def(nt, nt_def)

        self.nonterminals = {}
        for nt1, nt_def1 in my_nonterminals.items():
            nt, nt_def = validate_nt(nt1, nt_def1)
            self.nonterminals[nt] = nt_def
        for syn_term_name, t_set in synthetic_terminals.items():
            # A synthetic terminal is modeled as a nonterminal with one
            # single-element production per concrete terminal it stands for.
            nt_def = NtDef((syn_term_name,), [Production([e], 0) for e in t_set], None)
            nt, nt_def = validate_nt(syn_term_name, nt_def)
            self.nonterminals[nt] = nt_def
            # Remove synthetic terminals from the list of terminals.
            all_terminals.remove(syn_term_name)

        self.terminals = OrderedFrozenSet(all_terminals)

        # Check types of reduce expressions and infer method types. But if the
        # caller passed in precalculated type info, skip it -- otherwise we
        # would redo type checking many times as we make minor changes to the
        # Grammar along the pipeline.
        if method_types is None:
            types.infer_types(self)
        else:
            for nt, nt_def in self.nonterminals.items():
                assert isinstance(nt_def, NtDef)
                assert isinstance(nt_def.type, types.Type)
            self.methods = method_types

        # Synthesize "init" nonterminals.
        self.init_nts = []
        for goal in my_goal_nts:
            # Convert str goals to Nt objects and validate.
            # NOTE(review): if `goal` is neither str nor Nt, `ok` is never
            # bound and the `if not ok` below raises NameError — confirm goals
            # are always str/Nt by this point.
            if isinstance(goal, str):
                ok = goal in str_to_nt
                if ok:
                    goal = str_to_nt[goal]
            elif isinstance(goal, Nt):
                if keys_are_nt:
                    ok = goal in my_nonterminals
                else:
                    ok = goal.name in my_nonterminals
            if not ok:
                raise ValueError(
                    "goal nonterminal {!r} is undefined".format(goal))
            assert isinstance(goal, Nt)

            # Weird, but the key of an init nonterminal really is
            # `Nt(InitNt(Nt(goal_name, goal_args)), ())`. It takes no arguments,
            # but it refers to a goal that might take arguments.
            init_nt = InitNt(goal)
            init_key: LenientNt = init_nt
            goal_nt = Nt(init_nt, ())
            if keys_are_nt:
                init_key = goal_nt
            if init_key not in self.nonterminals:
                self.nonterminals[init_key] = NtDef(
                    (),
                    [Production([goal], 0),
                     Production([goal_nt, End()], 'accept')],
                    types.NoReturnType)
            self.init_nts.append(goal_nt)

        # Add the various execution backends which would rely on the same parse table.
        self.exec_modes = exec_modes
        self.type_to_modes = type_to_modes

    # The argument is a list of extension.GrammarExtensions. The annotation is
    # vague because this module does not import the extension module. It would
    # be a cyclic dependency.
    def patch(self, extensions: typing.List) -> None:
        """Apply grammar extensions in place (mutates self.nonterminals)."""
        assert self.type_to_modes is not None
        assert self.exec_modes is not None
        if extensions == []:
            return
        # Copy of nonterminals which would be mutated by the patches.
        nonterminals = copy.copy(self.nonterminals)
        for ext in extensions:
            # Add the given trait to the execution mode, depending on which
            # type it got implemented for.
            for mode in self.type_to_modes[ext.target.for_type]:
                self.exec_modes[mode].add(ext.target.trait)
            # Apply grammar transformations.
            ext.apply_patch(self, nonterminals)
        # Replace with the modified version of nonterminals
        self.nonterminals = nonterminals

    def intern(self, obj: Internable) -> Internable:
        """Return a shared copy of the immutable object `obj`.

        This saves memory and consistent use allows code to use `is` for
        equality testing.
        """
        try:
            return self._cache[obj]
        except KeyError:
            self._cache[obj] = obj
            return obj

    # Terminals are tokens that must appear verbatim in the input wherever they
    # appear in the grammar, like the operators '+' '-' *' '/' and brackets '(' ')'
    # in the example grammar.
    def is_terminal(self, element: object) -> bool:
        # Exact `str` type check (not isinstance): subclasses are not terminals.
        return type(element) is str

    def expand_set_of_terminals(
            self,
            terminals: typing.Iterable[typing.Union[str, None, ErrorSymbol]]
    ) -> OrderedSet[typing.Union[str, None, ErrorSymbol]]:
        """Copy a set of terminals, replacing any synthetic terminals with their representations.

        Returns a new OrderedSet.

        terminals - an iterable of terminals and/or other unique elements like
        None or ErrorSymbol.
        """
        result: OrderedSet[typing.Union[str, None, ErrorSymbol]] = OrderedSet()
        for t in terminals:
            if isinstance(t, str) and t in self.synthetic_terminals:
                result |= self.expand_set_of_terminals(self.synthetic_terminals[t])
            else:
                result.add(t)
        return result

    def goals(self) -> typing.List[Nt]:
        """Return a list of this grammar's goal nonterminals."""
        return [init_nt.name.goal for init_nt in self.init_nts]  # type: ignore

    def with_nonterminals(
            self,
            nonterminals: typing.Mapping[LenientNt, LenientNtDef]
    ) -> Grammar:
        """Return a copy of self with the same attributes except different nonterminals."""
        if self.methods is not None:
            for nt_def in nonterminals.values():
                assert isinstance(nt_def, NtDef)
                assert nt_def.type is not None
        return Grammar(
            nonterminals,
            goal_nts=self.goals(),
            variable_terminals=self.variable_terminals,
            synthetic_terminals=self.synthetic_terminals,
            method_types=self.methods,
            exec_modes=self.exec_modes,
            type_to_modes=self.type_to_modes)

    # === A few methods for dumping pieces of grammar.

    def element_to_str(self, e: Element) -> str:
        """Render one production element for debug/verbose output."""
        if isinstance(e, Nt):
            return e.pretty()
        elif self.is_terminal(e):
            assert isinstance(e, str)
            if e in self.variable_terminals or e in self.synthetic_terminals:
                return e
            return '"' + repr(e)[1:-1] + '"'
        elif isinstance(e, Optional):
            return self.element_to_str(e.inner) + "?"
        elif isinstance(e, LookaheadRule):
            if len(e.set) == 1:
                op = "==" if e.positive else "!="
                s = repr(list(e.set)[0])
            else:
                op = "in" if e.positive else "not in"
                s = '{' + repr(list(e.set))[1:-1] + '}'
            return "[lookahead {} {}]".format(op, s)
        elif isinstance(e, End):
            return "<END>"
        elif e is NoLineTerminatorHere:
            return "[no LineTerminator here]"
        elif isinstance(e, CallMethod):
            return "{{ {} }}".format(expr_to_str(e))
        else:
            return str(e)

    def symbols_to_str(self, rhs: typing.Iterable[Element]) -> str:
        """Render a sequence of elements, space-separated."""
        return " ".join(self.element_to_str(e) for e in rhs)

    def rhs_to_str(self, rhs: LenientProduction) -> str:
        """Render a right-hand side (Production or bare element list)."""
        if isinstance(rhs, Production):
            if rhs.condition is None:
                prefix = ''
            else:
                param, value = rhs.condition
                if value is True:
                    condition = "+" + param
                elif value is False:
                    condition = "~" + param
                else:
                    condition = "{} == {!r}".format(param, value)
                prefix = "#[if {}] ".format(condition)
            return prefix + self.rhs_to_str(rhs.body)
        elif len(rhs) == 0:
            return "[empty]"
        else:
            return self.symbols_to_str(rhs)

    def nt_to_str(self, nt: LenientNt) -> str:
        """Render a nonterminal key (str, InitNt, or Nt)."""
        if isinstance(nt, Nt):
            return self.element_to_str(nt)
        else:
            return str(nt)

    def production_to_str(
            self,
            nt: LenientNt,
            rhs: LenientProduction,
            *reducer: ReduceExpr
    ) -> str:
        # As we have two ways of representing productions at the moment, just
        # take multiple arguments :(
        return "{} ::= {}{}".format(
            self.nt_to_str(nt),
            self.rhs_to_str(rhs),
            "".join(" => " + expr_to_str(expr) for expr in reducer))

    # The type of `item` is `lr0.LRItem`. No annotation because this module
    # does not import `lr0`. It would be a cyclic dependency.
    def lr_item_to_str(self, prods: typing.List, item: typing.Any) -> str:
        """Render an LR item: the production with a dot at item.offset."""
        prod = prods[item.prod_index]
        if item.lookahead is None:
            la = []
        else:
            la = [self.element_to_str(item.lookahead)]
        return "{} ::= {} >> {{{}}}".format(
            self.element_to_str(prod.nt),
            " ".join([self.element_to_str(e) for e in prod.rhs[:item.offset]]
                     + ["\N{MIDDLE DOT}"]
                     + la
                     + [self.element_to_str(e) for e in prod.rhs[item.offset:]]),
            ", ".join(
                "$" if t is None else self.element_to_str(t)
                for t in item.followed_by)
        )

    def item_set_to_str(
            self,
            prods: typing.List,
            item_set: OrderedFrozenSet
    ) -> str:
        """Render a set of LR items (see lr_item_to_str)."""
        return "{{{}}}".format(
            ", ".join(self.lr_item_to_str(prods, item) for item in item_set)
        )

    def expand_terminal(self, t: str) -> OrderedFrozenSet[str]:
        """Return the set of concrete terminals `t` stands for ({t} itself if
        `t` is not synthetic)."""
        return self.synthetic_terminals.get(t) or OrderedFrozenSet([t])

    def compatible_elements(self, e1: Element, e2: Element) -> bool:
        # "type: ignore" because mypy doesn't know that `self.is_terminal(e1)`
        # means `e1` is a terminal, and thus `self.expand_terminal(e1)` is OK.
+ return (e1 == e2 + or (self.is_terminal(e1) + and self.is_terminal(e2) + and len(self.expand_terminal(e1) # type: ignore + & self.expand_terminal(e2)) > 0)) # type: ignore + + def compatible_sequences( + self, + seq1: typing.Sequence[Element], + seq2: typing.Sequence[Element]) -> bool: + """True if the two sequences could be the same terminals.""" + return (len(seq1) == len(seq1) + and all(self.compatible_elements(e1, e2) for e1, e2 in zip(seq1, seq2))) + + def dump(self) -> None: + for nt, nt_def in self.nonterminals.items(): + left_side = self.nt_to_str(nt) + if nt_def.params: + left_side += "[" + ", ".join(nt_def.params) + "]" + print(left_side + " ::=") + for rhs in nt_def.rhs_list: + print(" ", self.rhs_to_str(rhs)) + print() + + def dump_type_info(self) -> None: + for nt, nt_def in self.nonterminals.items(): + print(nt, nt_def.type) + for name, mty in self.methods.items(): + print("fn {}({}) -> {}" + .format(name, + ", ".join(str(ty) for ty in mty.argument_types), + str(mty.return_type))) + + def is_shifted_element(self, e: Element) -> bool: + if isinstance(e, Nt): + return True + elif self.is_terminal(e): + return True + elif isinstance(e, Optional): + return True + elif isinstance(e, LookaheadRule): + return False + elif isinstance(e, End): + return True + elif e is NoLineTerminatorHere: + return True + return False + + +@dataclass(frozen=True) +class InitNt: + """InitNt(goal) is the name of the init nonterminal for the given goal. + + One init nonterminal is created internally for each goal symbol in the grammar. + + The idea is to have a nonterminal that the user has no control over, that is + never used in any production, but *only* as an entry point for the grammar, + that always has a single production "init_nt ::= goal_nt". This predictable + structure makes it easier to get into and out of parsing at run time. + + When an init nonterminal is matched, we take the "accept" action rather than + a "reduce" action. 
    """
    goal: Nt


# *** Elements ****************************************************************
#
# Elements are the things that can appear in the .body list of a Production:
#
# * Strings represent terminals (see `Grammar.is_terminal`)
#
# * `Nt` objects refer to nonterminals.
#
# * `Optional` objects represent optional elements.
#
# * `LookaheadRule` objects are like lookahead assertions in regular
#   expressions.
#
# * The `NoLineTerminatorHere` singleton object can appear between two other
#   symbols to rule out line breaks between them.
#
# * `ErrorSymbol` objects never match anything produced by the lexer. Instead
#   they match an ErrorToken that's artificially injected into the token
#   stream at runtime, by the parser itself, just before a token that does
#   not match anything else.


def is_concrete_element(e: Element) -> bool:
    """True if parsing the element `e` produces a value.

    A production's concrete elements can be used in reduce expressions.
    """
    return not isinstance(e, (LookaheadRule, ErrorSymbol, NoLineTerminatorHereClass))


# Nonterminals in the ECMAScript grammar can be parameterized; NtParameter is
# the type of the parameters.
#
# For example, `BindingIdentifier[?Yield, ?Await]` is represented as
# `Nt('BindingIdentifier', (('Yield', Var('Yield')), ('Await', Var('Await'))))`.
#
# A nonterminal-parameter-expression is represented by either a Var object or
# the actual value, a boolean. (In theory, parameters don't *have* to be
# boolean; all the code would probably work for anything hashable. In practice,
# all parameters in the ECMAScript grammar are boolean.)
NtParameter = typing.Hashable


class Nt:
    """Nt(name, ((param0, arg0), ...)) - An invocation of a nonterminal.

    Nonterminals are like lambdas. Each nonterminal in a grammar is defined by an
    NtDef which has 0 or more parameters.

    Parameter names `param0...` are strings. The actual arguments `arg0...` are
    NtParameters (see above).
+ """ + + __slots__ = ['name', 'args'] + + name: typing.Union[str, InitNt] + args: typing.Tuple[typing.Tuple[str, NtParameter], ...] + + def __init__(self, + name: typing.Union[str, InitNt], + args: typing.Tuple[typing.Tuple[str, NtParameter], ...] = ()): + self.name = name + self.args = args + + def __hash__(self) -> int: + return hash(('nt', self.name, self.args)) + + def __eq__(self, other: object) -> bool: + return (isinstance(other, Nt) + and (self.name, self.args) == (other.name, other.args)) + + def __repr__(self) -> str: + if self.args: + return 'Nt({!r}, {!r})'.format(self.name, self.args) + else: + return 'Nt({!r})'.format(self.name) + + def pretty(self) -> str: + """Unique version of this Nt to use in the Python runtime. + + Also used in debug/verbose output. + """ + def arg_to_str(name: str, value: NtParameter) -> str: + if value is True: + return '+' + name + elif value is False: + return '~' + name + elif isinstance(value, Var): + if value.name == name: + return '?' + value.name + return name + "=" + value.name + else: + return name + "=" + repr(value) + + if isinstance(self.name, InitNt): + return "Start_" + self.name.goal.pretty() + if len(self.args) == 0: + return self.name + return "{}[{}]".format(self.name, + ", ".join(arg_to_str(name, value) + for name, value in self.args)) + + +@dataclass(frozen=True) +class Optional: + """Optional(nt) matches either nothing or the given nt. + + Optional elements are expanded out before states are calculated, so the + core of the algorithm never sees them. + """ + inner: Element + + +@dataclass(frozen=True) +class Literal: + """Literal(str) matches a sequence of characters. + + Literal elements are sequences of characters which are expected to appear + verbatim in the input. + """ + text: str + + +@dataclass(frozen=True) +class UnicodeCategory: + """UnicodeCategory(str) matches any character with a category matching + the cat_prefix. 
+ + UnicodeCategory elements are a set of literal elements which correspond to a + given unicode cat_prefix. + """ + cat_prefix: str + + +@dataclass(frozen=True) +class LookaheadRule: + """LookaheadRule(set, pos) imposes a lookahead restriction on whatever follows. + + It never consumes any tokens itself. Instead, the right-hand side + [LookaheadRule(frozenset(['a', 'b']), False), 'Thing'] + matches a Thing that does not start with the token `a` or `b`. + """ + set: typing.FrozenSet[str] + positive: bool + + +# A lookahead restriction really just specifies a set of allowed terminals. +# +# - No lookahead restriction at all is equivalent to a rule specifying all terminals. +# +# - A positive lookahead restriction explicitly lists all allowed tokens. +# +# - A negative lookahead restriction instead specifies the set of all tokens +# except a few. +# +def lookahead_contains(rule: typing.Optional[LookaheadRule], t: str) -> bool: + """True if the given lookahead restriction `rule` allows the terminal `t`.""" + return (rule is None + or (t in rule.set if rule.positive + else t not in rule.set)) + + +def lookahead_intersect( + a: typing.Optional[LookaheadRule], + b: typing.Optional[LookaheadRule] +) -> typing.Optional[LookaheadRule]: + """Returns a single rule enforcing both `a` and `b`, allowing only terminals that pass both.""" + if a is None: + return b + elif b is None: + return a + elif a.positive: + if b.positive: + return LookaheadRule(a.set & b.set, True) + else: + return LookaheadRule(a.set - b.set, True) + else: + if b.positive: + return LookaheadRule(b.set - a.set, True) + else: + return LookaheadRule(a.set | b.set, False) + + +class NoLineTerminatorHereClass: + def __str__(self) -> str: + return 'NoLineTerminatorHere' + + +NoLineTerminatorHere = NoLineTerminatorHereClass() + + +# Optional elements. These are expanded out before states are calculated, +# so the core of the algorithm never sees them. 
@dataclass(frozen=True)
class Exclude:
    """Exclude(nt1, nt2) matches if nt1 matches and nt2 does not."""
    # The element that must match.
    inner: Element
    # Elements whose match disqualifies `inner`.
    exclusion_list: typing.Tuple[Element, ...]


# End. This is used to represent the terminal which is infinitely produced by
# the lexer when input end is reached.
@dataclass(frozen=True)
class End:
    """End() represents the end of the input content."""


# Special grammar symbol that can be consumed to handle a syntax error.
#
# The error code is passed to an error-handling routine at run time which
# decides if the error is recoverable or not.
@dataclass(frozen=True)
class ErrorSymbol:
    """Special grammar symbol that can be consumed to handle a syntax error."""
    error_code: int


# The union of everything that may appear in a Production's .body list.
Element = typing.Union[
    str,
    Optional,
    Literal,
    UnicodeCategory,
    Exclude,
    Nt,
    LookaheadRule,
    End,
    ErrorSymbol,
    NoLineTerminatorHereClass,
    CallMethod]


@dataclass
class NtDef:
    """Definition of a nonterminal.

    Instances have three attributes:

    .params - Tuple of strings, the names of the parameters.

    .rhs_list - List of Production objects. Arguments to Nt elements in the
    productions can be Var(s) where `s in params`, indicating that parameter
    should be passed through unchanged.

    .type - The type of runtime value produced by parsing an instance of this
    nonterminal, or None.

    An NtDef is a sort of lambda.

    Some languages have constructs that are allowed or disallowed in particular
    situations. For example, in many languages `return` statements are allowed
    only inside functions or methods. The ECMAScript standard (5.1.5 "Grammar
    Notation") offers this example of the notation it uses to specify this sort
    of thing:

        StatementList [Return] :
            [+Return] ReturnStatement
            ExpressionStatement

    This is an abbreviation for:

        StatementList :
            ExpressionStatement

        StatementList_Return :
            ReturnStatement
            ExpressionStatement

    We offer NtDef.params as a way of representing this in our system.

        "StatementList": NtDef(("Return",), [
            Production(["ReturnStatement"], condition=("Return", True)),
            ["ExpressionStatement"],
        ], None),

    This is an abbreviation for:

        "StatementList_0": [
            ["ExpressionStatement"],
        ],
        "StatementList_1": [
            ["ReturnStatement"],
            ["ExpressionStatement"],
        ],

    """

    __slots__ = ['params', 'rhs_list', 'type']

    params: typing.Tuple[str, ...]
    rhs_list: typing.List[Production]
    type: typing.Optional[types.Type]

    def with_rhs_list(self, new_rhs_list: typing.List[Production]) -> NtDef:
        """Return a copy of self with .rhs_list replaced by `new_rhs_list`."""
        return dataclasses.replace(self, rhs_list=new_rhs_list)


@dataclass(frozen=True)
class Var:
    """Var(name) represents the run-time value of the parameter with the given name."""

    name: str


ReduceExpr = typing.Union[int, CallMethod, None, Some]
ReduceExprOrAccept = typing.Union[ReduceExpr, str]

# The Grammar constructor is very lax about the types you pass to it. It can
# accept a `Dict[str, List[List[str]]]`, for example; it copies the data into
# NtDef and Production objects.
#
# The `Lenient` types below are the relaxed types that Grammar() actually
# accepts.
LenientNt = typing.Union[Nt, str, InitNt]
LenientProduction = typing.Union[Production, typing.List[Element]]
LenientNtDef = typing.Union[NtDef, typing.List[LenientProduction]]