""" Data structures for representing grammars. """ from __future__ import annotations # mypy: disallow-untyped-defs, disallow-incomplete-defs, disallow-untyped-calls import copy import typing import dataclasses from dataclasses import dataclass from .ordered import OrderedSet, OrderedFrozenSet from . import types # *** What is a grammar? ****************************************************** # # A grammar is a dictionary mapping nonterminal names to lists of right-hand # sides. Each right-hand side (also called a "production") is a list whose # elements can include terminals, nonterminals, Optional elements, LookaheadRules, # and NoLineTerminatorHere. # # The most common elements are terminals and nonterminals, so a grammar usually # looks something like this: def example_grammar() -> Grammar: rules: typing.Dict[typing.Union[str, InitNt, Nt], LenientNtDef] = { 'expr': [ ['term'], ['expr', '+', 'term'], ['expr', '-', 'term'], ], 'term': [ ['unary'], ['term', '*', 'unary'], ['term', '/', 'unary'], ], 'unary': [ ['prim'], ['-', 'unary'], ], 'prim': [ ['NUM'], ['VAR'], ['(', 'expr', ')'], ], } # The goal nonterminals are the nonterminals we're actually interested in # parsing. Here we want to parse expressions; all the other nonterminals # are interesting only as the building blocks of expressions. # # Variable terminals are terminal symbols that can have several different # values, like a VAR token that could be any identifier, or a NUM token # that could be any number. return Grammar(rules, goal_nts=['expr'], variable_terminals=['NUM', 'VAR']) Condition = typing.Tuple[str, bool] # A production consists of a left side, an optional condition, a right side, # and a reducer. A `Production` object includes everything except the left # side. Incorporating reducers lets us transform a grammar while preserving # behavior. # # The production `expr ::= term` is represented by # `Production(["term"], 0)`. 
#
# The production `expr ::= expr + term => add($0, $2)` is represented by
# `Production(["expr", "+", "term"], CallMethod("add", (0, 2))`.
@dataclass
class Production:
    """One right-hand side of a grammar rule, plus the reducer that says what
    to do when it matches.

    `condition`, when present, is a (parameter, bool) pair restricting the
    production to grammar instantiations where that parameter has that value.
    """

    __slots__ = ['body', 'reducer', 'condition']

    body: typing.List[Element]
    reducer: ReduceExprOrAccept
    condition: typing.Optional[Condition]

    def __init__(self,
                 body: typing.List[Element],
                 reducer: ReduceExprOrAccept,
                 *,
                 condition: typing.Optional[Condition] = None):
        self.body = body
        self.reducer = reducer
        self.condition = condition

    def __repr__(self) -> str:
        # Only mention the condition when one is actually set.
        text = "Production({!r}, reducer={!r}".format(self.body, self.reducer)
        if self.condition is not None:
            text += ", condition={!r}".format(self.condition)
        return text + ")"

    def copy_with(self, **kwargs: typing.Any) -> Production:
        """Return a copy of self with the given fields replaced."""
        return dataclasses.replace(self, **kwargs)


# *** Reduce expressions ******************************************************
#
# Reduce expressions ("reducers" for short) are a little language used to
# specify what happens when a production is matched. A reduce expression is
# one of these:
#
# * An integer evaluates to one of the syntactic components of the
#   production. For example, if the production we're reducing is
#   `sum ::= sum + term`, the integer `0` evaluates the `sum` to the left of
#   the plus sign, and `2` means the `term` on the right. (The integer
#   `1` would refer to the `+` token itself, but that's not super useful.)
#
#   These integers are not quite indexes into `production.body`, because
#   LookaheadRule, ErrorSymbol, and NoLineTerminatorHere elements don't
#   count: in the production `stmt ::= [lookahead != "let"] expr ";"`, `0` is
#   the expr, and `1` is the semicolon token. See `is_concrete_element(e)`.
#
# * CallMethod objects pass values to a builder method and return the result.
#   The `args` are nested reduce expressions.
#
# * None is an expression used as a placeholder when an optional symbol is
#   omitted.
#
# * Some(expr) is used when an optional symbol is found and parsed.
#   In Python, this just expands to the same thing as `expr`, but in Rust
#   this expands to a use of `Option::Some()`.
#
# In addition, the special reducer 'accept' means stop parsing. This is
# used only in productions for init nonterminals, created automatically by
# Grammar.__init__(). It's not a reduce expression, so it can't be nested.

@dataclass(frozen=True)
class CallMethod:
    """Express a method call, and give it a given set of arguments.

    A trait is added as the parser should implement this trait to call this
    method."""
    method: str
    args: typing.Tuple[ReduceExpr, ...]
    trait: types.Type = types.Type("AstBuilder")
    fallible: bool = False


@dataclass(frozen=True)
class Some:
    """Wrapper marking that an optional symbol was present and parsed."""
    inner: ReduceExpr


def expr_to_str(expr: ReduceExprOrAccept) -> str:
    """Render a reduce expression (or the special 'accept' reducer) as text.

    Used by the grammar-dumping helpers below. Raises ValueError on anything
    that is not a reduce expression.
    """
    if isinstance(expr, int):
        return "${}".format(expr)
    elif isinstance(expr, CallMethod):
        return "{}::{}({}){}".format(
            expr.trait,
            expr.method,
            ', '.join(expr_to_str(arg) for arg in expr.args),
            expr.fallible and '?' or '')
    elif expr is None:
        return "None"
    elif isinstance(expr, Some):
        return "Some({})".format(expr_to_str(expr.inner))
    elif expr == "accept":
        return ""
    else:
        raise ValueError("unrecognized expression: {!r}".format(expr))


# Type parameter for Grammar.intern().
Internable = typing.TypeVar("Internable")

SyntheticTerminalsDict = typing.Dict[str, OrderedFrozenSet[str]]


class Grammar:
    """A collection of productions.

    * self.variable_terminals - OrderedFrozenSet[str] - Terminals that carry
      data, like (in JS) numeric literals and RegExps.

    * self.terminals - OrderedFrozenSet[str] - All terminals used in the
      language, including those in self.variable_terminals and
      self.synthetic_terminals.

    * self.synthetic_terminals - {str: OrderedFrozenSet[str]} - Maps terminals
      to sets of terminals. An entry `name: set` in this dictionary means that
      `name` is shorthand for "one of the terminals in `set`".

    * self.nonterminals - {LenientNt: NtDef} - Keys are either (str|InitNt),
      early in the pipeline, or Nt objects later on. Values are the NtDef
      objects that contain the actual Productions.

    * self.methods - {str: MethodType} - Type information for methods.

    * self.init_nts - [InitNt or Nt] - The list of all elements of
      self.nonterminals.keys() that are init nonterminals.

    * self.exec_modes - DefaultDict{str, OrderedSet[Type]} or None - ?

    * self.type_to_mods - {Type: [str]} or None - ?

    * self._cache - {Any: Any} - Cache of immutable objects used by
      Grammar.intern().
    """

    variable_terminals: OrderedFrozenSet[str]
    terminals: OrderedFrozenSet[typing.Union[str, End]]
    synthetic_terminals: SyntheticTerminalsDict
    nonterminals: typing.Dict[LenientNt, NtDef]
    methods: typing.Dict[str, types.MethodType]
    init_nts: typing.List[typing.Union[Nt, InitNt]]
    exec_modes: typing.Optional[typing.DefaultDict[str, OrderedSet[types.Type]]]
    type_to_modes: typing.Optional[typing.Mapping[types.Type, typing.List[str]]]
    _cache: typing.Dict[typing.Any, typing.Any]

    def __init__(
            self,
            nonterminals: typing.Mapping[LenientNt, LenientNtDef],
            *,
            goal_nts: typing.Optional[typing.Iterable[LenientNt]] = None,
            variable_terminals: typing.Iterable[str] = (),
            synthetic_terminals: typing.Optional[SyntheticTerminalsDict] = None,
            method_types: typing.Optional[typing.Dict[str, types.MethodType]] = None,
            exec_modes: typing.Optional[typing.DefaultDict[str, OrderedSet[types.Type]]] = None,
            type_to_modes: typing.Optional[typing.Mapping[types.Type, typing.List[str]]] = None):
        """Validate, desugar, and copy `nonterminals` into a Grammar.

        This constructor supports passing in a sort of jumbled blob of
        strings, lists, and actual objects, and normalizes it all to a more
        typeful structure. Being able to interpret simple
        "list-of-lists-of-strings" input is super useful for tests.

        We don't check here that the grammar is LR, that it's cycle-free, or
        any other nice properties.
        """
        # Copy/infer the arguments.
        my_nonterminals: typing.Dict[LenientNt, LenientNtDef] = dict(nonterminals.items())
        if goal_nts is None:
            # Default to the first nonterminal in the dictionary.
            my_goal_nts = []
            for name in my_nonterminals:
                my_goal_nts.append(name)
                break
        else:
            my_goal_nts = list(goal_nts)

        self.variable_terminals = OrderedFrozenSet(variable_terminals)
        if synthetic_terminals is None:
            synthetic_terminals = {}
        else:
            synthetic_terminals = {
                name: OrderedFrozenSet(set)
                for name, set in synthetic_terminals.items()
            }
        for synthetic_key, values in synthetic_terminals.items():
            if synthetic_key in my_nonterminals:
                raise ValueError(
                    "invalid grammar: {!r} is both a terminal and a nonterminal"
                    .format(synthetic_key))
            for t in values:
                if t in my_nonterminals:
                    raise ValueError(
                        "invalid grammar: {!r} is both ".format(t)
                        + "a representation of a synthetic terminal and a nonterminal")
                if t in synthetic_terminals:
                    # Nested synthetic terminals. Throw, for now. (This
                    # should pretty much just work, except expand_terminal
                    # doesn't support it; and we don't check for cycles
                    # where a synthetic terminal includes itself directly
                    # or indirectly.)
                    raise ValueError(
                        "unsupported: synthetic terminals can't include other "
                        "synthetic terminals; {!r} includes {!r}"
                        .format(synthetic_key, t))
        # NOTE(review): the attribute is deliberately(?) left empty while the
        # local `synthetic_terminals` dict is used below; with this empty,
        # note_terminal/expand_terminal never expand synthetics. Assigning the
        # real dict here would make check_nt_key reject the synthetic-terminal
        # nonterminal defs registered later — confirm intent before changing.
        self.synthetic_terminals = {}

        keys_are_nt = isinstance(next(iter(my_nonterminals)), Nt)
        key_type: typing.Union[typing.Type, typing.Tuple[typing.Type, ...]]
        key_type = Nt if keys_are_nt else (str, InitNt)

        self._cache = {}

        # Gather some information just by looking at keys (without examining
        # every production).
        #
        # str_to_nt maps the name of each non-parameterized
        # nonterminal to `Nt(name)`, a cache.
        str_to_nt: typing.Dict[typing.Union[str, InitNt], Nt] = {}

        # nt_params lists the names of each nonterminal's parameters (empty
        # tuple for non-parameterized nts).
        nt_params: typing.Dict[typing.Union[str, InitNt], typing.Tuple[str, ...]] = {}
        for key in my_nonterminals:
            if not isinstance(key, key_type):
                raise ValueError(
                    "invalid grammar: conflicting key types in nonterminals dict - "
                    "expected either all str or all Nt, got {!r}"
                    .format(key.__class__.__name__))
            nt_name: typing.Union[str, InitNt]
            param_names: typing.Tuple[str, ...]
            if keys_are_nt:
                assert isinstance(key, Nt)
                nt_name = key.name
                param_names = tuple(name for name, value in key.args)
            else:
                assert isinstance(key, (str, InitNt))
                nt_name = key
                param_names = ()
                my_nt = my_nonterminals[key]
                if isinstance(my_nt, NtDef):
                    param_names = tuple(my_nt.params)
            if nt_name not in nt_params:
                nt_params[nt_name] = param_names
            else:
                if nt_params[nt_name] != param_names:
                    raise ValueError(
                        "conflicting parameter name lists for nt {!r}: "
                        "both {!r} and {!r}"
                        .format(nt_name, nt_params[nt_name], param_names))
            if param_names == () and nt_name not in str_to_nt:
                str_to_nt[nt_name] = self.intern(Nt(nt_name))

        # Validate, desugar, and copy the grammar. As a side effect, calling
        # validate_element on every element of the grammar populates
        # all_terminals.
        all_terminals: OrderedSet[typing.Union[str, End]] = OrderedSet(self.variable_terminals)
        all_terminals.add(End())

        def note_terminal(t: str) -> None:
            """Add t (and all representations of it, if synthetic) to all_terminals."""
            if t not in all_terminals:
                all_terminals.add(t)
                if t in self.synthetic_terminals:
                    for k in self.synthetic_terminals[t]:
                        note_terminal(k)

        # Note: i and j are normally list indexes, but they are sometimes the
        # special string '?'. It's OK because they are used only in error
        # messages.
        def validate_element(
                nt: LenientNt,
                i: typing.Union[int, str],
                j: typing.Union[int, str],
                e: Element,
                context_params: typing.Tuple[str, ...]
        ) -> Element:
            """Validate and normalize one element of a production body.

            Returns the (possibly interned/desugared) element; raises on
            anything malformed.
            """
            if isinstance(e, str):
                if e in nt_params:
                    if nt_params[e] != ():
                        raise ValueError(
                            "invalid grammar: missing parameters for {!r} "
                            "in production `grammar[{!r}][{}][{}].inner`: {!r}"
                            .format(e, nt, i, j, nt_params[e]))
                    return str_to_nt[e]
                else:
                    note_terminal(e)
                    return e
            elif isinstance(e, Optional):
                if not isinstance(e.inner, (str, Nt)):
                    raise TypeError(
                        "invalid grammar: unrecognized element "
                        "in production `grammar[{!r}][{}][{}].inner`: {!r}"
                        .format(nt, i, j, e.inner))
                inner = validate_element(nt, i, j, e.inner, context_params)
                return self.intern(Optional(inner))
            elif isinstance(e, Literal):
                if not isinstance(e.text, str):
                    raise TypeError(
                        "invalid grammar: unrecognized element "
                        "in production `grammar[{!r}][{}][{}].text`: {!r}"
                        .format(nt, i, j, e.text))
                return self.intern(e)
            elif isinstance(e, UnicodeCategory):
                if not isinstance(e.cat_prefix, str):
                    raise TypeError(
                        "invalid grammar: unrecognized element "
                        "in production `grammar[{!r}][{}][{}].cat_prefix`: {!r}"
                        .format(nt, i, j, e.cat_prefix))
                return self.intern(e)
            elif isinstance(e, Exclude):
                if not isinstance(e.inner, (str, Nt)):
                    raise TypeError(
                        "invalid grammar: unrecognized element "
                        "in production `grammar[{!r}][{}][{}].inner`: {!r}"
                        .format(nt, i, j, e.inner))
                exclusion_list = []
                for value in e.exclusion_list:
                    if not isinstance(value, (str, Nt)):
                        raise TypeError(
                            "invalid grammar: unrecognized element "
                            "in production `grammar[{!r}][{}][{}].exclusion_list`: {!r}"
                            .format(nt, i, j, value))
                    value = validate_element(nt, i, j, value, context_params)
                    exclusion_list.append(value)
                inner = validate_element(nt, i, j, e.inner, context_params)
                return self.intern(Exclude(inner, tuple(exclusion_list)))
            elif isinstance(e, Nt):
                # Either the application or the original parameterized
                # production must be present in the dictionary.
                if e not in my_nonterminals and e.name not in my_nonterminals:
                    raise ValueError(
                        "invalid grammar: unrecognized nonterminal "
                        "in production `grammar[{!r}][{}][{}]`: {!r}"
                        .format(nt, i, j, e.name))
                args = tuple(pair[0] for pair in e.args)
                if e.name in nt_params and args != nt_params[e.name]:
                    raise ValueError(
                        "invalid grammar: wrong arguments passed to {!r} "
                        "in production `grammar[{!r}][{}][{}]`: "
                        "passed {!r}, expected {!r}"
                        .format(e.name, nt, i, j, args, nt_params[e.name]))
                for param_name, arg_expr in e.args:
                    if isinstance(arg_expr, Var):
                        if arg_expr.name not in context_params:
                            raise ValueError(
                                "invalid grammar: undefined variable {!r} "
                                "in production `grammar[{!r}][{}][{}]`"
                                .format(arg_expr.name, nt, i, j))
                return self.intern(e)
            elif isinstance(e, (LookaheadRule, End, ErrorSymbol)):
                return self.intern(e)
            elif e is NoLineTerminatorHere:
                return e
            elif isinstance(e, CallMethod):
                return self.intern(e)
            else:
                raise TypeError(
                    "invalid grammar: unrecognized element in production "
                    "`grammar[{!r}][{}][{}]`: {!r}"
                    .format(nt, i, j, e))
            assert False, "unreachable"

        def check_reduce_expr(
                nt: LenientNt,
                i: int,
                rhs: Production,
                expr: ReduceExprOrAccept) -> None:
            """Recursively check that `expr` is a well-formed reduce expression
            for production `rhs` (grammar[nt][i])."""
            if isinstance(expr, int):
                concrete_len = sum(1 for e in rhs.body if is_concrete_element(e))
                if not (0 <= expr < concrete_len):
                    # Fixed: the body and the nonterminal were passed to
                    # .format() in the wrong order for this message.
                    raise ValueError(
                        "invalid grammar: element number {} out of range for "
                        "production {!r} in grammar[{!r}][{}].reducer ({!r})"
                        .format(expr, rhs.body, nt, i, rhs.reducer))
            elif isinstance(expr, CallMethod):
                if not isinstance(expr.method, str):
                    raise TypeError(
                        "invalid grammar: method names must be strings, "
                        "not {!r}, in grammar[{!r}][{}].reducer"
                        .format(expr.method, nt, i))
                if not expr.method.isidentifier():
                    # Method names of the form "identifier <digits>" are also
                    # allowed (used by the pipeline for generated methods).
                    name, space, pn = expr.method.partition(' ')
                    if space == ' ' and name.isidentifier() and pn.isdigit():
                        pass
                    else:
                        raise ValueError(
                            "invalid grammar: invalid method name {!r} "
                            "(not an identifier), in grammar[{!r}][{}].reducer"
                            .format(expr.method, nt, i))
                for arg_expr in expr.args:
                    check_reduce_expr(nt, i, rhs, arg_expr)
            elif expr is None:
                pass
            elif isinstance(expr, Some):
                check_reduce_expr(nt, i, rhs, expr.inner)
            else:
                raise TypeError(
                    "invalid grammar: unrecognized reduce expression {!r} "
                    "in grammar[{!r}][{}].reducer"
                    .format(expr, nt, i))

        def copy_rhs(
                nt: LenientNt,
                i: int,
                sole_production: bool,
                rhs: LenientProduction,
                context_params: typing.Tuple[str, ...]) -> Production:
            """Desugar and validate one right-hand side, returning a Production."""
            if isinstance(rhs, list):
                # Bare list, no reducer. Desugar to a Production, inferring a
                # reasonable default reducer.
                nargs = sum(1 for e in rhs if is_concrete_element(e))
                reducer: ReduceExpr
                if len(rhs) == 1 and nargs == 1:
                    reducer = 0  # don't call a method, just propagate the value
                else:
                    # Call a method named after the production. If the
                    # nonterminal has exactly one production, there's no need
                    # to include the production index `i` to the method name.
                    if sole_production:
                        method = str(nt)
                    else:
                        method = '{}_{}'.format(nt, i)
                    reducer = CallMethod(method, tuple(range(nargs)))
                rhs = Production(rhs, reducer)

            if not isinstance(rhs, Production):
                raise TypeError(
                    "invalid grammar: grammar[{!r}][{}] should be "
                    "a Production or list of grammar symbols, not {!r}"
                    .format(nt, i, rhs))
            if rhs.condition is not None:
                param, value = rhs.condition
                if param not in context_params:
                    raise TypeError(
                        "invalid grammar: undefined parameter {!r} "
                        "in conditional for grammar[{!r}][{}]"
                        .format(param, nt, i))
            if rhs.reducer != 'accept':
                check_reduce_expr(nt, i, rhs, rhs.reducer)
            assert isinstance(rhs.body, list)
            return rhs.copy_with(body=[
                validate_element(nt, i, j, e, context_params)
                for j, e in enumerate(rhs.body)
            ])

        def copy_nt_def(
                nt: LenientNt,
                nt_def: typing.Union[NtDef, typing.List[LenientProduction]],
        ) -> NtDef:
            """Normalize either an NtDef or a bare list of right-hand sides
            into a validated NtDef."""
            rhs_list: typing.Sequence[LenientProduction]
            if isinstance(nt_def, NtDef):
                for i, param in enumerate(nt_def.params):
                    if not isinstance(param, str):
                        raise TypeError(
                            "invalid grammar: parameter {} of {} should be "
                            "a string, not {!r}"
                            .format(i + 1, nt, param))
                params = nt_def.params
                rhs_list = nt_def.rhs_list
                ty = nt_def.type
            else:
                params = ()
                rhs_list = nt_def
                ty = None
            if not isinstance(rhs_list, list):
                raise TypeError(
                    "invalid grammar: grammar[{!r}] should be either a "
                    "list of right-hand sides or NtDef, not {!r}"
                    .format(nt, type(rhs_list).__name__))
            sole_production = len(rhs_list) == 1
            productions = [copy_rhs(nt, i, sole_production, rhs, params)
                           for i, rhs in enumerate(rhs_list)]
            return NtDef(params, productions, ty)

        def check_nt_key(nt: LenientNt) -> None:
            """Check that `nt` is a well-formed nonterminals-dict key."""
            if isinstance(nt, str):
                if not nt.isidentifier():
                    raise ValueError(
                        "invalid grammar: nonterminal names must be identifiers, not {!r}"
                        .format(nt))
                if nt in self.variable_terminals or nt in self.synthetic_terminals:
                    raise TypeError(
                        "invalid grammar: {!r} is both a nonterminal and a variable terminal"
                        .format(nt))
            elif isinstance(nt, Nt):
                assert keys_are_nt  # checked earlier
                if not (isinstance(nt.name, (str, InitNt))
                        and isinstance(nt.args, tuple)):
                    raise TypeError(
                        "invalid grammar: expected str or Nt(name=str, "
                        "args=tuple) keys in nonterminals dict, got {!r}"
                        .format(nt))
                check_nt_key(nt.name)
                for pair in nt.args:
                    if (not isinstance(pair, tuple)
                            or len(pair) != 2
                            or not isinstance(pair[0], str)
                            or not isinstance(pair[1], bool)):
                        raise TypeError(
                            "invalid grammar: expected tuple((str, bool)) args, got {!r}"
                            .format(nt))
            elif isinstance(nt, InitNt):
                # Users don't include init nonterminals when initially creating
                # a Grammar. They are automatically added below. But if this
                # Grammar is being created by hacking on a previous Grammar, it
                # will already have them.
                if not isinstance(nt.goal, Nt):
                    raise TypeError(
                        "invalid grammar: InitNt.goal should be a nonterminal, "
                        "got {!r}"
                        .format(nt))
                # nt.goal is a "use", not a "def". Check it like a use.
                # Bogus question marks appear in error messages :-|
                validate_element(nt, '?', '?', nt.goal, ())
                if nt.goal not in my_goal_nts:
                    raise TypeError(
                        "invalid grammar: nonterminal referenced by InitNt "
                        "is not in the list of goals: {!r}"
                        .format(nt))
            else:
                raise TypeError(
                    "invalid grammar: expected string keys in nonterminals dict, got {!r}"
                    .format(nt))

        def validate_nt(
                nt: LenientNt,
                nt_def: LenientNtDef
        ) -> typing.Tuple[LenientNt, NtDef]:
            """Validate one (key, definition) pair; return the normalized pair."""
            check_nt_key(nt)
            if isinstance(nt, InitNt):
                # Check the form of init productions. Initially these look like
                # [[goal]], but after the pipeline goes to work, they can be
                # [[Optional(goal)]] or [[], [goal]].
                if not isinstance(nt_def, NtDef):
                    raise TypeError(
                        "invalid grammar: key {!r} must map to "
                        "value of type NtDef, not {!r}"
                        .format(nt, nt_def))
                rhs_list = nt_def.rhs_list
                g = nt.goal
                if (rhs_list != [Production([g], 0),
                                 Production([Nt(nt, ()), End()], 'accept')]
                        and rhs_list != [Production([Optional(g)], 0),
                                         Production([Nt(nt, ()), End()], 'accept')]
                        and rhs_list != [Production([End()], 'accept'),
                                         Production([g, End()], 'accept'),
                                         Production([Nt(nt, ()), End()], 'accept')]):
                    raise ValueError(
                        "invalid grammar: grammar[{!r}] is not one of "
                        "the expected forms: got {!r}"
                        .format(nt, rhs_list))
            return nt, copy_nt_def(nt, nt_def)

        self.nonterminals = {}
        for nt1, nt_def1 in my_nonterminals.items():
            nt, nt_def = validate_nt(nt1, nt_def1)
            self.nonterminals[nt] = nt_def

        # Register each synthetic terminal as a nonterminal whose productions
        # are its representations.
        for syn_term_name, t_set in synthetic_terminals.items():
            nt_def = NtDef((syn_term_name,),
                           [Production([e], 0) for e in t_set],
                           None)
            nt, nt_def = validate_nt(syn_term_name, nt_def)
            self.nonterminals[nt] = nt_def
            # Remove synthetic terminals from the list of terminals.
            all_terminals.remove(syn_term_name)

        self.terminals = OrderedFrozenSet(all_terminals)

        # Check types of reduce expressions and infer method types. But if the
        # caller passed in precalculated type info, skip it -- otherwise we
        # would redo type checking many times as we make minor changes to the
        # Grammar along the pipeline.
        if method_types is None:
            types.infer_types(self)
        else:
            for nt, nt_def in self.nonterminals.items():
                assert isinstance(nt_def, NtDef)
                assert isinstance(nt_def.type, types.Type)
            self.methods = method_types

        # Synthesize "init" nonterminals.
        self.init_nts = []
        for goal in my_goal_nts:
            # Convert str goals to Nt objects and validate.
            # NOTE(review): `ok` is unbound if `goal` is neither str nor Nt;
            # assumed unreachable given the key validation above — confirm.
            if isinstance(goal, str):
                ok = goal in str_to_nt
                if ok:
                    goal = str_to_nt[goal]
            elif isinstance(goal, Nt):
                if keys_are_nt:
                    ok = goal in my_nonterminals
                else:
                    ok = goal.name in my_nonterminals
            if not ok:
                raise ValueError(
                    "goal nonterminal {!r} is undefined".format(goal))
            assert isinstance(goal, Nt)

            # Weird, but the key of an init nonterminal really is
            # `Nt(InitNt(Nt(goal_name, goal_args)), ())`. It takes no
            # arguments, but it refers to a goal that might take arguments.
            init_nt = InitNt(goal)
            init_key: LenientNt = init_nt
            goal_nt = Nt(init_nt, ())
            if keys_are_nt:
                init_key = goal_nt
            if init_key not in self.nonterminals:
                self.nonterminals[init_key] = NtDef(
                    (),
                    [Production([goal], 0),
                     Production([goal_nt, End()], 'accept')],
                    types.NoReturnType)
            self.init_nts.append(goal_nt)

        # Add the various execution backends which would rely on the same
        # parse table.
        self.exec_modes = exec_modes
        self.type_to_modes = type_to_modes

    # The argument is a list of extension.GrammarExtensions. The annotation is
    # vague because this module does not import the extension module. It would
    # be a cyclic dependency.
    def patch(self, extensions: typing.List) -> None:
        """Apply grammar extensions in place; no-op for an empty list."""
        assert self.type_to_modes is not None
        assert self.exec_modes is not None
        if extensions == []:
            return
        # Copy of nonterminals which would be mutated by the patches.
        nonterminals = copy.copy(self.nonterminals)
        for ext in extensions:
            # Add the given trait to the execution mode, depending on which
            # type it got implemented for.
            for mode in self.type_to_modes[ext.target.for_type]:
                self.exec_modes[mode].add(ext.target.trait)
            # Apply grammar transformations.
            ext.apply_patch(self, nonterminals)
        # Replace with the modified version of nonterminals
        self.nonterminals = nonterminals

    def intern(self, obj: Internable) -> Internable:
        """Return a shared copy of the immutable object `obj`.

        This saves memory and consistent use allows code to use `is` for
        equality testing.
        """
        try:
            return self._cache[obj]
        except KeyError:
            self._cache[obj] = obj
            return obj

    # Terminals are tokens that must appear verbatim in the input wherever
    # they appear in the grammar, like the operators '+' '-' '*' '/' and
    # brackets '(' ')' in the example grammar.
    def is_terminal(self, element: object) -> bool:
        return type(element) is str

    def expand_set_of_terminals(
            self,
            terminals: typing.Iterable[typing.Union[str, None, ErrorSymbol]]
    ) -> OrderedSet[typing.Union[str, None, ErrorSymbol]]:
        """Copy a set of terminals, replacing any synthetic terminals with
        their representations.

        Returns a new OrderedSet.

        terminals - an iterable of terminals and/or other unique elements like
        None or ErrorSymbol.
        """
        result: OrderedSet[typing.Union[str, None, ErrorSymbol]] = OrderedSet()
        for t in terminals:
            if isinstance(t, str) and t in self.synthetic_terminals:
                result |= self.expand_set_of_terminals(self.synthetic_terminals[t])
            else:
                result.add(t)
        return result

    def goals(self) -> typing.List[Nt]:
        """Return a list of this grammar's goal nonterminals."""
        return [init_nt.name.goal for init_nt in self.init_nts]  # type: ignore

    def with_nonterminals(
            self,
            nonterminals: typing.Mapping[LenientNt, LenientNtDef]
    ) -> Grammar:
        """Return a copy of self with the same attributes except different
        nonterminals."""
        if self.methods is not None:
            for nt_def in nonterminals.values():
                assert isinstance(nt_def, NtDef)
                assert nt_def.type is not None
        return Grammar(
            nonterminals,
            goal_nts=self.goals(),
            variable_terminals=self.variable_terminals,
            synthetic_terminals=self.synthetic_terminals,
            method_types=self.methods,
            exec_modes=self.exec_modes,
            type_to_modes=self.type_to_modes)

    # === A few methods for dumping pieces of grammar.

    def element_to_str(self, e: Element) -> str:
        """Render one production element as human-readable text."""
        if isinstance(e, Nt):
            return e.pretty()
        elif self.is_terminal(e):
            assert isinstance(e, str)
            if e in self.variable_terminals or e in self.synthetic_terminals:
                return e
            return '"' + repr(e)[1:-1] + '"'
        elif isinstance(e, Optional):
            return self.element_to_str(e.inner) + "?"
        elif isinstance(e, LookaheadRule):
            if len(e.set) == 1:
                op = "==" if e.positive else "!="
                s = repr(list(e.set)[0])
            else:
                op = "in" if e.positive else "not in"
                s = '{' + repr(list(e.set))[1:-1] + '}'
            return "[lookahead {} {}]".format(op, s)
        elif isinstance(e, End):
            return ""
        elif e is NoLineTerminatorHere:
            return "[no LineTerminator here]"
        elif isinstance(e, CallMethod):
            return "{{ {} }}".format(expr_to_str(e))
        else:
            return str(e)

    def symbols_to_str(self, rhs: typing.Iterable[Element]) -> str:
        return " ".join(self.element_to_str(e) for e in rhs)

    def rhs_to_str(self, rhs: LenientProduction) -> str:
        """Render a right-hand side (Production or bare symbol list)."""
        if isinstance(rhs, Production):
            if rhs.condition is None:
                prefix = ''
            else:
                param, value = rhs.condition
                if value is True:
                    condition = "+" + param
                elif value is False:
                    condition = "~" + param
                else:
                    condition = "{} == {!r}".format(param, value)
                prefix = "#[if {}] ".format(condition)
            return prefix + self.rhs_to_str(rhs.body)
        elif len(rhs) == 0:
            return "[empty]"
        else:
            return self.symbols_to_str(rhs)

    def nt_to_str(self, nt: LenientNt) -> str:
        if isinstance(nt, Nt):
            return self.element_to_str(nt)
        else:
            return str(nt)

    def production_to_str(
            self,
            nt: LenientNt,
            rhs: LenientProduction,
            *reducer: ReduceExpr
    ) -> str:
        # As we have two ways of representing productions at the moment, just
        # take multiple arguments :(
        return "{} ::= {}{}".format(
            self.nt_to_str(nt),
            self.rhs_to_str(rhs),
            "".join(" => " + expr_to_str(expr) for expr in reducer))

    # The type of `item` is `lr0.LRItem`. No annotation because this module
    # does not import `lr0`. It would be a cyclic dependency.
    def lr_item_to_str(self, prods: typing.List, item: typing.Any) -> str:
        prod = prods[item.prod_index]
        if item.lookahead is None:
            la = []
        else:
            la = [self.element_to_str(item.lookahead)]
        return "{} ::= {} >> {{{}}}".format(
            self.element_to_str(prod.nt),
            " ".join([self.element_to_str(e) for e in prod.rhs[:item.offset]]
                     + ["\N{MIDDLE DOT}"]
                     + la
                     + [self.element_to_str(e) for e in prod.rhs[item.offset:]]),
            ", ".join(
                "$" if t is None else self.element_to_str(t)
                for t in item.followed_by)
        )

    def item_set_to_str(
            self,
            prods: typing.List,
            item_set: OrderedFrozenSet
    ) -> str:
        return "{{{}}}".format(
            ", ".join(self.lr_item_to_str(prods, item) for item in item_set)
        )

    def expand_terminal(self, t: str) -> OrderedFrozenSet[str]:
        """Return the set of terminals that `t` can stand for (itself, unless
        it is a synthetic terminal)."""
        return self.synthetic_terminals.get(t) or OrderedFrozenSet([t])

    def compatible_elements(self, e1: Element, e2: Element) -> bool:
        # "type: ignore" because mypy doesn't know that `self.is_terminal(e1)`
        # means `e1` is a terminal, and thus `self.expand_terminal(e1)` is OK.
        return (e1 == e2
                or (self.is_terminal(e1)
                    and self.is_terminal(e2)
                    and len(self.expand_terminal(e1)  # type: ignore
                            & self.expand_terminal(e2)) > 0))  # type: ignore

    def compatible_sequences(
            self,
            seq1: typing.Sequence[Element],
            seq2: typing.Sequence[Element]) -> bool:
        """True if the two sequences could be the same terminals."""
        # Fixed: previously compared len(seq1) with itself, so sequences of
        # different lengths were treated as compatible whenever the zipped
        # prefix matched.
        return (len(seq1) == len(seq2)
                and all(self.compatible_elements(e1, e2)
                        for e1, e2 in zip(seq1, seq2)))

    def dump(self) -> None:
        """Print the whole grammar in a human-readable notation."""
        for nt, nt_def in self.nonterminals.items():
            left_side = self.nt_to_str(nt)
            if nt_def.params:
                left_side += "[" + ", ".join(nt_def.params) + "]"
            print(left_side + " ::=")
            for rhs in nt_def.rhs_list:
                print("   ", self.rhs_to_str(rhs))
            print()

    def dump_type_info(self) -> None:
        """Print the inferred type of each nonterminal and method."""
        for nt, nt_def in self.nonterminals.items():
            print(nt, nt_def.type)
        for name, mty in self.methods.items():
            print("fn {}({}) -> {}"
                  .format(name,
                          ", ".join(str(ty) for ty in mty.argument_types),
                          str(mty.return_type)))

    def is_shifted_element(self, e: Element) -> bool:
        """True if `e` occupies a parser-stack slot when shifted (i.e. it is
        not a zero-width annotation like a lookahead rule)."""
        if isinstance(e, Nt):
            return True
        elif self.is_terminal(e):
            return True
        elif isinstance(e, Optional):
            return True
        elif isinstance(e, LookaheadRule):
            return False
        elif isinstance(e, End):
            return True
        elif e is NoLineTerminatorHere:
            return True
        return False


@dataclass(frozen=True)
class InitNt:
    """InitNt(goal) is the name of the init nonterminal for the given goal.

    One init nonterminal is created internally for each goal symbol in the
    grammar.

    The idea is to have a nonterminal that the user has no control over, that
    is never used in any production, but *only* as an entry point for the
    grammar, that always has a single production "init_nt ::= goal_nt". This
    predictable structure makes it easier to get into and out of parsing at
    run time.

    When an init nonterminal is matched, we take the "accept" action rather
    than a "reduce" action.
    """
    goal: Nt


# *** Elements ****************************************************************
#
# Elements are the things that can appear in the .body list of a Production:
#
#   * Strings represent terminals (see `Grammar.is_terminal`)
#
#   * `Nt` objects refer to nonterminals.
#
#   * `Optional` objects represent optional elements.
#
#   * `LookaheadRule` objects are like lookahead assertions in regular
#     expressions.
#
#   * The `NoLineTerminatorHere` singleton object can appear between two other
#     symbols to rule out line breaks between them.
#
#   * `ErrorSymbol` objects never match anything produced by the lexer.
#     Instead they match an ErrorToken that's artificially injected into the
#     token stream at runtime, by the parser itself, just before a token that
#     does not match anything else.

def is_concrete_element(e: Element) -> bool:
    """True if parsing the element `e` produces a value.

    A production's concrete elements can be used in reduce expressions.
    """
    return not isinstance(e, (LookaheadRule, ErrorSymbol, NoLineTerminatorHereClass))


# Nonterminals in the ECMAScript grammar can be parameterized; NtParameter is
# the type of the parameters.
#
# For example, `BindingIdentifier[?Yield, ?Await]` is represented as
# `Nt('BindingIdentifier', (('Yield', Var('Yield')), ('Await', Var('Await'))))`.
#
# A nonterminal-parameter-expression is represented by either a Var object or
# the actual value, a boolean. (In theory, parameters don't *have* to be
# boolean; all the code would probably work for anything hashable. In
# practice, all parameters in the ECMAScript grammar are boolean.)
NtParameter = typing.Hashable


class Nt:
    """Nt(name, ((param0, arg0), ...)) - An invocation of a nonterminal.

    Nonterminals are like lambdas. Each nonterminal in a grammar is defined by
    an NtDef which has 0 or more parameters. Parameter names `param0...` are
    strings. The actual arguments `arg0...` are NtParameters (see above).
    """
    __slots__ = ['name', 'args']

    # The nonterminal being invoked: a plain name or an InitNt wrapper.
    name: typing.Union[str, InitNt]
    # Ordered (parameter-name, argument) pairs for this invocation.
    args: typing.Tuple[typing.Tuple[str, NtParameter], ...]

    def __init__(self,
                 name: typing.Union[str, InitNt],
                 args: typing.Tuple[typing.Tuple[str, NtParameter], ...] = ()):
        self.name = name
        self.args = args

    def __hash__(self) -> int:
        # The 'nt' tag keeps Nt hashes from colliding with plain tuples of
        # (name, args); hash must agree with __eq__ below.
        return hash(('nt', self.name, self.args))

    def __eq__(self, other: object) -> bool:
        return (isinstance(other, Nt)
                and (self.name, self.args) == (other.name, other.args))

    def __repr__(self) -> str:
        if self.args:
            return 'Nt({!r}, {!r})'.format(self.name, self.args)
        else:
            return 'Nt({!r})'.format(self.name)

    def pretty(self) -> str:
        """Unique version of this Nt to use in the Python runtime.

        Also used in debug/verbose output.
        """
        # +name / ~name for concrete booleans; ?name when the argument is a
        # Var passing the same-named parameter through unchanged.
        def arg_to_str(name: str, value: NtParameter) -> str:
            if value is True:
                return '+' + name
            elif value is False:
                return '~' + name
            elif isinstance(value, Var):
                if value.name == name:
                    return '?' + value.name
                return name + "=" + value.name
            else:
                return name + "=" + repr(value)

        if isinstance(self.name, InitNt):
            return "Start_" + self.name.goal.pretty()
        if len(self.args) == 0:
            return self.name
        return "{}[{}]".format(self.name,
                               ", ".join(arg_to_str(name, value)
                                         for name, value in self.args))


@dataclass(frozen=True)
class Optional:
    """Optional(nt) matches either nothing or the given nt.

    Optional elements are expanded out before states are calculated, so the
    core of the algorithm never sees them.
    """
    inner: Element


@dataclass(frozen=True)
class Literal:
    """Literal(str) matches a sequence of characters.

    Literal elements are sequences of characters which are expected to appear
    verbatim in the input.
    """
    text: str


@dataclass(frozen=True)
class UnicodeCategory:
    """UnicodeCategory(str) matches any character with a category matching the
    cat_prefix.

    UnicodeCategory elements are a set of literal elements which correspond to
    a given unicode cat_prefix.
    """
    cat_prefix: str


@dataclass(frozen=True)
class LookaheadRule:
    """LookaheadRule(set, pos) imposes a lookahead restriction on whatever
    follows.

    It never consumes any tokens itself. Instead, the right-hand side

        [LookaheadRule(frozenset(['a', 'b']), False), 'Thing']

    matches a Thing that does not start with the token `a` or `b`.
    """
    set: typing.FrozenSet[str]
    positive: bool


# A lookahead restriction really just specifies a set of allowed terminals.
#
# -   No lookahead restriction at all is equivalent to a rule specifying all
#     terminals.
#
# -   A positive lookahead restriction explicitly lists all allowed tokens.
#
# -   A negative lookahead restriction instead specifies the set of all tokens
#     except a few.
#
def lookahead_contains(rule: typing.Optional[LookaheadRule], t: str) -> bool:
    """True if the given lookahead restriction `rule` allows the terminal `t`."""
    # None means "no restriction": every terminal is allowed.
    return (rule is None
            or (t in rule.set
                if rule.positive
                else t not in rule.set))


def lookahead_intersect(
        a: typing.Optional[LookaheadRule],
        b: typing.Optional[LookaheadRule]
) -> typing.Optional[LookaheadRule]:
    """Returns a single rule enforcing both `a` and `b`, allowing only
    terminals that pass both."""
    if a is None:
        return b
    elif b is None:
        return a
    elif a.positive:
        if b.positive:
            # allowed ∩ allowed
            return LookaheadRule(a.set & b.set, True)
        else:
            # allowed by a, minus what b forbids
            return LookaheadRule(a.set - b.set, True)
    else:
        if b.positive:
            # allowed by b, minus what a forbids
            return LookaheadRule(b.set - a.set, True)
        else:
            # forbidden ∪ forbidden
            return LookaheadRule(a.set | b.set, False)


class NoLineTerminatorHereClass:
    # Singleton marker class; the one instance is NoLineTerminatorHere below.
    def __str__(self) -> str:
        return 'NoLineTerminatorHere'


NoLineTerminatorHere = NoLineTerminatorHereClass()


# Exclusion elements. An Exclude element matches its `inner` element only if
# none of the elements in `exclusion_list` also match.
@dataclass(frozen=True)
class Exclude:
    """Exclude(nt1, nt2) matches if nt1 matches and nt2 does not."""
    inner: Element
    exclusion_list: typing.Tuple[Element, ...]


# End. This is used to represent the terminal which is infinitely produced by
# the lexer when input end is reached.
@dataclass(frozen=True)
class End:
    """End() represents the end of the input content."""


# Special grammar symbol that can be consumed to handle a syntax error.
#
# The error code is passed to an error-handling routine at run time which
# decides if the error is recoverable or not.
@dataclass(frozen=True)
class ErrorSymbol:
    """Special grammar symbol that can be consumed to handle a syntax error."""
    error_code: int


# Everything that may appear in a Production body (see the Elements comment
# block above).
Element = typing.Union[
    str, Optional, Literal, UnicodeCategory, Exclude, Nt, LookaheadRule, End,
    ErrorSymbol, NoLineTerminatorHereClass, CallMethod]


@dataclass
class NtDef:
    """Definition of a nonterminal.

    Instances have three attributes:

    .params - Tuple of strings, the names of the parameters.

    .rhs_list - List of Production objects. Arguments to Nt elements in the
    productions can be Var(s) where `s in params`, indicating that parameter
    should be passed through unchanged.

    .type - The type of runtime value produced by parsing an instance of this
    nonterminal, or None.

    An NtDef is a sort of lambda.

    Some languages have constructs that are allowed or disallowed in
    particular situations. For example, in many languages `return` statements
    are allowed only inside functions or methods. The ECMAScript standard
    (5.1.5 "Grammar Notation") offers this example of the notation it uses to
    specify this sort of thing:

        StatementList [Return] :
            [+Return] ReturnStatement
            ExpressionStatement

    This is an abbreviation for:

        StatementList :
            ExpressionStatement

        StatementList_Return :
            ReturnStatement
            ExpressionStatement

    We offer NtDef.params as a way of representing this in our system.

        "StatementList": NtDef(("Return",), [
            Production(["ReturnStatement"], condition=("Return", True)),
            ["ExpressionStatement"],
        ], None),

    This is an abbreviation for:

        "StatementList_0": [
            ["ExpressionStatement"],
        ],
        "StatementList_1": [
            ["ReturnStatement"],
            ["ExpressionStatement"],
        ],
    """
    __slots__ = ['params', 'rhs_list', 'type']

    params: typing.Tuple[str, ...]
    rhs_list: typing.List[Production]
    type: typing.Optional[types.Type]

    def with_rhs_list(self, new_rhs_list: typing.List[Production]) -> NtDef:
        """Return a copy of self with rhs_list replaced."""
        return dataclasses.replace(self, rhs_list=new_rhs_list)


@dataclass(frozen=True)
class Var:
    """Var(name) represents the run-time value of the parameter with the given
    name."""
    name: str


# A reduce expression: a component index, a builder-method call, a placeholder
# None, or a Some wrapper (see the reduce-expression commentary above).
ReduceExpr = typing.Union[int, CallMethod, None, Some]
# Like ReduceExpr, but also allowing the "accept" action (a string).
ReduceExprOrAccept = typing.Union[ReduceExpr, str]


# The Grammar constructor is very lax about the types you pass to it. It can
# accept a `Dict[str, List[List[str]]]`, for example; it copies the data into
# NtDef and Production objects.
# # The `Lenient` types below are the relaxed types that Grammar() actually # accepts. LenientNt = typing.Union[Nt, str, InitNt] LenientProduction = typing.Union[Production, typing.List[Element]] LenientNtDef = typing.Union[NtDef, typing.List[LenientProduction]]