"""Generate a simple LR0 state graph from a CanonicalGrammar.

The resulting graph may contain inconsistent states, which must be resolved by the
ParseTable before a parser can be generated.
"""

from __future__ import annotations
# mypy: disallow-untyped-defs, disallow-incomplete-defs, disallow-untyped-calls

import collections
from dataclasses import dataclass
import typing

from .actions import (Accept, Action, CheckNotOnNewLine, FunCall, Lookahead,
                      OutputExpr, Unwind, Reduce, Seq)
from .ordered import OrderedFrozenSet
from .grammar import (CallMethod, Element, End, ErrorSymbol, Grammar,
                      LookaheadRule, NoLineTerminatorHere, Nt, ReduceExpr,
                      ReduceExprOrAccept, Some)
from .rewrites import CanonicalGrammar, Prod
from . import types


# ## LR parsers: Why?
#
# Consider a single production `expr ::= expr "+" term` being parsed in a
# recursive descent parser. As we read the source left to right, our parser's
# internal state looks like this (marking our place with a dot):
#
#     expr ::= · expr "+" term
#     expr ::= expr · "+" term
#     expr ::= expr "+" · term
#     expr ::= expr "+" term ·
#
# As we go, we build an AST. First we parse an *expr* and temporarily set it
# aside. Then we expect to see a `+` operator. Then we parse a *term*. Then,
# having got to the end, we create an AST node for the whole addition
# expression.
#
# Since the grammar is nested, at run time we really have a stack of these
# intermediate states.
#
# But how do we decide which production we should be matching? Often the first
# token just tells us: the `while` keyword means there's a `while` statement
# coming up. Grammars in which this is always the case are called LL(1). But
# while it's possible to wrangle *most* of the ES grammar into an LL(1) form,
# not everything works out. For example, here's the ES assignment syntax (much
# simplified):
#
#     assignment ::= sum
#     assignment ::= primitive "=" assignment
#     sum ::= primitive
#     sum ::= sum "+" primitive
#     primitive ::= VAR
#
# Note that the bogus assignment `a + b = c` doesn't parse because `a + b`
# isn't a primitive.
#
# Suppose we want to parse an expression, and the first token is `a`. We don't
# know yet which *assignment* production to use. So this grammar is not in
# LL(1).
#
#
# ## LR parsers: How
#
# An LR parser generator allows for a *superposition* of states. While parsing,
# we can sometimes have multiple productions at once that might match. It's
# like how in quantum theory, Schrödinger’s cat can tentatively be both alive
# and dead until decisive information is observed.
#
# As we read `a = b + c`, our parser's internal state is like this
# (eliding a few steps, like how we recognize that `a` is a primitive):
#
#     current point in input  superposed parser state
#     ----------------------  -----------------------
#     · a = b + c             assignment ::= · sum
#                             assignment ::= · primitive "=" assignment
#
#       (Then, after recognizing that `a` is a *primitive*...)
#
#     a · = b + c             sum ::= primitive ·
#                             assignment ::= primitive · "=" assignment
#
#       (The next token, `=`, rules out the first alternative,
#       collapsing the waveform...)
#
#     a = · b + c             assignment ::= primitive "=" · assignment
#
#       (After recognizing that `b` is a primitive, we again have options:)
#
#     a = b · + c             sum ::= primitive ·
#                             assignment ::= primitive · "=" assignment
#
# And so on. We call each dotted production an "LR item", and the superposition
# of several LR items is called a "state".  (It is not meant to be clear yet
# just *how* the parser knows which rules might match.)
#
# Since the grammar is nested, at run time we'll have a stack of these parser
# state superpositions.
#
# The uncertainty in LR parsing means that code for an LR parser written by
# hand, in the style of recursive descent, would read like gibberish. What we
# can do instead is generate a parser table. (The toy sketch below illustrates
# how this superposition of items is built.)
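
# --- Illustrative sketch (not used by the generator) -------------------------
# A minimal, self-contained toy version of the "superposition" above, using
# the simplified assignment grammar. Items are plain (nt, production index,
# dot offset) tuples here; this is hand-rolled for illustration, not the
# representation this module actually uses.

_DEMO_GRAMMAR: typing.Dict[str, typing.List[typing.List[str]]] = {
    "assignment": [["sum"], ["primitive", "=", "assignment"]],
    "sum": [["primitive"], ["sum", "+", "primitive"]],
    "primitive": [["VAR"]],
}


def _demo_closure(
        items: typing.Set[typing.Tuple[str, int, int]]
) -> typing.Set[typing.Tuple[str, int, int]]:
    # Whenever an item's dot sits in front of a non-terminal, add fresh items
    # at the start of each of that non-terminal's productions. The result is
    # the full superposition of productions that might match next.
    result = set(items)
    todo = list(items)
    while todo:
        nt, prod_index, offset = todo.pop()
        rhs = _DEMO_GRAMMAR[nt][prod_index]
        if offset < len(rhs) and rhs[offset] in _DEMO_GRAMMAR:
            for i in range(len(_DEMO_GRAMMAR[rhs[offset]])):
                item = (rhs[offset], i, 0)
                if item not in result:
                    result.add(item)
                    todo.append(item)
    return result

# For example, `_demo_closure({("assignment", 0, 0)})` -- the state just
# before reading an assignment -- also contains both `sum` items and the
# `primitive` item, since any of them might be starting at this point.
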


@dataclass(frozen=True, order=True)
class LRItem:
    """A snapshot of progress through a single specific production.

    *   `prod_index` identifies the production. (Every production in the grammar
        gets a unique index; see the loop that computes
        prods_with_indexes_by_nt.)

    *   `offset` is the position of the cursor within the production.

    `lookahead` and `followed_by` are two totally different kinds of lookahead.

    *   `lookahead` is the LookaheadRule, if any, that applies to the immediately
        upcoming input. It is present only if this LRItem is subject to a
        `[lookahead]` restriction; otherwise it's None. These restrictions can't
        extend beyond the end of a production, or else the grammar is invalid.
        This implements the lookahead restrictions in the ECMAScript grammar.
        It is not part of any account of LR I've seen.

    *   `followed_by` is a completely different kind of lookahead restriction.
        This is the kind of lookahead that is a central part of canonical LR
        table generation.  It applies to the token *after* the whole current
        production, so `followed_by` always applies to completely different and
        later tokens than `lookahead`.  `followed_by` is a set of terminals; if
        `None` is in this set, it means `END`, not that the LRItem is
        unrestricted.
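
    For example, an item for `assignment ::= primitive "=" · assignment`,
    where the whole production may be followed by `;` or by the end of the
    input, might look like this (the prod_index value here is hypothetical):

        LRItem(prod_index=12, offset=2, lookahead=None,
               followed_by=OrderedFrozenSet([";", None]))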
    """

    prod_index: int
    offset: int
    lookahead: typing.Optional[LookaheadRule]
    followed_by: OrderedFrozenSet[typing.Optional[str]]


# A Term is the label on an edge from one state to another. It's normally a
# terminal, nonterminal, or epsilon action. A state can also have a special
# catchall edge, labeled with an ErrorSymbol.
ShiftedTerm = typing.Union[str, Nt, ErrorSymbol]
Term = typing.Union[ShiftedTerm, Action]


def on_stack(grammar: Grammar, term: Element) -> bool:
    """Returns whether an element of a production is consuming stack space or
    not."""
    if isinstance(term, Nt):
        return True
    elif grammar.is_terminal(term):
        return True
    elif isinstance(term, LookaheadRule):
        return False
    elif isinstance(term, ErrorSymbol):
        return True
    elif isinstance(term, End):
        return True
    elif term is NoLineTerminatorHere:
        # "No line terminator here" is a property of the next token to be
        # shifted. It is implemented as an action which, once the next term
        # has been shifted, checks whether that term is on a new line.
        return False
    elif isinstance(term, CallMethod):
        return False
    raise ValueError(term)
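
# For example, in a production such as
# `expr ::= [lookahead not in {"++"}] expr "+" term` (a hypothetical rule),
# only `expr`, `"+"`, and `term` occupy stack slots; the lookahead restriction
# does not, so reducing this production pops 3 elements rather than 4.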


def callmethods_to_funcalls(
        expr: ReduceExprOrAccept,
        pop: int,
        ret: str,
        depth: int,
        funcalls: typing.List[Action]
) -> OutputExpr:
    """Lower a reduce-expression to the OutputExpr language.

    CallMethod expressions are replaced with FunCalls; all new FunCalls created
    in this way are appended to `funcalls`.
    """

    if isinstance(expr, int):
        stack_index = pop - expr
        if depth == 0:
            call = FunCall("id", (stack_index,), fallible=False,
                           trait=types.Type("AstBuilder"), set_to=ret)
            funcalls.append(call)
            return ret
        else:
            return stack_index
    elif isinstance(expr, Some):
        res = callmethods_to_funcalls(expr.inner, pop, ret, depth, funcalls)
        # "type: ignore" because Some is not generic, unfortunately.
        return Some(res)  # type: ignore
    elif expr is None:
        return None
    elif isinstance(expr, CallMethod):
        def convert_args(args: typing.Iterable[ReduceExpr]) -> typing.Iterator[OutputExpr]:
            for i, arg in enumerate(args):
                yield callmethods_to_funcalls(arg, pop, ret + "_{}".format(i), depth + 1, funcalls)
        args = tuple(convert_args(expr.args))
        call = FunCall(expr.method, args,
                       trait=expr.trait,
                       fallible=expr.fallible,
                       set_to=ret)
        funcalls.append(call)
        return ret
    elif expr == "accept":
        funcalls.append(Accept())
        return ret
    else:
        raise ValueError(expr)
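
# For example, lowering the hypothetical reducer `CallMethod("add", (0, 2),
# ...)` with `pop=3` and `ret="value"` appends a single
# `FunCall("add", (3, 1), ..., set_to="value")` to `funcalls` and returns
# "value": each integer argument is rewritten as its offset from the top of
# the stack, and the result of the call is bound to the name in `ret`.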


class LR0Generator:
    """Provide a way to iterate over the grammar, given a set of LR items."""
    __slots__ = [
        "grammar",
        "lr_items",
        "key",
        "_hash",
    ]

    grammar: CanonicalGrammar
    lr_items: OrderedFrozenSet[LRItem]
    key: str
    _hash: int

    def __init__(
            self,
            grammar: CanonicalGrammar,
            lr_items: typing.Iterable[LRItem] = ()
    ) -> None:
        self.grammar = grammar
        self.lr_items = OrderedFrozenSet(lr_items)
        # This is used to reuse states which have already been encoded.
        self.key = "".join(repr((item.prod_index, item.offset)) + "\n"
                           for item in sorted(self.lr_items))
        self._hash = hash(self.key)

    def __eq__(self, other: object) -> bool:
        return isinstance(other, LR0Generator) and self.key == other.key

    def __hash__(self) -> int:
        return self._hash

    def __str__(self) -> str:
        s = ""
        for lr_item in self.lr_items:
            s += self.grammar.grammar.lr_item_to_str(self.grammar.prods, lr_item)
            s += "\n"
        return s

    def stable_locations(self) -> OrderedFrozenSet[str]:
        locations = []
        for lr_item in self.lr_items:
            locations.append(self.grammar.grammar.lr_item_to_str(self.grammar.prods, lr_item))
        return OrderedFrozenSet(sorted(locations))

    @staticmethod
    def start(grammar: CanonicalGrammar, nt: Nt) -> LR0Generator:
        lr_items: typing.List[LRItem] = []
        # Visit the initial non-terminal, as well as every non-terminal which
        # appears as the first element of a reachable production.
        todo: typing.Deque[Nt] = collections.deque()
        visited_nts: typing.Set[Nt] = set()
        todo.append(nt)
        while todo:
            nt = todo.popleft()
            if nt in visited_nts:
                continue
            visited_nts.add(nt)
            for prod_index, _ in grammar.prods_with_indexes_by_nt[nt]:
                assert isinstance(prod_index, int)
                lr_items.append(LRItem(
                    prod_index=prod_index,
                    offset=0,
                    lookahead=None,
                    followed_by=OrderedFrozenSet(),
                ))

                prod = grammar.prods[prod_index]
                assert isinstance(prod, Prod)
                try:
                    term = prod.rhs[0]
                    if isinstance(term, Nt):
                        todo.append(term)
                except IndexError:
                    pass
        return LR0Generator(grammar, lr_items)

    def transitions(self) -> typing.Dict[Term, LR0Generator]:
        """Returns the dictionary which maps the state transitions with the next
        LR0Generators. This can be used to generate the states and the
        transitions between the states of an LR0 parse table."""
        followed_by: typing.DefaultDict[Term, typing.List[LRItem]]
        followed_by = collections.defaultdict(list)
        for lr_item in self.lr_items:
            self.item_transitions(lr_item, followed_by)

        return {k: LR0Generator(self.grammar, lr_items)
                for k, lr_items in followed_by.items()}

    def item_transitions(
            self,
            lr_item: LRItem,
            followed_by: typing.DefaultDict[Term, typing.List[LRItem]]
    ) -> None:
        """Given one LRItem, register all the transitions and LR Items reachable
        through these transitions."""
        prod = self.grammar.prods[lr_item.prod_index]
        assert isinstance(prod, Prod)

        # Read the term located at the offset in the production.
        if lr_item.offset < len(prod.rhs):
            term = prod.rhs[lr_item.offset]
            if isinstance(term, Nt):
                pass
            elif self.grammar.grammar.is_terminal(term):
                pass
            elif isinstance(term, LookaheadRule):
                term = Lookahead(term.set, term.positive)
            elif isinstance(term, ErrorSymbol):
                # An ErrorSymbol is considered a terminal. These terminals are
                # produced on demand by the error-handling code.
                pass
            elif isinstance(term, End):
                # End is considered a terminal which is produced once by the
                # lexer upon reaching the end of the input. However, the
                # parser might finish without consuming the End terminal, if
                # there is no ambiguity about whether the End is expected.
                pass
            elif term is NoLineTerminatorHere:
                # Check that the following terminal is not on a new line; if
                # it is, this produces a syntax error. The argument is the
                # terminal offset.
                term = CheckNotOnNewLine()
            elif isinstance(term, CallMethod):
                funcalls: typing.List[Action] = []
                pop = sum(1 for e in prod.rhs[:lr_item.offset] if on_stack(self.grammar.grammar, e))
                callmethods_to_funcalls(term, pop, "expr", 0, funcalls)
                term = Seq(funcalls)

        elif lr_item.offset == len(prod.rhs):
            # Add the reduce operation as a state transition in the generated
            # parse table. (TODO: this assumes that the canonical form did not
            # move the reduce action to be part of the production.)
            pop = sum(1 for e in prod.rhs if on_stack(self.grammar.grammar, e))
            term = Reduce(Unwind(prod.nt, pop))
            expr = prod.reducer
            if expr is not None:
                funcalls = []
                callmethods_to_funcalls(expr, pop, "value", 0, funcalls)
                term = Seq(funcalls + [term])
        else:
            # No edges after the reduce operation.
            return

        # Add terminals, non-terminals, and lookahead actions as transitions
        # to the next LR item.
        new_transition = term not in followed_by
        followed_by[term].append(LRItem(
            prod_index=lr_item.prod_index,
            offset=lr_item.offset + 1,
            lookahead=None,
            followed_by=OrderedFrozenSet(),
        ))

        # If the term is a non-terminal, then recursively add transitions from
        # the beginning of all the productions which are matching this
        # non-terminal.
        #
        # Only do it once per non-terminal to avoid infinite recursion on
        # left-recursive grammars.
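        #
        # Worked example, using the simplified assignment grammar from the
        # comment at the top of this file: from the item
        # `assignment ::= · sum`, the edge term is the non-terminal `sum`, so
        # `assignment ::= sum ·` is registered under a new `sum` edge; then
        # `sum ::= · primitive` and `sum ::= · sum "+" primitive` are expanded
        # in turn. The latter finds that a `sum` edge already exists
        # (new_transition is False), so the left recursion stops there.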
        if isinstance(term, Nt) and new_transition:
            for prod_index, _ in self.grammar.prods_with_indexes_by_nt[term]:
                assert isinstance(prod_index, int)
                self.item_transitions(LRItem(
                    prod_index=prod_index,
                    offset=0,
                    lookahead=None,
                    followed_by=OrderedFrozenSet(),
                ), followed_by)
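

# --- Illustrative sketch (not used by the generator) -------------------------
# How `start` and `transitions` might be combined to enumerate the whole LR0
# state graph; the real driver (the ParseTable mentioned in the module
# docstring) performs this traversal with extra bookkeeping, and resolves the
# inconsistent states afterwards.

def _demo_state_graph(
        grammar: CanonicalGrammar,
        goal: Nt
) -> typing.Dict[LR0Generator, typing.Dict[Term, LR0Generator]]:
    init = LR0Generator.start(grammar, goal)
    graph: typing.Dict[LR0Generator, typing.Dict[Term, LR0Generator]] = {}
    todo = collections.deque([init])
    while todo:
        state = todo.popleft()
        if state in graph:
            continue
        # Each state maps its outgoing edge labels (terminals, non-terminals,
        # or actions) to successor states; LR0Generator.__eq__ deduplicates
        # states built from the same LR items.
        graph[state] = state.transitions()
        todo.extend(graph[state].values())
    return graph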