diff options
Diffstat (limited to 'third_party/rust/jsparagus/jsparagus/parse_pgen.py')
-rwxr-xr-x | third_party/rust/jsparagus/jsparagus/parse_pgen.py | 280 |
1 file changed, 280 insertions, 0 deletions
#!/usr/bin/env python

"""parse_pgen.py - Parse grammars written in the pgen parser specification language.

I'm not sure I want to keep this pgen mini-language around; ignore this for now.
"""

import sys
from collections import namedtuple

from .lexer import LexicalGrammar
from .grammar import Grammar, Production, CallMethod, is_concrete_element, Optional
from . import gen
from . import parse_pgen_generated


# Lexical grammar of the pgen language itself: a fixed set of keyword and
# punctuator tokens, plus identifiers, double-quoted strings, $N match
# references, and //-comments. Whitespace and #-comments are skipped.
pgen_lexer = LexicalGrammar(
    "goal nt var token { } ; ? = => ( ) ,",
    r'([ \t\r\n]|#.*)*',
    IDENT=r'[A-Za-z_](?:\w|[_-])*',
    STR=r'"[^\\\n"]*"',
    MATCH=r'\$(?:0|[1-9][0-9]*)',
    COMMENT=r'//.*',
)


def list_of(e, allow_comments=False):
    """Return the productions for a list nonterminal named ``e + 's'``.

    The productions match one or more occurrences of `e` and reduce (via the
    AstBuilder methods `single`/`append`) to a Python list. If
    `allow_comments` is true, a lone COMMENT may also appear and reduces to an
    empty list via `empty`.
    """
    nt = e + 's'
    prods = [
        Production([e], CallMethod('single', (0,))),
        Production([nt, e], CallMethod('append', (0, 1))),
    ]
    if allow_comments:
        prods.append(Production(['COMMENT'], CallMethod('empty', (0,))))
    return prods


def call_method(name, body):
    """Build a CallMethod reducer for `body`.

    Every concrete element of `body` that is not punctuation/keyword noise
    (see `discards`) is passed to the builder method as an argument.
    """
    arg_indexes = []
    current = 0
    for e in body:
        if is_concrete_element(e):
            if e not in discards:
                arg_indexes.append(current)
            # Every concrete element occupies a parse-stack slot, whether or
            # not it is forwarded to the builder, so always advance.
            current += 1

    return CallMethod(name, tuple(arg_indexes))


def prod(body, reducer):
    """Shorthand for Production(); a string reducer becomes a CallMethod."""
    if isinstance(reducer, str):
        reducer = call_method(reducer, body)
    return Production(body, reducer)


# Symbols that carry no information of their own; they are matched but
# dropped from reducer arguments by call_method().
discards = set('token var nt goal Some None = => ; ( ) { } , ?'.split())

# The grammar of the pgen language, expressed using jsparagus's own Grammar
# type. Reducer names refer to methods on AstBuilder below.
# NOTE(review): the 'expr' productions name reducers 'expr_some' and
# 'expr_none', but AstBuilder defines neither — parsing `Some(...)` or `None`
# in a pgen file would fail. Left as-is pending the intended AST shape.
pgen_grammar = Grammar(
    {
        'grammar': [
            [Optional('token_defs'), 'nt_defs']
        ],
        'token_defs': list_of('token_def'),
        'token_def': [
            prod(['token', 'IDENT', '=', 'STR', ';'], 'const_token'),
            prod(['var', 'token', 'IDENT', ';'], 'var_token'),
        ],
        'nt_defs': [
            prod(['nt_def'], 'nt_defs_single'),
            prod(['nt_defs', 'nt_def'], 'nt_defs_append'),
        ],
        'nt_def': [
            prod([Optional('COMMENT'), Optional('goal'), 'nt', 'IDENT', '{',
                  Optional('prods'), '}'], 'nt_def'),
        ],
        'prods': list_of('prod', allow_comments=True),
        'prod': [
            prod(['terms', Optional('reducer'), ';'], 'prod'),
        ],
        'terms': list_of('term'),
        'term': [
            ['symbol'],
            prod(['symbol', '?'], 'optional'),
        ],
        'symbol': [
            prod(['IDENT'], 'ident'),
            prod(['STR'], 'str'),
        ],
        'reducer': [
            # Integer reducer 1 selects the 'expr' element ('=>' is dropped).
            prod(['=>', 'expr'], 1)
        ],
        'expr': [
            prod(['MATCH'], 'expr_match'),
            prod(['IDENT', '(', Optional('expr_args'), ')'], 'expr_call'),
            prod(['Some', '(', 'expr', ')'], 'expr_some'),
            prod(['None'], 'expr_none'),
        ],
        'expr_args': [
            prod(['expr'], 'args_single'),
            prod(['expr_args', ',', 'expr'], 'args_append'),
        ],
    },
    goal_nts=['grammar'],
    variable_terminals=['IDENT', 'STR', 'MATCH', 'COMMENT']
)


# A quoted terminal as it appears in a production body, before check_grammar
# resolves it against the declared tokens.
Literal = namedtuple("Literal", "chars")

# Token set used when a pgen file declares no tokens of its own.
# Each entry is (token_name, image); image None marks a variable token.
default_token_list = [
    ("Var", "var"),
    ("Token", "token"),
    ("Goal", "goal"),
    ("Nt", "nt"),
    ("IDENT", None),
    ("STR", None),
    ("OpenBrace", "{"),
    ("CloseBrace", "}"),
    ("OpenParenthesis", "("),
    ("CloseParenthesis", ")"),
    ("Colon", ":"),
    ("EqualSign", "="),
    ("Asterisk", "*"),
    ("PlusSign", "+"),
    ("MinusSign", "-"),
    ("Slash", "/"),
    ("Semicolon", ";"),
    ("QuestionMark", "?"),
    ("RightArrow", "->"),
    ("Comma", ","),
]


class AstBuilder:
    """Reducer methods invoked by the generated pgen parser.

    The final product, returned by `grammar`, is the triple
    (token_defs, nonterminals_dict, goal_nt_names) consumed by check_grammar.
    """

    def grammar(self, token_defs, nt_defs):
        nonterminals, goal_nts = nt_defs
        # A file that declares no tokens gets the default token set.
        return (token_defs or default_token_list, nonterminals, goal_nts)

    def empty(self, value):
        return []

    def single(self, value):
        return [value]

    def append(self, values, value):
        values.append(value)
        return values

    def const_token(self, name, picture):
        # `picture` is the raw STR token, surrounding quotes included.
        assert picture[0] == '"'
        assert picture[-1] == '"'
        return (name, picture[1:-1])

    def var_token(self, name):
        # Variable tokens (declared `var token X;`) have no fixed image.
        return (name, None)

    def comment(self, comment):
        pass

    def nt_defs_single(self, nt_def):
        # Seed the accumulator with an empty grammar and no goals.
        return self.nt_defs_append(({}, []), nt_def)

    def nt_defs_append(self, grammar_in, nt_def):
        is_goal, nt, prods = nt_def
        grammar, goal_nts = grammar_in
        if nt in grammar:
            raise ValueError("multiple definitions for nt {}".format(nt))
        grammar[nt] = prods
        if is_goal:
            goal_nts.append(nt)
        return grammar, goal_nts

    def nt_def(self, _comment, goal_kw, ident, prods):
        is_goal = goal_kw == "goal"
        # `prods` is None when the braces are empty (Optional('prods')
        # matched nothing); treat that as an empty production list instead
        # of raising TypeError.
        prods = [Production(body, reducer) for body, reducer in prods or []]
        return (is_goal, ident, prods)

    def prod(self, symbols, reducer):
        if reducer is None:
            # Without an explicit reducer, a production with exactly one
            # concrete symbol passes that symbol through; anything longer
            # is ambiguous and must say what it means.
            if sum(1 for e in symbols if is_concrete_element(e)) == 1:
                reducer = 0
            else:
                raise ValueError("reducer required for {!r}".format(symbols))
        return (symbols, reducer)

    def optional(self, sym):
        return Optional(sym)

    def ident(self, sym):
        return sym

    def str(self, sym):
        assert len(sym) > 1
        assert sym[0] == '"'
        assert sym[-1] == '"'
        chars = sym[1:-1]  # This is a bit sloppy.
        return Literal(chars)

    def expr_match(self, match):
        # MATCH tokens look like `$0`, `$1`, ...; reduce to the index.
        assert match.startswith('$')
        return int(match[1:])

    def expr_call(self, ident, args):
        return CallMethod(ident, tuple(args or ()))

    def args_single(self, expr):
        return [expr]

    def args_append(self, args, arg):
        args.append(arg)
        return args


def check_grammar(result):
    """Validate the parsed (tokens, nonterminals, goals) triple.

    Checks for redeclared tokens, duplicate token images, and token/nt name
    clashes; resolves every Literal and token name in production bodies to
    its canonical terminal string. Returns a new (tokens, nonterminals,
    goal_nts) triple; raises ValueError on any inconsistency.
    """
    tokens, nonterminals, goal_nts = result
    tokens_by_name = {}
    tokens_by_image = {}
    for name, image in tokens:
        if name in tokens_by_name:
            raise ValueError("token `{}` redeclared".format(name))
        tokens_by_name[name] = image
        # Only constant tokens have an image; don't pollute the image map
        # with a None key for variable tokens.
        if image is not None:
            if image in tokens_by_image:
                raise ValueError("multiple tokens look like \"{}\"".format(image))
            tokens_by_image[image] = name
        if name in nonterminals:
            raise ValueError("`{}` is declared as both a token and a nonterminal (pick one)".format(name))

    def check_element(nt, i, e):
        # Resolve one element of production `i` of nonterminal `nt`.
        if isinstance(e, Optional):
            return Optional(check_element(nt, i, e.inner))
        elif isinstance(e, Literal):
            if e.chars not in tokens_by_image:
                raise ValueError("in {} production {}: undeclared token \"{}\"".format(nt, i, e.chars))
            return e.chars
        else:
            assert isinstance(e, str), e.__class__.__name__
            if e in nonterminals:
                return e
            elif e in tokens_by_name:
                # Constant tokens are canonicalized to their image.
                image = tokens_by_name[e]
                if image is not None:
                    return image
                return e
            else:
                raise ValueError("in {} production {}: undeclared symbol {}".format(nt, i, e))

    out = {nt: [] for nt in nonterminals}
    for nt, rhs_list in nonterminals.items():
        for i, p in enumerate(rhs_list):
            out_rhs = [check_element(nt, i, e) for e in p.body]
            out[nt].append(p.copy_with(body=out_rhs))

    return (tokens, out, goal_nts)


def load_grammar(filename):
    """Parse the pgen file at `filename` and return it as a Grammar."""
    with open(filename) as f:
        text = f.read()
    parser = parse_pgen_generated.Parser(builder=AstBuilder())
    lexer = pgen_lexer(parser, filename=filename)
    lexer.write(text)
    result = lexer.close()
    tokens, nonterminals, goals = check_grammar(result)
    variable_terminals = [name for name, image in tokens if image is None]
    return Grammar(nonterminals,
                   goal_nts=goals,
                   variable_terminals=variable_terminals)


def regenerate():
    """Write a fresh parse_pgen_generated parser for pgen_grammar to stdout."""
    # `sys` is already imported at module level; no local import needed.
    gen.generate_parser(sys.stdout, pgen_grammar)


if __name__ == '__main__':
    if sys.argv[1:] == ['--regenerate']:
        regenerate()
    else:
        print("usage: python -m jsparagus.parse_pgen --regenerate")
        sys.exit(1)