Diffstat (limited to 'third_party/python/fluent.syntax/fluent/syntax/parser.py')
-rw-r--r-- | third_party/python/fluent.syntax/fluent/syntax/parser.py | 701
1 files changed, 701 insertions, 0 deletions
diff --git a/third_party/python/fluent.syntax/fluent/syntax/parser.py b/third_party/python/fluent.syntax/fluent/syntax/parser.py new file mode 100644 index 0000000000..87075409f1 --- /dev/null +++ b/third_party/python/fluent.syntax/fluent/syntax/parser.py @@ -0,0 +1,701 @@ +import re +from typing import Any, Callable, List, Set, TypeVar, Union, cast +from . import ast +from .stream import EOL, FluentParserStream +from .errors import ParseError + +R = TypeVar("R", bound=ast.SyntaxNode) + + +def with_span(fn: Callable[..., R]) -> Callable[..., R]: + def decorated(self: 'FluentParser', ps: FluentParserStream, *args: Any, **kwargs: Any) -> Any: + if not self.with_spans: + return fn(self, ps, *args, **kwargs) + + start = ps.index + node = fn(self, ps, *args, **kwargs) + + # Don't re-add the span if the node already has it. This may happen + # when one decorated function calls another decorated function. + if node.span is not None: + return node + + end = ps.index + node.add_span(start, end) + return node + + return decorated + + +class FluentParser: + """This class is used to parse Fluent source content. + + ``with_spans`` enables source information in the form of + :class:`.ast.Span` objects for each :class:`.ast.SyntaxNode`. + """ + + def __init__(self, with_spans: bool = True): + self.with_spans = with_spans + + def parse(self, source: str) -> ast.Resource: + """Create a :class:`.ast.Resource` from a Fluent source. + """ + ps = FluentParserStream(source) + ps.skip_blank_block() + + entries: List[ast.EntryType] = [] + last_comment = None + + while ps.current_char: + entry = self.get_entry_or_junk(ps) + blank_lines = ps.skip_blank_block() + + # Regular Comments require special logic. Comments may be attached + # to Messages or Terms if they are followed immediately by them. + # However they should parse as standalone when they're followed by + # Junk. Consequently, we only attach Comments once we know that the + # Message or the Term parsed successfully. + if isinstance(entry, ast.Comment) and len(blank_lines) == 0 \ + and ps.current_char: + # Stash the comment and decide what to do with it + # in the next pass. + last_comment = entry + continue + + if last_comment is not None: + if isinstance(entry, (ast.Message, ast.Term)): + entry.comment = last_comment + if self.with_spans: + cast(ast.Span, entry.span).start = cast(ast.Span, entry.comment.span).start + else: + entries.append(last_comment) + # In either case, the stashed comment has been dealt with; + # clear it. + last_comment = None + + entries.append(entry) + + res = ast.Resource(entries) + + if self.with_spans: + res.add_span(0, ps.index) + + return res + + def parse_entry(self, source: str) -> ast.EntryType: + """Parse the first :class:`.ast.Entry` in source. + + Skip all encountered comments and start parsing at the first :class:`.ast.Message` + or :class:`.ast.Term` start. Return :class:`.ast.Junk` if the parsing is not successful. + + Preceding comments are ignored unless they contain syntax errors + themselves, in which case :class:`.ast.Junk` for the invalid comment is returned. + """ + ps = FluentParserStream(source) + ps.skip_blank_block() + + while ps.current_char == '#': + skipped = self.get_entry_or_junk(ps) + if isinstance(skipped, ast.Junk): + # Don't skip Junk comments. 
+ return skipped + ps.skip_blank_block() + + return self.get_entry_or_junk(ps) + + def get_entry_or_junk(self, ps: FluentParserStream) -> ast.EntryType: + entry_start_pos = ps.index + + try: + entry = self.get_entry(ps) + ps.expect_line_end() + return entry + except ParseError as err: + error_index = ps.index + ps.skip_to_next_entry_start(entry_start_pos) + next_entry_start = ps.index + if next_entry_start < error_index: + # The position of the error must be inside of the Junk's span. + error_index = next_entry_start + + # Create a Junk instance + slice = ps.string[entry_start_pos:next_entry_start] + junk = ast.Junk(slice) + if self.with_spans: + junk.add_span(entry_start_pos, next_entry_start) + annot = ast.Annotation(err.code, list(err.args) if err.args else None, err.message) + annot.add_span(error_index, error_index) + junk.add_annotation(annot) + return junk + + def get_entry(self, ps: FluentParserStream) -> ast.EntryType: + if ps.current_char == '#': + return self.get_comment(ps) + + if ps.current_char == '-': + return self.get_term(ps) + + if ps.is_identifier_start(): + return self.get_message(ps) + + raise ParseError('E0002') + + @with_span + def get_comment(self, ps: FluentParserStream) -> Union[ast.Comment, ast.GroupComment, ast.ResourceComment]: + # 0 - comment + # 1 - group comment + # 2 - resource comment + level = -1 + content = '' + + while True: + i = -1 + while ps.current_char == '#' \ + and (i < (2 if level == -1 else level)): + ps.next() + i += 1 + + if level == -1: + level = i + + if ps.current_char != EOL: + ps.expect_char(' ') + ch = ps.take_char(lambda x: x != EOL) + while ch: + content += ch + ch = ps.take_char(lambda x: x != EOL) + + if ps.is_next_line_comment(level=level): + content += cast(str, ps.current_char) + ps.next() + else: + break + + if level == 0: + return ast.Comment(content) + elif level == 1: + return ast.GroupComment(content) + elif level == 2: + return ast.ResourceComment(content) + + # never happens if ps.current_char == '#' when called + return cast(ast.Comment, None) + + @with_span + def get_message(self, ps: FluentParserStream) -> ast.Message: + id = self.get_identifier(ps) + ps.skip_blank_inline() + ps.expect_char('=') + + value = self.maybe_get_pattern(ps) + attrs = self.get_attributes(ps) + + if value is None and len(attrs) == 0: + raise ParseError('E0005', id.name) + + return ast.Message(id, value, attrs) + + @with_span + def get_term(self, ps: FluentParserStream) -> ast.Term: + ps.expect_char('-') + id = self.get_identifier(ps) + + ps.skip_blank_inline() + ps.expect_char('=') + + value = self.maybe_get_pattern(ps) + if value is None: + raise ParseError('E0006', id.name) + + attrs = self.get_attributes(ps) + return ast.Term(id, value, attrs) + + @with_span + def get_attribute(self, ps: FluentParserStream) -> ast.Attribute: + ps.expect_char('.') + + key = self.get_identifier(ps) + + ps.skip_blank_inline() + ps.expect_char('=') + + value = self.maybe_get_pattern(ps) + if value is None: + raise ParseError('E0012') + + return ast.Attribute(key, value) + + def get_attributes(self, ps: FluentParserStream) -> List[ast.Attribute]: + attrs: List[ast.Attribute] = [] + ps.peek_blank() + + while ps.is_attribute_start(): + ps.skip_to_peek() + attr = self.get_attribute(ps) + attrs.append(attr) + ps.peek_blank() + + return attrs + + @with_span + def get_identifier(self, ps: FluentParserStream) -> ast.Identifier: + name = ps.take_id_start() + if name is None: + raise ParseError('E0004', 'a-zA-Z') + + ch = ps.take_id_char() + while ch: + name += ch + ch = 
ps.take_id_char() + + return ast.Identifier(name) + + def get_variant_key(self, ps: FluentParserStream) -> Union[ast.Identifier, ast.NumberLiteral]: + ch = ps.current_char + + if ch is None: + raise ParseError('E0013') + + cc = ord(ch) + if ((cc >= 48 and cc <= 57) or cc == 45): # 0-9, - + return self.get_number(ps) + + return self.get_identifier(ps) + + @with_span + def get_variant(self, ps: FluentParserStream, has_default: bool) -> ast.Variant: + default_index = False + + if ps.current_char == '*': + if has_default: + raise ParseError('E0015') + ps.next() + default_index = True + + ps.expect_char('[') + ps.skip_blank() + + key = self.get_variant_key(ps) + + ps.skip_blank() + ps.expect_char(']') + + value = self.maybe_get_pattern(ps) + if value is None: + raise ParseError('E0012') + + return ast.Variant(key, value, default_index) + + def get_variants(self, ps: FluentParserStream) -> List[ast.Variant]: + variants: List[ast.Variant] = [] + has_default = False + + ps.skip_blank() + while ps.is_variant_start(): + variant = self.get_variant(ps, has_default) + + if variant.default: + has_default = True + + variants.append(variant) + ps.expect_line_end() + ps.skip_blank() + + if len(variants) == 0: + raise ParseError('E0011') + + if not has_default: + raise ParseError('E0010') + + return variants + + def get_digits(self, ps: FluentParserStream) -> str: + num = '' + + ch = ps.take_digit() + while ch: + num += ch + ch = ps.take_digit() + + if len(num) == 0: + raise ParseError('E0004', '0-9') + + return num + + @with_span + def get_number(self, ps: FluentParserStream) -> ast.NumberLiteral: + num = '' + + if ps.current_char == '-': + num += '-' + ps.next() + + num += self.get_digits(ps) + + if ps.current_char == '.': + num += '.' + ps.next() + num += self.get_digits(ps) + + return ast.NumberLiteral(num) + + def maybe_get_pattern(self, ps: FluentParserStream) -> Union[ast.Pattern, None]: + '''Parse an inline or a block Pattern, or None + + maybe_get_pattern distinguishes between patterns which start on the + same line as the indentifier (aka inline singleline patterns and inline + multiline patterns), and patterns which start on a new line (aka block + patterns). The distinction is important for the dedentation logic: the + indent of the first line of a block pattern must be taken into account + when calculating the maximum common indent. + ''' + ps.peek_blank_inline() + if ps.is_value_start(): + ps.skip_to_peek() + return self.get_pattern(ps, is_block=False) + + ps.peek_blank_block() + if ps.is_value_continuation(): + ps.skip_to_peek() + return self.get_pattern(ps, is_block=True) + + return None + + @with_span + def get_pattern(self, ps: FluentParserStream, is_block: bool) -> ast.Pattern: + elements: List[Any] = [] + if is_block: + # A block pattern is a pattern which starts on a new line. Measure + # the indent of this first line for the dedentation logic. 
+ blank_start = ps.index + first_indent = ps.skip_blank_inline() + elements.append(self.Indent(first_indent, blank_start, ps.index)) + common_indent_length = len(first_indent) + else: + # Should get fixed by the subsequent min() operation + common_indent_length = cast(int, float('infinity')) + + while ps.current_char: + if ps.current_char == EOL: + blank_start = ps.index + blank_lines = ps.peek_blank_block() + if ps.is_value_continuation(): + ps.skip_to_peek() + indent = ps.skip_blank_inline() + common_indent_length = min(common_indent_length, len(indent)) + elements.append(self.Indent(blank_lines + indent, blank_start, ps.index)) + continue + + # The end condition for get_pattern's while loop is a newline + # which is not followed by a valid pattern continuation. + ps.reset_peek() + break + + if ps.current_char == '}': + raise ParseError('E0027') + + element: Union[ast.TextElement, ast.Placeable] + if ps.current_char == '{': + element = self.get_placeable(ps) + else: + element = self.get_text_element(ps) + + elements.append(element) + + dedented = self.dedent(elements, common_indent_length) + return ast.Pattern(dedented) + + class Indent(ast.SyntaxNode): + def __init__(self, value: str, start: int, end: int): + super(FluentParser.Indent, self).__init__() + self.value = value + self.add_span(start, end) + + def dedent(self, + elements: List[Union[ast.TextElement, ast.Placeable, Indent]], + common_indent: int + ) -> List[Union[ast.TextElement, ast.Placeable]]: + '''Dedent a list of elements by removing the maximum common indent from + the beginning of text lines. The common indent is calculated in + get_pattern. + ''' + trimmed: List[Union[ast.TextElement, ast.Placeable]] = [] + + for element in elements: + if isinstance(element, ast.Placeable): + trimmed.append(element) + continue + + if isinstance(element, self.Indent): + # Strip the common indent. + element.value = element.value[:len(element.value) - common_indent] + if len(element.value) == 0: + continue + + prev = trimmed[-1] if len(trimmed) > 0 else None + if isinstance(prev, ast.TextElement): + # Join adjacent TextElements by replacing them with their sum. + sum = ast.TextElement(prev.value + element.value) + if self.with_spans: + sum.add_span(cast(ast.Span, prev.span).start, cast(ast.Span, element.span).end) + trimmed[-1] = sum + continue + + if isinstance(element, self.Indent): + # If the indent hasn't been merged into a preceding + # TextElements, convert it into a new TextElement. + text_element = ast.TextElement(element.value) + if self.with_spans: + text_element.add_span(cast(ast.Span, element.span).start, cast(ast.Span, element.span).end) + element = text_element + + trimmed.append(element) + + # Trim trailing whitespace from the Pattern. 
+ last_element = trimmed[-1] if len(trimmed) > 0 else None + if isinstance(last_element, ast.TextElement): + last_element.value = last_element.value.rstrip(' \n\r') + if last_element.value == "": + trimmed.pop() + + return trimmed + + @with_span + def get_text_element(self, ps: FluentParserStream) -> ast.TextElement: + buf = '' + + while ps.current_char: + ch = ps.current_char + + if ch == '{' or ch == '}': + return ast.TextElement(buf) + + if ch == EOL: + return ast.TextElement(buf) + + buf += ch + ps.next() + + return ast.TextElement(buf) + + def get_escape_sequence(self, ps: FluentParserStream) -> str: + next = ps.current_char + + if next == '\\' or next == '"': + ps.next() + return f'\\{next}' + + if next == 'u': + return self.get_unicode_escape_sequence(ps, next, 4) + + if next == 'U': + return self.get_unicode_escape_sequence(ps, next, 6) + + raise ParseError('E0025', next) + + def get_unicode_escape_sequence(self, ps: FluentParserStream, u: str, digits: int) -> str: + ps.expect_char(u) + sequence = '' + for _ in range(digits): + ch = ps.take_hex_digit() + if not ch: + raise ParseError('E0026', f'\\{u}{sequence}{ps.current_char}') + sequence += ch + + return f'\\{u}{sequence}' + + @with_span + def get_placeable(self, ps: FluentParserStream) -> ast.Placeable: + ps.expect_char('{') + ps.skip_blank() + expression = self.get_expression(ps) + ps.expect_char('}') + return ast.Placeable(expression) + + @with_span + def get_expression(self, ps: FluentParserStream) -> Union[ast.InlineExpression, + ast.Placeable, + ast.SelectExpression]: + selector = self.get_inline_expression(ps) + + ps.skip_blank() + + if ps.current_char == '-': + if ps.peek() != '>': + ps.reset_peek() + return selector + + if isinstance(selector, ast.MessageReference): + if selector.attribute is None: + raise ParseError('E0016') + else: + raise ParseError('E0018') + + elif ( + isinstance(selector, ast.TermReference) + ): + if selector.attribute is None: + raise ParseError('E0017') + elif not ( + isinstance(selector, ( + ast.StringLiteral, + ast.NumberLiteral, + ast.VariableReference, + ast.FunctionReference, + )) + ): + raise ParseError('E0029') + + ps.next() + ps.next() + + ps.skip_blank_inline() + ps.expect_line_end() + + variants = self.get_variants(ps) + return ast.SelectExpression(selector, variants) + + if ( + isinstance(selector, ast.TermReference) + and selector.attribute is not None + ): + raise ParseError('E0019') + + return selector + + @with_span + def get_inline_expression(self, ps: FluentParserStream) -> Union[ast.InlineExpression, ast.Placeable]: + if ps.current_char == '{': + return self.get_placeable(ps) + + if ps.is_number_start(): + return self.get_number(ps) + + if ps.current_char == '"': + return self.get_string(ps) + + if ps.current_char == '$': + ps.next() + id = self.get_identifier(ps) + return ast.VariableReference(id) + + if ps.current_char == '-': + ps.next() + id = self.get_identifier(ps) + attribute = None + if ps.current_char == '.': + ps.next() + attribute = self.get_identifier(ps) + arguments = None + ps.peek_blank() + if ps.current_peek == '(': + ps.skip_to_peek() + arguments = self.get_call_arguments(ps) + return ast.TermReference(id, attribute, arguments) + + if ps.is_identifier_start(): + id = self.get_identifier(ps) + ps.peek_blank() + + if ps.current_peek == '(': + # It's a Function. Ensure it's all upper-case. 
+ if not re.match('^[A-Z][A-Z0-9_-]*$', id.name): + raise ParseError('E0008') + ps.skip_to_peek() + args = self.get_call_arguments(ps) + return ast.FunctionReference(id, args) + + attribute = None + if ps.current_char == '.': + ps.next() + attribute = self.get_identifier(ps) + + return ast.MessageReference(id, attribute) + + raise ParseError('E0028') + + @with_span + def get_call_argument(self, + ps: FluentParserStream + ) -> Union[ast.InlineExpression, ast.NamedArgument, ast.Placeable]: + exp = self.get_inline_expression(ps) + + ps.skip_blank() + + if ps.current_char != ':': + return exp + + if isinstance(exp, ast.MessageReference) and exp.attribute is None: + ps.next() + ps.skip_blank() + + value = self.get_literal(ps) + return ast.NamedArgument(exp.id, value) + + raise ParseError('E0009') + + @with_span + def get_call_arguments(self, ps: FluentParserStream) -> ast.CallArguments: + positional: List[Union[ast.InlineExpression, ast.Placeable]] = [] + named: List[ast.NamedArgument] = [] + argument_names: Set[str] = set() + + ps.expect_char('(') + ps.skip_blank() + + while True: + if ps.current_char == ')': + break + + arg = self.get_call_argument(ps) + if isinstance(arg, ast.NamedArgument): + if arg.name.name in argument_names: + raise ParseError('E0022') + named.append(arg) + argument_names.add(arg.name.name) + elif len(argument_names) > 0: + raise ParseError('E0021') + else: + positional.append(arg) + + ps.skip_blank() + + if ps.current_char == ',': + ps.next() + ps.skip_blank() + continue + + break + + ps.expect_char(')') + return ast.CallArguments(positional, named) + + @with_span + def get_string(self, ps: FluentParserStream) -> ast.StringLiteral: + value = '' + + ps.expect_char('"') + + while True: + ch = ps.take_char(lambda x: x != '"' and x != EOL) + if not ch: + break + if ch == '\\': + value += self.get_escape_sequence(ps) + else: + value += ch + + if ps.current_char == EOL: + raise ParseError('E0020') + + ps.expect_char('"') + + return ast.StringLiteral(value) + + @with_span + def get_literal(self, ps: FluentParserStream) -> Union[ast.NumberLiteral, ast.StringLiteral]: + if ps.is_number_start(): + return self.get_number(ps) + if ps.current_char == '"': + return self.get_string(ps) + raise ParseError('E0014') |
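The parse() loop above attaches a Comment to a Message or Term only when no blank line separates them; otherwise the comment stays a standalone entry. A minimal usage sketch of that behaviour, not part of the diff, assuming the new file is importable as fluent.syntax.parser and that the companion fluent.syntax.ast module (imported above as "from . import ast") exposes Resource.body, Message.id/.comment and Comment.content as in the published fluent.syntax package:

from fluent.syntax.parser import FluentParser
from fluent.syntax import ast

source = """\
# Attached: no blank line before the next entry.
hello = Hello, world!

# Standalone: a blank line follows this comment.

-brand = Firefox
"""

parser = FluentParser(with_spans=True)
resource = parser.parse(source)

for entry in resource.body:
    if isinstance(entry, ast.Message):
        # parse() folded the preceding comment into Message.comment.
        print("message", entry.id.name, "comment:",
              entry.comment and entry.comment.content)
    elif isinstance(entry, ast.Comment):
        print("standalone comment:", entry.content)
    elif isinstance(entry, ast.Term):
        print("term", entry.id.name)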
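maybe_get_pattern() treats a value that starts on a new line as a block pattern; get_pattern() records each line's indent and dedent() strips the maximum common indent, trims trailing whitespace, and joins adjacent text into a single TextElement. A sketch of the observable result, under the same import assumptions as above:

from fluent.syntax.parser import FluentParser
from fluent.syntax import ast

source = (
    "multi =\n"
    "    First line\n"
    "      indented continuation\n"
)

message = FluentParser().parse_entry(source)
assert isinstance(message, ast.Message)

# The 4-space common indent is removed and the two text lines are joined;
# only the extra 2 spaces of the more-indented second line remain.
print(repr(message.value.elements[0].value))
# 'First line\n  indented continuation'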
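get_entry_or_junk() turns any ParseError into an ast.Junk entry spanning the recovered source slice, annotated with the error code and position. A sketch, again assuming the upstream fluent.syntax.ast attribute names (Junk.annotations, Annotation.code, Annotation.message):

from fluent.syntax.parser import FluentParser
from fluent.syntax import ast

broken = "key with no equals sign\n"
entry = FluentParser().parse_entry(broken)

if isinstance(entry, ast.Junk):
    # Each annotation reports one ParseError raised while parsing the entry.
    for annotation in entry.annotations:
        print(annotation.code, annotation.message)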