"""Handwritten parser of dependency specifiers. The docstring for each __parse_* function contains ENBF-inspired grammar representing the implementation. """ import ast from typing import Any, List, NamedTuple, Optional, Tuple, Union from ._tokenizer import DEFAULT_RULES, Tokenizer class Node: def __init__(self, value: str) -> None: self.value = value def __str__(self) -> str: return self.value def __repr__(self) -> str: return f"<{self.__class__.__name__}('{self}')>" def serialize(self) -> str: raise NotImplementedError class Variable(Node): def serialize(self) -> str: return str(self) class Value(Node): def serialize(self) -> str: return f'"{self}"' class Op(Node): def serialize(self) -> str: return str(self) MarkerVar = Union[Variable, Value] MarkerItem = Tuple[MarkerVar, Op, MarkerVar] # MarkerAtom = Union[MarkerItem, List["MarkerAtom"]] # MarkerList = List[Union["MarkerList", MarkerAtom, str]] # mypy does not support recursive type definition # https://github.com/python/mypy/issues/731 MarkerAtom = Any MarkerList = List[Any] class ParsedRequirement(NamedTuple): name: str url: str extras: List[str] specifier: str marker: Optional[MarkerList] # -------------------------------------------------------------------------------------- # Recursive descent parser for dependency specifier # -------------------------------------------------------------------------------------- def parse_requirement(source: str) -> ParsedRequirement: return _parse_requirement(Tokenizer(source, rules=DEFAULT_RULES)) def _parse_requirement(tokenizer: Tokenizer) -> ParsedRequirement: """ requirement = WS? IDENTIFIER WS? extras WS? requirement_details """ tokenizer.consume("WS") name_token = tokenizer.expect( "IDENTIFIER", expected="package name at the start of dependency specifier" ) name = name_token.text tokenizer.consume("WS") extras = _parse_extras(tokenizer) tokenizer.consume("WS") url, specifier, marker = _parse_requirement_details(tokenizer) tokenizer.expect("END", expected="end of dependency specifier") return ParsedRequirement(name, url, extras, specifier, marker) def _parse_requirement_details( tokenizer: Tokenizer, ) -> Tuple[str, str, Optional[MarkerList]]: """ requirement_details = AT URL (WS requirement_marker?)? | specifier WS? (requirement_marker)? """ specifier = "" url = "" marker = None if tokenizer.check("AT"): tokenizer.read() tokenizer.consume("WS") url_start = tokenizer.position url = tokenizer.expect("URL", expected="URL after @").text if tokenizer.check("END", peek=True): return (url, specifier, marker) tokenizer.expect("WS", expected="whitespace after URL") # The input might end after whitespace. if tokenizer.check("END", peek=True): return (url, specifier, marker) marker = _parse_requirement_marker( tokenizer, span_start=url_start, after="URL and whitespace" ) else: specifier_start = tokenizer.position specifier = _parse_specifier(tokenizer) tokenizer.consume("WS") if tokenizer.check("END", peek=True): return (url, specifier, marker) marker = _parse_requirement_marker( tokenizer, span_start=specifier_start, after=( "version specifier" if specifier else "name and no valid version specifier" ), ) return (url, specifier, marker) def _parse_requirement_marker( tokenizer: Tokenizer, *, span_start: int, after: str ) -> MarkerList: """ requirement_marker = SEMICOLON marker WS? """ if not tokenizer.check("SEMICOLON"): tokenizer.raise_syntax_error( f"Expected end or semicolon (after {after})", span_start=span_start, ) tokenizer.read() marker = _parse_marker(tokenizer) tokenizer.consume("WS") return marker def _parse_extras(tokenizer: Tokenizer) -> List[str]: """ extras = (LEFT_BRACKET wsp* extras_list? wsp* RIGHT_BRACKET)? """ if not tokenizer.check("LEFT_BRACKET", peek=True): return [] with tokenizer.enclosing_tokens( "LEFT_BRACKET", "RIGHT_BRACKET", around="extras", ): tokenizer.consume("WS") extras = _parse_extras_list(tokenizer) tokenizer.consume("WS") return extras def _parse_extras_list(tokenizer: Tokenizer) -> List[str]: """ extras_list = identifier (wsp* ',' wsp* identifier)* """ extras: List[str] = [] if not tokenizer.check("IDENTIFIER"): return extras extras.append(tokenizer.read().text) while True: tokenizer.consume("WS") if tokenizer.check("IDENTIFIER", peek=True): tokenizer.raise_syntax_error("Expected comma between extra names") elif not tokenizer.check("COMMA"): break tokenizer.read() tokenizer.consume("WS") extra_token = tokenizer.expect("IDENTIFIER", expected="extra name after comma") extras.append(extra_token.text) return extras def _parse_specifier(tokenizer: Tokenizer) -> str: """ specifier = LEFT_PARENTHESIS WS? version_many WS? RIGHT_PARENTHESIS | WS? version_many WS? """ with tokenizer.enclosing_tokens( "LEFT_PARENTHESIS", "RIGHT_PARENTHESIS", around="version specifier", ): tokenizer.consume("WS") parsed_specifiers = _parse_version_many(tokenizer) tokenizer.consume("WS") return parsed_specifiers def _parse_version_many(tokenizer: Tokenizer) -> str: """ version_many = (SPECIFIER (WS? COMMA WS? SPECIFIER)*)? """ parsed_specifiers = "" while tokenizer.check("SPECIFIER"): span_start = tokenizer.position parsed_specifiers += tokenizer.read().text if tokenizer.check("VERSION_PREFIX_TRAIL", peek=True): tokenizer.raise_syntax_error( ".* suffix can only be used with `==` or `!=` operators", span_start=span_start, span_end=tokenizer.position + 1, ) if tokenizer.check("VERSION_LOCAL_LABEL_TRAIL", peek=True): tokenizer.raise_syntax_error( "Local version label can only be used with `==` or `!=` operators", span_start=span_start, span_end=tokenizer.position, ) tokenizer.consume("WS") if not tokenizer.check("COMMA"): break parsed_specifiers += tokenizer.read().text tokenizer.consume("WS") return parsed_specifiers # -------------------------------------------------------------------------------------- # Recursive descent parser for marker expression # -------------------------------------------------------------------------------------- def parse_marker(source: str) -> MarkerList: return _parse_marker(Tokenizer(source, rules=DEFAULT_RULES)) def _parse_marker(tokenizer: Tokenizer) -> MarkerList: """ marker = marker_atom (BOOLOP marker_atom)+ """ expression = [_parse_marker_atom(tokenizer)] while tokenizer.check("BOOLOP"): token = tokenizer.read() expr_right = _parse_marker_atom(tokenizer) expression.extend((token.text, expr_right)) return expression def _parse_marker_atom(tokenizer: Tokenizer) -> MarkerAtom: """ marker_atom = WS? LEFT_PARENTHESIS WS? marker WS? RIGHT_PARENTHESIS WS? | WS? marker_item WS? """ tokenizer.consume("WS") if tokenizer.check("LEFT_PARENTHESIS", peek=True): with tokenizer.enclosing_tokens( "LEFT_PARENTHESIS", "RIGHT_PARENTHESIS", around="marker expression", ): tokenizer.consume("WS") marker: MarkerAtom = _parse_marker(tokenizer) tokenizer.consume("WS") else: marker = _parse_marker_item(tokenizer) tokenizer.consume("WS") return marker def _parse_marker_item(tokenizer: Tokenizer) -> MarkerItem: """ marker_item = WS? marker_var WS? marker_op WS? marker_var WS? """ tokenizer.consume("WS") marker_var_left = _parse_marker_var(tokenizer) tokenizer.consume("WS") marker_op = _parse_marker_op(tokenizer) tokenizer.consume("WS") marker_var_right = _parse_marker_var(tokenizer) tokenizer.consume("WS") return (marker_var_left, marker_op, marker_var_right) def _parse_marker_var(tokenizer: Tokenizer) -> MarkerVar: """ marker_var = VARIABLE | QUOTED_STRING """ if tokenizer.check("VARIABLE"): return process_env_var(tokenizer.read().text.replace(".", "_")) elif tokenizer.check("QUOTED_STRING"): return process_python_str(tokenizer.read().text) else: tokenizer.raise_syntax_error( message="Expected a marker variable or quoted string" ) def process_env_var(env_var: str) -> Variable: if ( env_var == "platform_python_implementation" or env_var == "python_implementation" ): return Variable("platform_python_implementation") else: return Variable(env_var) def process_python_str(python_str: str) -> Value: value = ast.literal_eval(python_str) return Value(str(value)) def _parse_marker_op(tokenizer: Tokenizer) -> Op: """ marker_op = IN | NOT IN | OP """ if tokenizer.check("IN"): tokenizer.read() return Op("in") elif tokenizer.check("NOT"): tokenizer.read() tokenizer.expect("WS", expected="whitespace after 'not'") tokenizer.expect("IN", expected="'in' after 'not'") return Op("not in") elif tokenizer.check("OP"): return Op(tokenizer.read().text) else: return tokenizer.raise_syntax_error( "Expected marker operator, one of " "<=, <, !=, ==, >=, >, ~=, ===, in, not in" )