summaryrefslogtreecommitdiffstats
path: root/src/debputy/lsp/vendoring/_deb822_repro/tokens.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/debputy/lsp/vendoring/_deb822_repro/tokens.py')
-rw-r--r--src/debputy/lsp/vendoring/_deb822_repro/tokens.py516
1 files changed, 516 insertions, 0 deletions
diff --git a/src/debputy/lsp/vendoring/_deb822_repro/tokens.py b/src/debputy/lsp/vendoring/_deb822_repro/tokens.py
new file mode 100644
index 0000000..4e5fa16
--- /dev/null
+++ b/src/debputy/lsp/vendoring/_deb822_repro/tokens.py
@@ -0,0 +1,516 @@
+import re
+import sys
+import weakref
+from weakref import ReferenceType
+
+from ._util import BufferingIterator
+from .locatable import (
+ Locatable,
+ START_POSITION,
+ Range,
+ ONE_CHAR_RANGE,
+ ONE_LINE_RANGE,
+ Position,
+)
+from debian._util import resolve_ref, _strI
+
+try:
+ from typing import Optional, cast, TYPE_CHECKING, Iterable, Union, Dict, Callable
+except ImportError:
+ # pylint: disable=unnecessary-lambda-assignment
+ TYPE_CHECKING = False
+ cast = lambda t, v: v
+
+if TYPE_CHECKING:
+ from .parsing import Deb822Element
+
+
+# Consume whitespace and a single word.
+_RE_WHITESPACE_SEPARATED_WORD_LIST = re.compile(
+ r"""
+ (?P<space_before>\s*) # Consume any whitespace before the word
+ # The space only occurs in practise if the line starts
+ # with space.
+
+ # Optionally consume a word (needed to handle the case
+ # when there are no words left and someone applies this
+ # pattern to the remaining text). This is mostly here as
+ # a fail-safe.
+
+ (?P<word>\S+) # Consume the word (if present)
+ (?P<trailing_whitespace>\s*) # Consume trailing whitespace
+""",
+ re.VERBOSE,
+)
+_RE_COMMA_SEPARATED_WORD_LIST = re.compile(
+ r"""
+ # This regex is slightly complicated by the fact that it should work with
+ # finditer and comsume the entire value.
+ #
+ # To do this, we structure the regex so it always starts on a comma (except
+ # for the first iteration, where we permit the absence of a comma)
+
+ (?: # Optional space followed by a mandatory comma unless
+ # it is the start of the "line" (in which case, we
+ # allow the comma to be omitted)
+ ^
+ |
+ (?:
+ (?P<space_before_comma>\s*) # This space only occurs in practise if the line
+ # starts with space + comma.
+ (?P<comma> ,)
+ )
+ )
+
+ # From here it is "optional space, maybe a word and then optional space" again. One reason why
+ # all of it is optional is to gracefully cope with trailing commas.
+ (?P<space_before_word>\s*)
+ (?P<word> [^,\s] (?: [^,]*[^,\s])? )? # "Words" can contain spaces for comma separated list.
+ # But surrounding whitespace is ignored
+ (?P<space_after_word>\s*)
+""",
+ re.VERBOSE,
+)
+
+# From Policy 5.1:
+#
+# The field name is composed of US-ASCII characters excluding control
+# characters, space, and colon (i.e., characters in the ranges U+0021
+# (!) through U+0039 (9), and U+003B (;) through U+007E (~),
+# inclusive). Field names must not begin with the comment character
+# (U+0023 #), nor with the hyphen character (U+002D -).
+#
+# That combines to this regex of questionable readability
+_RE_FIELD_LINE = re.compile(
+ r"""
+ ^ # Start of line
+ (?P<field_name> # Capture group for the field name
+ [\x21\x22\x24-\x2C\x2F-\x39\x3B-\x7F] # First character
+ [\x21-\x39\x3B-\x7F]* # Subsequent characters (if any)
+ )
+ (?P<separator> : )
+ (?P<space_before_value> \s* )
+ (?: # Field values are not mandatory on the same line
+ # as the field name.
+
+ (?P<value> \S(?:.*\S)? ) # Values must start and end on a "non-space"
+ (?P<space_after_value> \s* ) # We can have optional space after the value
+ )?
+""",
+ re.VERBOSE,
+)
+
+
+class Deb822Token(Locatable):
+ """A token is an atomic syntactical element from a deb822 file
+
+ A file is parsed into a series of tokens. If these tokens are converted to
+ text in exactly the same order, you get exactly the same file - bit-for-bit.
+ Accordingly ever bit of text in a file must be assigned to exactly one
+ Deb822Token.
+ """
+
+ __slots__ = ("_text", "_parent_element", "_token_size", "__weakref__")
+
+ def __init__(self, text):
+ # type: (str) -> None
+ if text == "": # pragma: no cover
+ raise ValueError("Tokens must have content")
+ self._text = text # type: str
+ self._parent_element = None # type: Optional[ReferenceType['Deb822Element']]
+ self._token_size = None # type: Optional[Range]
+ self._verify_token_text()
+
+ def __repr__(self):
+ # type: () -> str
+ return "{clsname}('{text}')".format(
+ clsname=self.__class__.__name__, text=self._text.replace("\n", "\\n")
+ )
+
+ def _verify_token_text(self):
+ # type: () -> None
+ if "\n" in self._text:
+ is_single_line_token = False
+ if self.is_comment or self.is_error:
+ is_single_line_token = True
+ if not is_single_line_token and not self.is_whitespace:
+ raise ValueError(
+ "Only whitespace, error and comment tokens may contain newlines"
+ )
+ if not self.text.endswith("\n"):
+ raise ValueError("Tokens containing whitespace must end on a newline")
+ if is_single_line_token and "\n" in self.text[:-1]:
+ raise ValueError(
+ "Comments and error tokens must not contain embedded newlines"
+ " (only end on one)"
+ )
+
+ @property
+ def is_whitespace(self):
+ # type: () -> bool
+ return False
+
+ @property
+ def is_comment(self):
+ # type: () -> bool
+ return False
+
+ @property
+ def is_error(self):
+ # type: () -> bool
+ return False
+
+ @property
+ def text(self):
+ # type: () -> str
+ return self._text
+
+ # To support callers that want a simple interface for converting tokens and elements to text
+ def convert_to_text(self):
+ # type: () -> str
+ return self._text
+
+ def size(self, *, skip_leading_comments: bool = False) -> Range:
+ # As tokens are an atomtic unit
+ token_size = self._token_size
+ if token_size is not None:
+ return token_size
+ token_len = len(self._text)
+ if token_len == 1:
+ # The indirection with `r` because mypy gets confused and thinks that `token_size`
+ # cannot have any type at all.
+ token_size = ONE_CHAR_RANGE if self._text != "\n" else ONE_LINE_RANGE
+ else:
+ new_lines = self._text.count("\n")
+ assert not new_lines or self._text[-1] == "\n"
+ end_pos = Position(new_lines, 0) if new_lines else Position(0, token_len)
+ token_size = Range(START_POSITION, end_pos)
+ self._token_size = token_size
+ return token_size
+
+ @property
+ def parent_element(self):
+ # type: () -> Optional[Deb822Element]
+ return resolve_ref(self._parent_element)
+
+ @parent_element.setter
+ def parent_element(self, new_parent):
+ # type: (Optional[Deb822Element]) -> None
+ self._parent_element = (
+ weakref.ref(new_parent) if new_parent is not None else None
+ )
+
+ def clear_parent_if_parent(self, parent):
+ # type: (Deb822Element) -> None
+ if parent is self.parent_element:
+ self._parent_element = None
+
+
+class Deb822WhitespaceToken(Deb822Token):
+ """The token is a kind of whitespace.
+
+ Some whitespace tokens are critical for the format (such as the Deb822ValueContinuationToken,
+ spaces that separate words in list separated by spaces or newlines), while other whitespace
+ tokens are truly insignificant (space before a newline, space after a comma in a comma
+ list, etc.).
+ """
+
+ __slots__ = ()
+
+ @property
+ def is_whitespace(self):
+ # type: () -> bool
+ return True
+
+
+class Deb822SemanticallySignificantWhiteSpace(Deb822WhitespaceToken):
+ """Whitespace that (if removed) would change the meaning of the file (or cause syntax errors)"""
+
+ __slots__ = ()
+
+
+class Deb822NewlineAfterValueToken(Deb822SemanticallySignificantWhiteSpace):
+ """The newline after a value token.
+
+ If not followed by a continuation token, this also marks the end of the field.
+ """
+
+ __slots__ = ()
+
+ def __init__(self):
+ # type: () -> None
+ super().__init__("\n")
+
+
+class Deb822ValueContinuationToken(Deb822SemanticallySignificantWhiteSpace):
+ """The whitespace denoting a value spanning an additional line (the first space on a line)"""
+
+ __slots__ = ()
+
+
+class Deb822SpaceSeparatorToken(Deb822SemanticallySignificantWhiteSpace):
+ """Whitespace between values in a space list (e.g. "Architectures")"""
+
+ __slots__ = ()
+
+
+class Deb822ErrorToken(Deb822Token):
+ """Token that represents a syntactical error"""
+
+ __slots__ = ()
+
+ @property
+ def is_error(self):
+ # type: () -> bool
+ return True
+
+
+class Deb822CommentToken(Deb822Token):
+
+ __slots__ = ()
+
+ @property
+ def is_comment(self):
+ # type: () -> bool
+ return True
+
+
+class Deb822FieldNameToken(Deb822Token):
+
+ __slots__ = ()
+
+ def __init__(self, text):
+ # type: (str) -> None
+ if not isinstance(text, _strI):
+ text = _strI(sys.intern(text))
+ super().__init__(text)
+
+ @property
+ def text(self):
+ # type: () -> _strI
+ return cast("_strI", self._text)
+
+
+# The colon after the field name, parenthesis, etc.
+class Deb822SeparatorToken(Deb822Token):
+
+ __slots__ = ()
+
+
+class Deb822FieldSeparatorToken(Deb822Token):
+
+ __slots__ = ()
+
+ def __init__(self):
+ # type: () -> None
+ super().__init__(":")
+
+
+class Deb822CommaToken(Deb822SeparatorToken):
+ """Used by the comma-separated list value parsers to denote a comma between two value tokens."""
+
+ __slots__ = ()
+
+ def __init__(self):
+ # type: () -> None
+ super().__init__(",")
+
+
+class Deb822PipeToken(Deb822SeparatorToken):
+ """Used in some dependency fields as OR relation"""
+
+ __slots__ = ()
+
+ def __init__(self):
+ # type: () -> None
+ super().__init__("|")
+
+
+class Deb822ValueToken(Deb822Token):
+ """A field value can be split into multi "Deb822ValueToken"s (as well as separator tokens)"""
+
+ __slots__ = ()
+
+
+class Deb822ValueDependencyToken(Deb822Token):
+ """Package name, architecture name, a version number, or a profile name in a dependency field"""
+
+ __slots__ = ()
+
+
+class Deb822ValueDependencyVersionRelationOperatorToken(Deb822Token):
+
+ __slots__ = ()
+
+
+def tokenize_deb822_file(sequence, encoding="utf-8"):
+ # type: (Iterable[Union[str, bytes]], str) -> Iterable[Deb822Token]
+ """Tokenize a deb822 file
+
+ :param sequence: An iterable of lines (a file open for reading will do)
+ :param encoding: The encoding to use (this is here to support Deb822-like
+ APIs, new code should not use this parameter).
+ """
+ current_field_name = None
+ field_name_cache = {} # type: Dict[str, _strI]
+
+ def _normalize_input(s):
+ # type: (Iterable[Union[str, bytes]]) -> Iterable[str]
+ for x in s:
+ if isinstance(x, bytes):
+ x = x.decode(encoding)
+ if not x.endswith("\n"):
+ # We always end on a newline because it makes a lot of code simpler. The pain
+ # points relates to mutations that add content after the last field. Sadly, these
+ # mutations can happen via adding fields, reordering fields, etc. and are too hard
+ # to track to make it worth it to support the special case that makes up missing
+ # a newline at the end of the file.
+ x += "\n"
+ yield x
+
+ text_stream = BufferingIterator(
+ _normalize_input(sequence)
+ ) # type: BufferingIterator[str]
+
+ for line in text_stream:
+ if line.isspace():
+ if current_field_name:
+ # Blank lines terminate fields
+ current_field_name = None
+
+ # If there are multiple whitespace-only lines, we combine them
+ # into one token.
+ r = list(text_stream.takewhile(str.isspace))
+ if r:
+ line += "".join(r)
+
+ # whitespace tokens are likely to have duplicate cases (like
+ # single newline tokens), so we intern the strings there.
+ yield Deb822WhitespaceToken(sys.intern(line))
+ continue
+
+ if line[0] == "#":
+ yield Deb822CommentToken(line)
+ continue
+
+ if line[0] in (" ", "\t"):
+ if current_field_name is not None:
+ # We emit a separate whitespace token for the newline as it makes some
+ # things easier later (see _build_value_line)
+ leading = sys.intern(line[0])
+ # Pull out the leading space and newline
+ line = line[1:-1]
+ yield Deb822ValueContinuationToken(leading)
+ yield Deb822ValueToken(line)
+ yield Deb822NewlineAfterValueToken()
+ else:
+ yield Deb822ErrorToken(line)
+ continue
+
+ field_line_match = _RE_FIELD_LINE.match(line)
+ if field_line_match:
+ # The line is a field, which means there is a bit to unpack
+ # - note that by definition, leading and trailing whitespace is insignificant
+ # on the value part directly after the field separator
+ (field_name, _, space_before, value, space_after) = (
+ field_line_match.groups()
+ )
+
+ current_field_name = field_name_cache.get(field_name)
+
+ if value is None or value == "":
+ # If there is no value, then merge the two space elements into space_after
+ # as it makes it easier to handle the newline.
+ space_after = (
+ space_before + space_after if space_after else space_before
+ )
+ space_before = ""
+
+ if space_after:
+ # We emit a separate whitespace token for the newline as it makes some
+ # things easier later (see _build_value_line)
+ if space_after.endswith("\n"):
+ space_after = space_after[:-1]
+
+ if current_field_name is None:
+ field_name = sys.intern(field_name)
+ current_field_name = _strI(field_name)
+ field_name_cache[field_name] = current_field_name
+
+ # We use current_field_name from here as it is a _strI.
+ # Delete field_name to avoid accidentally using it and getting bugs
+ # that should not happen.
+ del field_name
+
+ yield Deb822FieldNameToken(current_field_name)
+ yield Deb822FieldSeparatorToken()
+ if space_before:
+ yield Deb822WhitespaceToken(sys.intern(space_before))
+ if value:
+ yield Deb822ValueToken(value)
+ if space_after:
+ yield Deb822WhitespaceToken(sys.intern(space_after))
+ yield Deb822NewlineAfterValueToken()
+ else:
+ yield Deb822ErrorToken(line)
+
+
+def _value_line_tokenizer(func):
+ # type: (Callable[[str], Iterable[Deb822Token]]) -> (Callable[[str], Iterable[Deb822Token]])
+ def impl(v):
+ # type: (str) -> Iterable[Deb822Token]
+ first_line = True
+ for no, line in enumerate(v.splitlines(keepends=True)):
+ assert not v.isspace() or no == 0
+ if line.startswith("#"):
+ yield Deb822CommentToken(line)
+ continue
+ has_newline = False
+ continuation_line_marker = None
+ if not first_line:
+ continuation_line_marker = line[0]
+ line = line[1:]
+ first_line = False
+ if line.endswith("\n"):
+ has_newline = True
+ line = line[:-1]
+ if continuation_line_marker is not None:
+ yield Deb822ValueContinuationToken(sys.intern(continuation_line_marker))
+ yield from func(line)
+ if has_newline:
+ yield Deb822NewlineAfterValueToken()
+
+ return impl
+
+
+@_value_line_tokenizer
+def whitespace_split_tokenizer(v):
+ # type: (str) -> Iterable[Deb822Token]
+ assert "\n" not in v
+ for match in _RE_WHITESPACE_SEPARATED_WORD_LIST.finditer(v):
+ space_before, word, space_after = match.groups()
+ if space_before:
+ yield Deb822SpaceSeparatorToken(sys.intern(space_before))
+ yield Deb822ValueToken(word)
+ if space_after:
+ yield Deb822SpaceSeparatorToken(sys.intern(space_after))
+
+
+@_value_line_tokenizer
+def comma_split_tokenizer(v):
+ # type: (str) -> Iterable[Deb822Token]
+ assert "\n" not in v
+ for match in _RE_COMMA_SEPARATED_WORD_LIST.finditer(v):
+ space_before_comma, comma, space_before_word, word, space_after_word = (
+ match.groups()
+ )
+ if space_before_comma:
+ yield Deb822WhitespaceToken(sys.intern(space_before_comma))
+ if comma:
+ yield Deb822CommaToken()
+ if space_before_word:
+ yield Deb822WhitespaceToken(sys.intern(space_before_word))
+ if word:
+ yield Deb822ValueToken(word)
+ if space_after_word:
+ yield Deb822WhitespaceToken(sys.intern(space_after_word))