1 files changed, 478 insertions, 0 deletions
diff --git a/src/debputy/lsp/vendoring/_deb822_repro/formatter.py b/src/debputy/lsp/vendoring/_deb822_repro/formatter.py
new file mode 100644
index 0000000..a2b797b
--- /dev/null
+++ b/src/debputy/lsp/vendoring/_deb822_repro/formatter.py
@@ -0,0 +1,478 @@
+import operator
+
+from ._util import BufferingIterator
+from .tokens import Deb822Token
+
+# Opaque enum-like tags, compared by identity (`is`) in FormatterContentToken.
+# The values were chosen to make repr easier to implement and may change.
+_CONTENT_TYPE_VALUE = "is_value"
+_CONTENT_TYPE_COMMENT = "is_comment"
+_CONTENT_TYPE_SEPARATOR = "is_separator"
+
+try:
+    from typing import Iterator, Union, Literal
+    from .types import TokenOrElement, FormatterCallback
+except ImportError:
+    pass
+
+
+class FormatterContentToken(object):
+    """Typed, tagged text for use with the formatting API
+
+    The FormatterContentToken is used by the formatting API and provides the
+    formatter callback with context about the textual tokens it is supposed
+    to format.
+    """
+
+    __slots__ = ("_text", "_content_type")
+
+    def __init__(self, text, content_type):
+        # type: (str, object) -> None
+        self._text = text
+        self._content_type = content_type
+
+    @classmethod
+    def from_token_or_element(cls, token_or_element):
+        # type: (TokenOrElement) -> FormatterContentToken
+        if isinstance(token_or_element, Deb822Token):
+            if token_or_element.is_comment:
+                return cls.comment_token(token_or_element.text)
+            if token_or_element.is_whitespace:
+                raise ValueError("FormatterContentType cannot be whitespace")
+            return cls.value_token(token_or_element.text)
+        # Elements are assumed to be content (this is specialized for the
+        # interpretations where comments are always tokens).
+        return cls.value_token(token_or_element.convert_to_text())
+
+    @classmethod
+    def separator_token(cls, text):
+        # type: (str) -> FormatterContentToken
+        # Reuse the shared " " and "," instances as a minor memory optimization
+        if text == " ":
+            return SPACE_SEPARATOR_FT
+        if text == ",":
+            return COMMA_SEPARATOR_FT
+        return cls(text, _CONTENT_TYPE_SEPARATOR)
+
+    @classmethod
+    def comment_token(cls, text):
+        # type: (str) -> FormatterContentToken
+        """Generates a single comment token with the provided text
+
+        Mostly useful for creating test cases
+        """
+        return cls(text, _CONTENT_TYPE_COMMENT)
+
+    @classmethod
+    def value_token(cls, text):
+        # type: (str) -> FormatterContentToken
+        """Generates a single value token with the provided text
+
+        Mostly useful for creating test cases
+        """
+        return cls(text, _CONTENT_TYPE_VALUE)
+
+    @property
+    def is_comment(self):
+        # type: () -> bool
+        """True if this formatter token represents a comment
+
+        This should be used for determining whether the token is a comment
+        or not. It might be tempting to check whether the text in the token
+        starts with a "#" but that is insufficient because a value *can*
+        start with that as well.  Whether it is a comment or a value is
+        based on the context (it is a comment if and only if the "#" was
+        at the start of a line) but the formatter often does not have the
+        context available to assert this.
+
+        The formatter *should* preserve the order of comments and interleave
+        them between the value tokens in the same order as it sees them.  Failing
+        to preserve the order of comments and values can cause confusing
+        comments (such as associating the comment with a different value
+        than it was written for).
+
+        The formatter *may* discard comment tokens if it does not want to
+        preserve them.  If so, they would be omitted in the output, which
+        may be acceptable in some cases.  This is a lot better than
+        re-ordering comments.
+
+        Formatters must be aware of the following special cases for comments:
+         * Comments *MUST* be emitted after a newline.  If the very first token
+           is a comment, the formatter is expected to emit a newline before it
+           as well (Fields cannot start immediately on a comment).
+        """
+        return self._content_type is _CONTENT_TYPE_COMMENT
+
+    @property
+    def is_value(self):
+        # type: () -> bool
+        """True if this formatter token represents a semantic value
+
+        The formatter *MUST* preserve values as-is in its output.  It may
+        "unpack" it from the token (as in, return it as a part of a plain
+        str) but the value content must not be changed nor re-ordered relative
+        to other value tokens (as that could change the meaning of the field).
+        """
+        return self._content_type is _CONTENT_TYPE_VALUE
+
+    @property
+    def is_separator(self):
+        # type: () -> bool
+        """True if this formatter token represents a separator token
+
+        The formatter is not required to preserve the provided separators but it
+        is required to properly separate values.  In fact, it is often a lot easier
+        to discard existing separator tokens.  As an example, in a whitespace
+        separated list of values, space, tab and newline all count as separators.
+        However, formatting-wise, there is a world of difference between a
+        space, a tab and a newline. In particular, newlines must be followed
+        by an additional space or tab (to act as a value continuation line) if
+        there is a value following it (otherwise, the generated output is
+        invalid).
+        """
+        return self._content_type is _CONTENT_TYPE_SEPARATOR
+
+    @property
+    def is_whitespace(self):
+        # type: () -> bool
+        """True if this formatter token represents a whitespace token"""
+        return self._content_type is _CONTENT_TYPE_SEPARATOR and self._text.isspace()
+
+    @property
+    def text(self):
+        # type: () -> str
+        """The actual content of the token
+
+        This field *must not* be used to determine the type of token.  The
+        formatter cannot reliably tell whether "#..." is a comment or a value
+        (it can be both).  Use is_value and is_comment instead for discriminating
+        token types.
+
+        For value tokens, this is the concrete value to be emitted.
+
+        For comment tokens, this is the full comment text.
+
+        This is the same as str(token).
+        """
+        return self._text
+
+    def __str__(self):
+        # type: () -> str
+        return self._text
+
+    def __repr__(self):
+        # type: () -> str
+        return "{}({!r}, {}=True)".format(
+            self.__class__.__name__, self._text, self._content_type
+        )
+
+
+SPACE_SEPARATOR_FT = FormatterContentToken(" ", _CONTENT_TYPE_SEPARATOR)
+COMMA_SEPARATOR_FT = FormatterContentToken(",", _CONTENT_TYPE_SEPARATOR)
+
+
+def one_value_per_line_formatter(
+    indentation,  # type: Union[int, Literal["FIELD_NAME_LENGTH"]]
+    trailing_separator=True,  # type: bool
+    immediate_empty_line=False,  # type: bool
+):
+    # type: (...) -> FormatterCallback
+    """Provide a simple formatter that can handle indentation and trailing separators
+
+    All formatters returned by this function put exactly one value per line.  This
+    pattern is commonly seen in the "Depends" field and similar fields of
+    debian/control files.
+
+    :param indentation: Either the literal string "FIELD_NAME_LENGTH" or a positive
+    integer, which determines the indentation for fields.  If it is an integer,
+    then a fixed indentation is used (notably the value 1 ensures the shortest
+    possible indentation).  Otherwise, if it is "FIELD_NAME_LENGTH", then the
+    indentation is set such that it aligns the values based on the field name.
+    :param trailing_separator: If True, then the last value will have a trailing
+    separator token (e.g., ",") after it.
+    :param immediate_empty_line: Whether the value should always start with an
+    empty line.  If True, then the result becomes something like "Field:\n value".
+    :raises ValueError: If indentation is an integer smaller than 1.
+    """
+    if indentation != "FIELD_NAME_LENGTH" and indentation < 1:
+        raise ValueError('indentation must be at least 1 (or "FIELD_NAME_LENGTH")')
+
+    def _formatter(
+        name,  # type: str
+        sep_token,  # type: FormatterContentToken
+        formatter_tokens,  # type: Iterator[FormatterContentToken]
+    ):
+        # type: (...) -> Iterator[Union[FormatterContentToken, str]]
+        if indentation == "FIELD_NAME_LENGTH":
+            indent_len = len(name) + 2  # the ":" plus the space after it
+        else:
+            indent_len = indentation
+        indent = " " * indent_len
+
+        emitted_first_line = False
+        tok_iter = BufferingIterator(formatter_tokens)
+        is_value = operator.attrgetter("is_value")
+        if immediate_empty_line:
+            emitted_first_line = True
+            yield "\n"
+        for t in tok_iter:
+            if t.is_comment:
+                if not emitted_first_line:
+                    yield "\n"
+                yield t
+            elif t.is_value:
+                if not emitted_first_line:
+                    yield " "
+                else:
+                    yield indent
+                yield t
+                if not sep_token.is_whitespace and (
+                    trailing_separator or tok_iter.peek_find(is_value)
+                ):
+                    yield sep_token
+                yield "\n"
+            else:
+                # Skip existing separators (etc.)
+                continue
+            emitted_first_line = True
+
+    return _formatter
+
+
+one_value_per_line_trailing_separator = one_value_per_line_formatter(
+    "FIELD_NAME_LENGTH", trailing_separator=True
+)
+
+
+def format_field(
+    formatter,  # type: FormatterCallback
+    field_name,  # type: str
+    separator_token,  # type: FormatterContentToken
+    token_iter,  # type: Iterator[FormatterContentToken]
+):
+    # type: (...) -> str
+    """Format a field using a provided formatter
+
+    This function formats a series of tokens using the provided formatter.
+    It can be used as a standalone formatter engine and can be used in test
+    suites to validate third-party formatters (enabling them to test for
+    corner cases without involving parsing logic).
+
+    The formatter receives a series of FormatterContentTokens (via the
+    token_iter) and is expected to yield one or more str or
+    FormatterContentTokens.  The calling function will combine all of
+    these into a single string, which will be used as the value.
+
+    The formatter is recommended to yield the provided value and comment
+    tokens interleaved with text segments of whitespace and separators
+    as part of its output.  If it preserves comment and value tokens, the
+    calling function can provide some runtime checks to catch bugs
+    (like the formatter turning a comment into a value because it forgot
+    to ensure that the comment was emitted directly after a newline
+    character).
+
+    When writing a formatter, please keep the following in mind:
+
+     * The output of the formatter is appended directly after the ":" separator.
+       Most formatters will want to emit either a space or a newline as the very
+       first character for readability.
+       (compare "Depends:foo\\n" to "Depends: foo\\n")
+
+     * The formatter must always end its output on a newline.  This is a design
+       choice of how the round-trip safe parser represents values that is imposed
+       on the formatter.
+
+     * It is often easier to discard/ignore all separator tokens from the
+       provided token sequence and instead just yield separator tokens/str
+       where the formatter wants to place them.
+
+         - The formatter is strongly recommended to special-case formatting
+           for whitespace separators (check for `separator_token.is_whitespace`).
+
+           This is because space, tab and newline all count as valid separators
+           and can all appear in the token sequence. If the original field uses
+           a mix of these separators it is likely to completely undermine the
+           desired result. Not to mention the additional complexity of handling
+           when a separator token happens to use the newline character which
+           affects how the formatter is supposed to format what comes after it
+           (see the rules for comments, empty lines and continuation line
+           markers).
+
+     * The formatter must remember to emit a "continuation line" marker
+       (typically a single space or tab) when emitting a value after
+       a newline or a comment. A `yield " "` is sufficient.
+
+        - The continuation line marker may be embedded inside a str
+          with other whitespace (such as the newline coming before it
+          or/and whitespace used for indentation purposes following
+          the marker).
+
+     * The formatter must not cause the output to contain completely
+       empty/whitespace lines as these cause syntax errors.  The first
+       line never counts as an empty line (as it will be appended after
+       the field name).
+
+     * Tokens must be discriminated via the `token.is_value` (etc.)
+       properties. Assuming that `token.text.startswith("#")` implies a
+       comment and similar stunts are wrong.  As an example, "#foo" is a
+       perfectly valid value in some contexts.
+
+     * Comment tokens *always* take up exactly one complete line including
+       the newline character at the end of the line. They must be emitted
+       directly after a newline character or another comment token.
+
+     * Special cases that are rare but can happen:
+
+       - Fields *can* start with comments and require a formatter-provided newline.
+         (Example: "Depends:\\n# Comment here\\n foo")
+
+       - Fields *can* start on a separator or have two separators in a row.
+         This is especially true for whitespace separated fields where every
+         whitespace counts as a separator, but it can also happen with other
+         separators (such as comma).
+
+       - Value tokens can contain whitespace (for non-whitespace separators).
+         When they do, the formatter must not attempt to change nor "normalize"
+         the whitespace inside the value token as that might change how the
+         value is interpreted.  (If you want to normalize such whitespace,
+         the formatter is at the wrong abstraction level.  Instead, manipulate
+         the values directly in the value interpretation layer)
+
+    This function will provide *some* runtime checks of its input and the
+    output from the formatter to detect some errors early and provide
+    helpful diagnostics.  If you use the function for testing, you are
+    recommended to rely on verifying the output of the function rather than
+    relying on the runtime checks (as these are subject to change).
+
+    :param formatter: A formatter (see FormatterCallback for the type).
+    Basic formatting is provided via one_value_per_line_trailing_separator
+    (a formatter) or one_value_per_line_formatter (a formatter generator).
+    :param field_name: The name of the field.
+    :param separator_token: One of SPACE_SEPARATOR_FT and COMMA_SEPARATOR_FT
+    :param token_iter: An iterable of tokens to be formatted.
+
+    The following example shows how to define a formatter_callback along with
+    a few verifications.
+
+    >>> fmt_field_len_sep = one_value_per_line_trailing_separator
+    >>> fmt_shortest = one_value_per_line_formatter(
+    ...   1,
+    ...   trailing_separator=False
+    ... )
+    >>> fmt_newline_first = one_value_per_line_formatter(
+    ...   1,
+    ...   trailing_separator=False,
+    ...   immediate_empty_line=True
+    ... )
+    >>> # Omit separator tokens in the token list for simplicity (the formatter does
+    >>> # not use them, and it enables us to keep the example simple by reusing the list)
+    >>> tokens = [
+    ...     FormatterContentToken.value_token("foo"),
+    ...     FormatterContentToken.comment_token("# some comment about bar\\n"),
+    ...     FormatterContentToken.value_token("bar"),
+    ... ]
+    >>> # Starting with fmt_field_len_sep
+    >>> print(format_field(fmt_field_len_sep, "Depends", COMMA_SEPARATOR_FT, tokens), end='')
+    Depends: foo,
+    # some comment about bar
+             bar,
+    >>> print(format_field(fmt_field_len_sep, "Architecture", SPACE_SEPARATOR_FT, tokens), end='')
+    Architecture: foo
+    # some comment about bar
+                  bar
+    >>> # Control check for the special case where the field starts with a comment
+    >>> print(format_field(fmt_field_len_sep, "Depends", COMMA_SEPARATOR_FT, tokens[1:]), end='')
+    Depends:
+    # some comment about bar
+             bar,
+    >>> # Also, check single line values (to ensure it ends on a newline)
+    >>> print(format_field(fmt_field_len_sep, "Depends", COMMA_SEPARATOR_FT, tokens[2:]), end='')
+    Depends: bar,
+    >>> ### Changing format to the shortest length
+    >>> print(format_field(fmt_shortest, "Depends", COMMA_SEPARATOR_FT, tokens), end='')
+    Depends: foo,
+    # some comment about bar
+     bar
+    >>> print(format_field(fmt_shortest, "Architecture", SPACE_SEPARATOR_FT, tokens), end='')
+    Architecture: foo
+    # some comment about bar
+     bar
+    >>> # Control check for the special case where the field starts with a comment
+    >>> print(format_field(fmt_shortest, "Depends", COMMA_SEPARATOR_FT, tokens[1:]), end='')
+    Depends:
+    # some comment about bar
+     bar
+    >>> # Also, check single line values (to ensure it ends on a newline)
+    >>> print(format_field(fmt_shortest, "Depends", COMMA_SEPARATOR_FT, tokens[2:]), end='')
+    Depends: bar
+    >>> ### Changing format to the newline first format
+    >>> print(format_field(fmt_newline_first, "Depends", COMMA_SEPARATOR_FT, tokens), end='')
+    Depends:
+     foo,
+    # some comment about bar
+     bar
+    >>> print(format_field(fmt_newline_first, "Architecture", SPACE_SEPARATOR_FT, tokens), end='')
+    Architecture:
+     foo
+    # some comment about bar
+     bar
+    >>> # Control check for the special case where the field starts with a comment
+    >>> print(format_field(fmt_newline_first, "Depends", COMMA_SEPARATOR_FT, tokens[1:]), end='')
+    Depends:
+    # some comment about bar
+     bar
+    >>> # Also, check single line values (to ensure it ends on a newline)
+    >>> print(format_field(fmt_newline_first, "Depends", COMMA_SEPARATOR_FT, tokens[2:]), end='')
+    Depends:
+     bar
+    """
+    formatted_tokens = [field_name, ":"]
+    just_after_newline = False
+    last_was_value_token = False
+    if isinstance(token_iter, list):
+        # Stop people from using this to test known "invalid" cases.
+        last_token = token_iter[-1]  # NOTE(review): IndexError if the list is empty
+        if last_token.is_comment:
+            raise ValueError(
+                "Invalid token_iter: Field values cannot end with comments"
+            )
+    for token in formatter(field_name, separator_token, token_iter):
+        token_as_text = str(token)
+        # If we are given formatter tokens, then use them to verify the output.
+        if isinstance(token, FormatterContentToken):
+            if token.is_comment:
+                if not just_after_newline:
+                    raise ValueError(
+                        "Bad format: Comments must appear directly after a newline."
+                    )
+                # for the sake of ensuring people use proper test data.
+                if not token_as_text.startswith("#"):
+                    raise ValueError("Invalid Comment token: Must start with #")
+                if not token_as_text.endswith("\n"):
+                    raise ValueError("Invalid Comment token: Must end on a newline")
+            elif token.is_value:
+                if token_as_text[0].isspace() or token_as_text[-1].isspace():
+                    raise ValueError(
+                        "Invalid Value token: It cannot start nor end on whitespace"
+                    )
+                if just_after_newline:
+                    raise ValueError("Bad format: Missing continuation line marker")
+                if last_was_value_token:
+                    raise ValueError("Bad format: Formatter omitted a separator")
+
+            last_was_value_token = token.is_value
+        else:
+            last_was_value_token = False
+
+        if just_after_newline:  # line must open with non-newline whitespace or "#"
+            if token_as_text[0] in ("\r", "\n"):
+                raise ValueError("Bad format: Saw completely empty line.")
+            if not token_as_text[0].isspace() and not token_as_text.startswith("#"):
+                raise ValueError("Bad format: Saw completely empty line.")
+        formatted_tokens.append(token_as_text)
+        just_after_newline = token_as_text.endswith("\n")
+
+    formatted_text = "".join(formatted_tokens)
+    if not formatted_text.endswith("\n"):
+        raise ValueError("Bad format: The field value must end on a newline")
+    return formatted_text