diff options
Diffstat (limited to 'src/debputy/lsp/vendoring/_deb822_repro/formatter.py')
-rw-r--r-- | src/debputy/lsp/vendoring/_deb822_repro/formatter.py | 478 |
1 files changed, 478 insertions, 0 deletions
diff --git a/src/debputy/lsp/vendoring/_deb822_repro/formatter.py b/src/debputy/lsp/vendoring/_deb822_repro/formatter.py new file mode 100644 index 0000000..a2b797b --- /dev/null +++ b/src/debputy/lsp/vendoring/_deb822_repro/formatter.py @@ -0,0 +1,478 @@ +import operator + +from ._util import BufferingIterator +from .tokens import Deb822Token + +# Consider these "opaque" enum-like values. The actual value was chosen to +# make repr easier to implement, but they are subject to change. +_CONTENT_TYPE_VALUE = "is_value" +_CONTENT_TYPE_COMMENT = "is_comment" +_CONTENT_TYPE_SEPARATOR = "is_separator" + +try: + from typing import Iterator, Union, Literal + from .types import TokenOrElement, FormatterCallback +except ImportError: + pass + + +class FormatterContentToken(object): + """Typed, tagged text for use with the formatting API + + The FormatterContentToken is used by the formatting API and provides the + formatter callback with context about the textual tokens it is supposed + to format. + """ + + __slots__ = ("_text", "_content_type") + + def __init__(self, text, content_type): + # type: (str, object) -> None + self._text = text + self._content_type = content_type + + @classmethod + def from_token_or_element(cls, token_or_element): + # type: (TokenOrElement) -> FormatterContentToken + if isinstance(token_or_element, Deb822Token): + if token_or_element.is_comment: + return cls.comment_token(token_or_element.text) + if token_or_element.is_whitespace: + raise ValueError("FormatterContentType cannot be whitespace") + return cls.value_token(token_or_element.text) + # Elements are assumed to be content (this is specialized for the + # interpretations where comments are always tokens). + return cls.value_token(token_or_element.convert_to_text()) + + @classmethod + def separator_token(cls, text): + # type: (str) -> FormatterContentToken + # Special-case separators as a minor memory optimization + if text == " ": + return SPACE_SEPARATOR_FT + if text == ",": + return COMMA_SEPARATOR_FT + return cls(text, _CONTENT_TYPE_SEPARATOR) + + @classmethod + def comment_token(cls, text): + # type: (str) -> FormatterContentToken + """Generates a single comment token with the provided text + + Mostly useful for creating test cases + """ + return cls(text, _CONTENT_TYPE_COMMENT) + + @classmethod + def value_token(cls, text): + # type: (str) -> FormatterContentToken + """Generates a single value token with the provided text + + Mostly useful for creating test cases + """ + return cls(text, _CONTENT_TYPE_VALUE) + + @property + def is_comment(self): + # type: () -> bool + """True if this formatter token represent a comment + + This should be used for determining whether the token is a comment + or not. It might be tempting to check whether the text in the token + starts with a "#" but that is insufficient because a value *can* + start with that as well. Whether it is a comment or a value is + based on the context (it is a comment if and only if the "#" was + at the start of a line) but the formatter often do not have the + context available to assert this. + + The formatter *should* preserve the order of comments and interleave + between the value tokens in the same order as it see them. Failing + to preserve the order of comments and values can cause confusing + comments (such as associating the comment with a different value + than it was written for). + + The formatter *may* discard comment tokens if it does not want to + preserve them. If so, they would be omitted in the output, which + may be acceptable in some cases. This is a lot better than + re-ordering comments. + + Formatters must be aware of the following special cases for comments: + * Comments *MUST* be emitted after a newline. If the very first token + is a comment, the formatter is expected to emit a newline before it + as well (Fields cannot start immediately on a comment). + """ + return self._content_type is _CONTENT_TYPE_COMMENT + + @property + def is_value(self): + # type: () -> bool + """True if this formatter token represents a semantic value + + The formatter *MUST* preserve values as-in in its output. It may + "unpack" it from the token (as in, return it as a part of a plain + str) but the value content must not be changed nor re-ordered relative + to other value tokens (as that could change the meaning of the field). + """ + return self._content_type is _CONTENT_TYPE_VALUE + + @property + def is_separator(self): + # type: () -> bool + """True if this formatter token represents a separator token + + The formatter is not required to preserve the provided separators but it + is required to properly separate values. In fact, often is a lot easier + to discard existing separator tokens. As an example, in whitespace + separated list of values space, tab and newline all counts as separator. + However, formatting-wise, there is a world of difference between the + a space, tab and a newline. In particularly, newlines must be followed + by an additional space or tab (to act as a value continuation line) if + there is a value following it (otherwise, the generated output is + invalid). + """ + return self._content_type is _CONTENT_TYPE_SEPARATOR + + @property + def is_whitespace(self): + # type: () -> bool + """True if this formatter token represents a whitespace token""" + return self._content_type is _CONTENT_TYPE_SEPARATOR and self._text.isspace() + + @property + def text(self): + # type: () -> str + """The actual context of the token + + This field *must not* be used to determine the type of token. The + formatter cannot reliably tell whether "#..." is a comment or a value + (it can be both). Use is_value and is_comment instead for discriminating + token types. + + For value tokens, this the concrete value to be omitted. + + For comment token, this is the full comment text. + + This is the same as str(token). + """ + return self._text + + def __str__(self): + # type: () -> str + return self._text + + def __repr__(self): + # type: () -> str + return "{}({!r}, {}=True)".format( + self.__class__.__name__, self._text, self._content_type + ) + + +SPACE_SEPARATOR_FT = FormatterContentToken(" ", _CONTENT_TYPE_SEPARATOR) +COMMA_SEPARATOR_FT = FormatterContentToken(",", _CONTENT_TYPE_SEPARATOR) + + +def one_value_per_line_formatter( + indentation, # type: Union[int, Literal["FIELD_NAME_LENGTH"]] + trailing_separator=True, # type: bool + immediate_empty_line=False, # type: bool +): + # type: (...) -> FormatterCallback + """Provide a simple formatter that can handle indentation and trailing separators + + All formatters returned by this function puts exactly one value per line. This + pattern is commonly seen in the "Depends" field and similar fields of + debian/control files. + + :param indentation: Either the literal string "FIELD_NAME_LENGTH" or a positive + integer, which determines the indentation for fields. If it is an integer, + then a fixed indentation is used (notably the value 1 ensures the shortest + possible indentation). Otherwise, if it is "FIELD_NAME_LENGTH", then the + indentation is set such that it aligns the values based on the field name. + :param trailing_separator: If True, then the last value will have a trailing + separator token (e.g., ",") after it. + :param immediate_empty_line: Whether the value should always start with an + empty line. If True, then the result becomes something like "Field:\n value". + + """ + if indentation != "FIELD_NAME_LENGTH" and indentation < 1: + raise ValueError('indentation must be at least 1 (or "FIELD_NAME_LENGTH")') + + def _formatter( + name, # type: str + sep_token, # type: FormatterContentToken + formatter_tokens, # type: Iterator[FormatterContentToken] + ): + # type: (...) -> Iterator[Union[FormatterContentToken, str]] + if indentation == "FIELD_NAME_LENGTH": + indent_len = len(name) + 2 + else: + indent_len = indentation + indent = " " * indent_len + + emitted_first_line = False + tok_iter = BufferingIterator(formatter_tokens) + is_value = operator.attrgetter("is_value") + if immediate_empty_line: + emitted_first_line = True + yield "\n" + for t in tok_iter: + if t.is_comment: + if not emitted_first_line: + yield "\n" + yield t + elif t.is_value: + if not emitted_first_line: + yield " " + else: + yield indent + yield t + if not sep_token.is_whitespace and ( + trailing_separator or tok_iter.peek_find(is_value) + ): + yield sep_token + yield "\n" + else: + # Skip existing separators (etc.) + continue + emitted_first_line = True + + return _formatter + + +one_value_per_line_trailing_separator = one_value_per_line_formatter( + "FIELD_NAME_LENGTH", trailing_separator=True +) + + +def format_field( + formatter, # type: FormatterCallback + field_name, # type: str + separator_token, # type: FormatterContentToken + token_iter, # type: Iterator[FormatterContentToken] +): + # type: (...) -> str + """Format a field using a provided formatter + + This function formats a series of tokens using the provided formatter. + It can be used as a standalone formatter engine and can be used in test + suites to validate third-party formatters (enabling them to test for + corner cases without involving parsing logic). + + The formatter receives series of FormatterContentTokens (via the + token_iter) and is expected to yield one or more str or + FormatterContentTokens. The calling function will combine all of + these into a single string, which will be used as the value. + + The formatter is recommended to yield the provided value and comment + tokens interleaved with text segments of whitespace and separators + as part of its output. If it preserve comment and value tokens, the + calling function can provide some runtime checks to catch bugs + (like the formatter turning a comment into a value because it forgot + to ensure that the comment was emitted directly after a newline + character). + + When writing a formatter, please keep the following in mind: + + * The output of the formatter is appended directly after the ":" separator. + Most formatters will want to emit either a space or a newline as the very + first character for readability. + (compare "Depends:foo\\n" to "Depends: foo\\n") + + * The formatter must always end its output on a newline. This is a design + choice of how the round-trip safe parser represent values that is imposed + on the formatter. + + * It is often easier to discard/ignore all separator tokens from the + the provided token sequence and instead just yield separator tokens/str + where the formatter wants to place them. + + - The formatter is strongly recommended to special-case formatting + for whitespace separators (check for `separator_token.is_whitespace`). + + This is because space, tab and newline all counts as valid separators + and can all appear in the token sequence. If the original field uses + a mix of these separators it is likely to completely undermine the + desired result. Not to mention the additional complexity of handling + when a separator token happens to use the newline character which + affects how the formatter is supposed what comes after it + (see the rules for comments, empty lines and continuation line + markers). + + * The formatter must remember to emit a "continuation line" marker + (typically a single space or tab) when emitting a value after + a newline or a comment. A `yield " "` is sufficient. + + - The continuation line marker may be embedded inside a str + with other whitespace (such as the newline coming before it + or/and whitespace used for indentation purposes following + the marker). + + * The formatter must not cause the output to contain completely + empty/whitespace lines as these cause syntax errors. The first + line never counts as an empty line (as it will be appended after + the field name). + + * Tokens must be discriminated via the `token.is_value` (etc.) + properties. Assuming that `token.text.startswith("#")` implies a + comment and similar stunts are wrong. As an example, "#foo" is a + perfectly valid value in some contexts. + + * Comment tokens *always* take up exactly one complete line including + the newline character at the end of the line. They must be emitted + directly after a newline character or another comment token. + + * Special cases that are rare but can happen: + + - Fields *can* start with comments and requires a formatter provided newline. + (Example: "Depends:\\n# Comment here\\n foo") + + - Fields *can* start on a separator or have two separators in a row. + This is especially true for whitespace separated fields where every + whitespace counts as a separator, but it can also happen with other + separators (such as comma). + + - Value tokens can contain whitespace (for non-whitespace separators). + When they do, the formatter must not attempt change nor "normalize" + the whitespace inside the value token as that might change how the + value is interpreted. (If you want to normalize such whitespace, + the formatter is at the wrong abstraction level. Instead, manipulate + the values directly in the value interpretation layer) + + This function will provide *some* runtime checks of its input and the + output from the formatter to detect some errors early and provide + helpful diagnostics. If you use the function for testing, you are + recommended to rely on verifying the output of the function rather than + relying on the runtime checks (as these are subject to change). + + :param formatter: A formatter (see FormatterCallback for the type). + Basic formatting is provided via one_value_per_line_trailing_separator + (a formatter) or one_value_per_line_formatter (a formatter generator). + :param field_name: The name of the field. + :param separator_token: One of SPACE_SEPARATOR and COMMA_SEPARATOR + :param token_iter: An iterable of tokens to be formatted. + + The following example shows how to define a formatter_callback along with + a few verifications. + + >>> fmt_field_len_sep = one_value_per_line_trailing_separator + >>> fmt_shortest = one_value_per_line_formatter( + ... 1, + ... trailing_separator=False + ... ) + >>> fmt_newline_first = one_value_per_line_formatter( + ... 1, + ... trailing_separator=False, + ... immediate_empty_line=True + ... ) + >>> # Omit separator tokens for in the token list for simplicity (the formatter does + >>> # not use them, and it enables us to keep the example simple by reusing the list) + >>> tokens = [ + ... FormatterContentToken.value_token("foo"), + ... FormatterContentToken.comment_token("# some comment about bar\\n"), + ... FormatterContentToken.value_token("bar"), + ... ] + >>> # Starting with fmt_dl_ts + >>> print(format_field(fmt_field_len_sep, "Depends", COMMA_SEPARATOR_FT, tokens), end='') + Depends: foo, + # some comment about bar + bar, + >>> print(format_field(fmt_field_len_sep, "Architecture", SPACE_SEPARATOR_FT, tokens), end='') + Architecture: foo + # some comment about bar + bar + >>> # Control check for the special case where the field starts with a comment + >>> print(format_field(fmt_field_len_sep, "Depends", COMMA_SEPARATOR_FT, tokens[1:]), end='') + Depends: + # some comment about bar + bar, + >>> # Also, check single line values (to ensure it ends on a newline) + >>> print(format_field(fmt_field_len_sep, "Depends", COMMA_SEPARATOR_FT, tokens[2:]), end='') + Depends: bar, + >>> ### Changing format to the shortest length + >>> print(format_field(fmt_shortest, "Depends", COMMA_SEPARATOR_FT, tokens), end='') + Depends: foo, + # some comment about bar + bar + >>> print(format_field(fmt_shortest, "Architecture", SPACE_SEPARATOR_FT, tokens), end='') + Architecture: foo + # some comment about bar + bar + >>> # Control check for the special case where the field starts with a comment + >>> print(format_field(fmt_shortest, "Depends", COMMA_SEPARATOR_FT, tokens[1:]), end='') + Depends: + # some comment about bar + bar + >>> # Also, check single line values (to ensure it ends on a newline) + >>> print(format_field(fmt_shortest, "Depends", COMMA_SEPARATOR_FT, tokens[2:]), end='') + Depends: bar + >>> ### Changing format to the newline first format + >>> print(format_field(fmt_newline_first, "Depends", COMMA_SEPARATOR_FT, tokens), end='') + Depends: + foo, + # some comment about bar + bar + >>> print(format_field(fmt_newline_first, "Architecture", SPACE_SEPARATOR_FT, tokens), end='') + Architecture: + foo + # some comment about bar + bar + >>> # Control check for the special case where the field starts with a comment + >>> print(format_field(fmt_newline_first, "Depends", COMMA_SEPARATOR_FT, tokens[1:]), end='') + Depends: + # some comment about bar + bar + >>> # Also, check single line values (to ensure it ends on a newline) + >>> print(format_field(fmt_newline_first, "Depends", COMMA_SEPARATOR_FT, tokens[2:]), end='') + Depends: + bar + """ + formatted_tokens = [field_name, ":"] + just_after_newline = False + last_was_value_token = False + if isinstance(token_iter, list): + # Stop people from using this to test known "invalid" cases. + last_token = token_iter[-1] + if last_token.is_comment: + raise ValueError( + "Invalid token_iter: Field values cannot end with comments" + ) + for token in formatter(field_name, separator_token, token_iter): + token_as_text = str(token) + # If we are given formatter tokens, then use them to verify the output. + if isinstance(token, FormatterContentToken): + if token.is_comment: + if not just_after_newline: + raise ValueError( + "Bad format: Comments must appear directly after a newline." + ) + # for the sake of ensuring people use proper test data. + if not token_as_text.startswith("#"): + raise ValueError("Invalid Comment token: Must start with #") + if not token_as_text.endswith("\n"): + raise ValueError("Invalid Comment token: Must end on a newline") + elif token.is_value: + if token_as_text[0].isspace() or token_as_text[-1].isspace(): + raise ValueError( + "Invalid Value token: It cannot start nor end on whitespace" + ) + if just_after_newline: + raise ValueError("Bad format: Missing continuation line marker") + if last_was_value_token: + raise ValueError("Bad format: Formatter omitted a separator") + + last_was_value_token = token.is_value + else: + last_was_value_token = False + + if just_after_newline: + if token_as_text[0] in ("\r", "\n"): + raise ValueError("Bad format: Saw completely empty line.") + if not token_as_text[0].isspace() and not token_as_text.startswith("#"): + raise ValueError("Bad format: Saw completely empty line.") + formatted_tokens.append(token_as_text) + just_after_newline = token_as_text.endswith("\n") + + formatted_text = "".join(formatted_tokens) + if not formatted_text.endswith("\n"): + raise ValueError("Bad format: The field value must end on a newline") + return formatted_text |