Coverage for src/debputy/lsp/vendoring/_deb822_repro/tokens.py: 83% (230 statements)
coverage.py v7.2.7, created at 2024-04-07 12:14 +0200
import re
import sys
import weakref
from weakref import ReferenceType

from ._util import BufferingIterator
from .locatable import (
    Locatable,
    START_POSITION,
    Range,
    ONE_CHAR_RANGE,
    ONE_LINE_RANGE,
    Position,
)
from debian._util import resolve_ref, _strI

try:
    from typing import Optional, cast, TYPE_CHECKING, Iterable, Union, Dict, Callable
except ImportError:
    # pylint: disable=unnecessary-lambda-assignment
    TYPE_CHECKING = False
    cast = lambda t, v: v

if TYPE_CHECKING:
    from .parsing import Deb822Element


# Consume whitespace and a single word.
_RE_WHITESPACE_SEPARATED_WORD_LIST = re.compile(
    r"""
    (?P<space_before>\s*)  # Consume any whitespace before the word
                           # The space only occurs in practice if the line starts
                           # with space.

                           # Optionally consume a word (needed to handle the case
                           # when there are no words left and someone applies this
                           # pattern to the remaining text). This is mostly here as
                           # a fail-safe.

    (?P<word>\S+)  # Consume the word (if present)
    (?P<trailing_whitespace>\s*)  # Consume trailing whitespace
""",
    re.VERBOSE,
)
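

# Illustrative sketch (a hypothetical `_example_*` helper, not part of the vendored
# module): how _RE_WHITESPACE_SEPARATED_WORD_LIST splits a space-separated value.
def _example_whitespace_separated_word_list():
    # type: () -> None
    matches = list(_RE_WHITESPACE_SEPARATED_WORD_LIST.finditer(" amd64  i386"))
    # Each match carries the word plus the whitespace around it, so the matches
    # jointly cover the entire input string.
    assert [m.group("word") for m in matches] == ["amd64", "i386"]
    assert "".join(m.group(0) for m in matches) == " amd64  i386"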


_RE_COMMA_SEPARATED_WORD_LIST = re.compile(
    r"""
    # This regex is slightly complicated by the fact that it should work with
    # finditer and consume the entire value.
    #
    # To do this, we structure the regex so it always starts on a comma (except
    # for the first iteration, where we permit the absence of a comma)

    (?:  # Optional space followed by a mandatory comma unless
         # it is the start of the "line" (in which case, we
         # allow the comma to be omitted)
        ^
        |
        (?:
            (?P<space_before_comma>\s*)  # This space only occurs in practice if the line
                                         # starts with space + comma.
            (?P<comma> ,)
        )
    )

    # From here it is "optional space, maybe a word and then optional space" again. One reason why
    # all of it is optional is to gracefully cope with trailing commas.
    (?P<space_before_word>\s*)
    (?P<word> [^,\s] (?: [^,]*[^,\s])? )?  # "Words" can contain spaces for comma separated list.
                                           # But surrounding whitespace is ignored
    (?P<space_after_word>\s*)
""",
    re.VERBOSE,
)
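

# Illustrative sketch (a hypothetical `_example_*` helper, not part of the vendored
# module): how _RE_COMMA_SEPARATED_WORD_LIST iterates a comma-separated value,
# including a trailing comma (which yields a match with no word).
def _example_comma_separated_word_list():
    # type: () -> None
    matches = list(_RE_COMMA_SEPARATED_WORD_LIST.finditer("foo, bar ,"))
    assert [m.group("word") for m in matches] == ["foo", "bar", None]
    assert "".join(m.group(0) for m in matches) == "foo, bar ,"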


# From Policy 5.1:
#
# The field name is composed of US-ASCII characters excluding control
# characters, space, and colon (i.e., characters in the ranges U+0021
# (!) through U+0039 (9), and U+003B (;) through U+007E (~),
# inclusive). Field names must not begin with the comment character
# (U+0023 #), nor with the hyphen character (U+002D -).
#
# That combines to this regex of questionable readability
_RE_FIELD_LINE = re.compile(
    r"""
    ^                                          # Start of line
    (?P<field_name>                            # Capture group for the field name
        [\x21\x22\x24-\x2C\x2F-\x39\x3B-\x7F]  # First character
        [\x21-\x39\x3B-\x7F]*                  # Subsequent characters (if any)
    )
    (?P<separator> : )
    (?P<space_before_value> \s* )
    (?:                                        # Field values are not mandatory on the same line
                                               # as the field name.

        (?P<value> \S(?:.*\S)? )               # Values must start and end on a "non-space"
        (?P<space_after_value> \s* )           # We can have optional space after the value
    )?
""",
    re.VERBOSE,
)
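

# Illustrative sketch (a hypothetical `_example_*` helper, not part of the vendored
# module): matching a field line and pulling out the named groups.
def _example_field_line_match():
    # type: () -> None
    m = _RE_FIELD_LINE.match("Depends: foo (>= 1.0)\n")
    assert m is not None
    assert m.group("field_name") == "Depends"
    assert m.group("value") == "foo (>= 1.0)"
    # Lines starting with the comment character are not field lines.
    assert _RE_FIELD_LINE.match("#Depends: foo\n") is None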


class Deb822Token(Locatable):
    """A token is an atomic syntactical element from a deb822 file

    A file is parsed into a series of tokens. If these tokens are converted to
    text in exactly the same order, you get exactly the same file - bit-for-bit.
    Accordingly every bit of text in a file must be assigned to exactly one
    Deb822Token.
    """

    __slots__ = ("_text", "_parent_element", "_token_size", "__weakref__")

    def __init__(self, text):
        # type: (str) -> None
        if text == "":  # pragma: no cover
            raise ValueError("Tokens must have content")
        self._text = text  # type: str
        self._parent_element = None  # type: Optional[ReferenceType['Deb822Element']]
        self._token_size = None  # type: Optional[Range]
        self._verify_token_text()

    def __repr__(self):
        # type: () -> str
        return "{clsname}('{text}')".format(
            clsname=self.__class__.__name__, text=self._text.replace("\n", "\\n")
        )

    def _verify_token_text(self):
        # type: () -> None
        if "\n" in self._text:
            is_single_line_token = False
            if self.is_comment or self.is_error:
                is_single_line_token = True
            if not is_single_line_token and not self.is_whitespace:
                raise ValueError(
                    "Only whitespace, error and comment tokens may contain newlines"
                )
            if not self.text.endswith("\n"):
                raise ValueError("Tokens containing whitespace must end on a newline")
            if is_single_line_token and "\n" in self.text[:-1]:
                raise ValueError(
                    "Comments and error tokens must not contain embedded newlines"
                    " (only end on one)"
                )

    @property
    def is_whitespace(self):
        # type: () -> bool
        return False

    @property
    def is_comment(self):
        # type: () -> bool
        return False

    @property
    def is_error(self):
        # type: () -> bool
        return False

    @property
    def text(self):
        # type: () -> str
        return self._text

    # To support callers that want a simple interface for converting tokens and elements to text
    def convert_to_text(self):
        # type: () -> str
        return self._text

    def size(self, *, skip_leading_comments: bool = False) -> Range:
        # Tokens are an atomic unit, so their size can be computed once and cached.
        token_size = self._token_size
        if token_size is not None:
            return token_size
        token_len = len(self._text)
        if token_len == 1:
            # The local variable indirection is here because mypy otherwise gets
            # confused and thinks that `token_size` cannot have any type at all.
            token_size = ONE_CHAR_RANGE if self._text != "\n" else ONE_LINE_RANGE
        else:
            new_lines = self._text.count("\n")
            assert not new_lines or self._text[-1] == "\n"
            end_pos = Position(new_lines, 0) if new_lines else Position(0, token_len)
            token_size = Range(START_POSITION, end_pos)
        self._token_size = token_size
        return token_size

    @property
    def parent_element(self):
        # type: () -> Optional[Deb822Element]
        return resolve_ref(self._parent_element)

    @parent_element.setter
    def parent_element(self, new_parent):
        # type: (Optional[Deb822Element]) -> None
        self._parent_element = (
            weakref.ref(new_parent) if new_parent is not None else None
        )

    def clear_parent_if_parent(self, parent):
        # type: (Deb822Element) -> None
        if parent is self.parent_element:
            self._parent_element = None
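

# Illustrative sketch (a hypothetical `_example_*` helper, not part of the vendored
# module): size() is computed once per token and re-used on later calls.
def _example_token_size_is_cached():
    # type: () -> None
    token = Deb822Token("debputy")
    assert token.size() is token.size()  # the computed Range is memoized
    # Single-character tokens (other than newlines) share a pre-built Range constant.
    assert Deb822Token("x").size() is ONE_CHAR_RANGE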


class Deb822WhitespaceToken(Deb822Token):
    """The token is a kind of whitespace.

    Some whitespace tokens are critical for the format (such as the Deb822ValueContinuationToken,
    the spaces that separate words in a space-separated list, or newlines), while other whitespace
    tokens are truly insignificant (space before a newline, space after a comma in a comma
    list, etc.).
    """

    __slots__ = ()

    @property
    def is_whitespace(self):
        # type: () -> bool
        return True


class Deb822SemanticallySignificantWhiteSpace(Deb822WhitespaceToken):
    """Whitespace that (if removed) would change the meaning of the file (or cause syntax errors)"""

    __slots__ = ()


class Deb822NewlineAfterValueToken(Deb822SemanticallySignificantWhiteSpace):
    """The newline after a value token.

    If not followed by a continuation token, this also marks the end of the field.
    """

    __slots__ = ()

    def __init__(self):
        # type: () -> None
        super().__init__("\n")


class Deb822ValueContinuationToken(Deb822SemanticallySignificantWhiteSpace):
    """The whitespace denoting a value spanning an additional line (the first space on a line)"""

    __slots__ = ()


class Deb822SpaceSeparatorToken(Deb822SemanticallySignificantWhiteSpace):
    """Whitespace between values in a space list (e.g. "Architectures")"""

    __slots__ = ()


class Deb822ErrorToken(Deb822Token):
    """Token that represents a syntactical error"""

    __slots__ = ()

    @property
    def is_error(self):
        # type: () -> bool
        return True


class Deb822CommentToken(Deb822Token):

    __slots__ = ()

    @property
    def is_comment(self):
        # type: () -> bool
        return True


class Deb822FieldNameToken(Deb822Token):

    __slots__ = ()

    def __init__(self, text):
        # type: (str) -> None
        if not isinstance(text, _strI):
            text = _strI(sys.intern(text))
        super().__init__(text)

    @property
    def text(self):
        # type: () -> _strI
        return cast("_strI", self._text)


# The colon after the field name, parenthesis, etc.
class Deb822SeparatorToken(Deb822Token):

    __slots__ = ()


class Deb822FieldSeparatorToken(Deb822Token):

    __slots__ = ()

    def __init__(self):
        # type: () -> None
        super().__init__(":")


class Deb822CommaToken(Deb822SeparatorToken):
    """Used by the comma-separated list value parsers to denote a comma between two value tokens."""

    __slots__ = ()

    def __init__(self):
        # type: () -> None
        super().__init__(",")


class Deb822PipeToken(Deb822SeparatorToken):
    """Used in some dependency fields as OR relation"""

    __slots__ = ()

    def __init__(self):
        # type: () -> None
        super().__init__("|")


class Deb822ValueToken(Deb822Token):
    """A field value can be split into multiple "Deb822ValueToken"s (as well as separator tokens)"""

    __slots__ = ()


class Deb822ValueDependencyToken(Deb822Token):
    """Package name, architecture name, a version number, or a profile name in a dependency field"""

    __slots__ = ()


class Deb822ValueDependencyVersionRelationOperatorToken(Deb822Token):

    __slots__ = ()
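

# Illustrative sketch (a hypothetical `_example_*` helper, not part of the vendored
# module): concrete tokens carry their exact text, so concatenating them
# reproduces the original line byte-for-byte.
def _example_tokens_reproduce_text():
    # type: () -> None
    tokens = [
        Deb822FieldNameToken("Package"),
        Deb822FieldSeparatorToken(),
        Deb822WhitespaceToken(" "),
        Deb822ValueToken("debputy"),
        Deb822NewlineAfterValueToken(),
    ]
    assert "".join(t.text for t in tokens) == "Package: debputy\n"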


def tokenize_deb822_file(sequence, encoding="utf-8"):
    # type: (Iterable[Union[str, bytes]], str) -> Iterable[Deb822Token]
    """Tokenize a deb822 file

    :param sequence: An iterable of lines (a file open for reading will do)
    :param encoding: The encoding to use (this is here to support Deb822-like
        APIs, new code should not use this parameter).
    """
    current_field_name = None
    field_name_cache = {}  # type: Dict[str, _strI]

    def _normalize_input(s):
        # type: (Iterable[Union[str, bytes]]) -> Iterable[str]
        for x in s:
            if isinstance(x, bytes):
                x = x.decode(encoding)
            if not x.endswith("\n"):
                # We always end on a newline because it makes a lot of code simpler. The pain
                # points relate to mutations that add content after the last field. Sadly, these
                # mutations can happen via adding fields, reordering fields, etc. and are too hard
                # to track to make it worth supporting the special case of a missing newline at
                # the end of the file.
                x += "\n"
            yield x

    text_stream = BufferingIterator(
        _normalize_input(sequence)
    )  # type: BufferingIterator[str]

    for line in text_stream:
        if line.isspace():
            if current_field_name:
                # Blank lines terminate fields
                current_field_name = None

            # If there are multiple whitespace-only lines, we combine them
            # into one token.
            r = list(text_stream.takewhile(str.isspace))
            if r:
                line += "".join(r)

            # whitespace tokens are likely to have duplicate cases (like
            # single newline tokens), so we intern the strings here.
            yield Deb822WhitespaceToken(sys.intern(line))
            continue

        if line[0] == "#":
            yield Deb822CommentToken(line)
            continue

        if line[0] in (" ", "\t"):
            if current_field_name is not None:
                # We emit a separate whitespace token for the newline as it makes some
                # things easier later (see _build_value_line)
                leading = sys.intern(line[0])
                # Pull out the leading space and newline
                line = line[1:-1]
                yield Deb822ValueContinuationToken(leading)
                yield Deb822ValueToken(line)
                yield Deb822NewlineAfterValueToken()
            else:
                yield Deb822ErrorToken(line)
            continue

        field_line_match = _RE_FIELD_LINE.match(line)
        if field_line_match:
            # The line is a field, which means there is a bit to unpack
            # - note that by definition, leading and trailing whitespace is insignificant
            #   on the value part directly after the field separator
            (field_name, _, space_before, value, space_after) = (
                field_line_match.groups()
            )

            current_field_name = field_name_cache.get(field_name)

            if value is None or value == "":
                # If there is no value, then merge the two space elements into space_after
                # as it makes it easier to handle the newline.
                space_after = (
                    space_before + space_after if space_after else space_before
                )
                space_before = ""

            if space_after:
                # We emit a separate whitespace token for the newline as it makes some
                # things easier later (see _build_value_line)
                if space_after.endswith("\n"):
                    space_after = space_after[:-1]

            if current_field_name is None:
                field_name = sys.intern(field_name)
                current_field_name = _strI(field_name)
                field_name_cache[field_name] = current_field_name

            # We use current_field_name from here as it is a _strI.
            # Delete field_name to avoid accidentally using it and getting bugs
            # that should not happen.
            del field_name

            yield Deb822FieldNameToken(current_field_name)
            yield Deb822FieldSeparatorToken()
            if space_before:
                yield Deb822WhitespaceToken(sys.intern(space_before))
            if value:
                yield Deb822ValueToken(value)
            if space_after:
                yield Deb822WhitespaceToken(sys.intern(space_after))
            yield Deb822NewlineAfterValueToken()
        else:
            yield Deb822ErrorToken(line)
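

# Illustrative sketch (a hypothetical `_example_*` helper, not part of the vendored
# module): tokenizing a small deb822 paragraph and re-assembling it losslessly.
def _example_tokenize_roundtrip():
    # type: () -> None
    text = "Package: debputy\n# A comment\nArchitecture: all\n linux-any\n"
    tokens = list(tokenize_deb822_file(text.splitlines(keepends=True)))
    assert "".join(t.text for t in tokens) == text
    assert not any(t.is_error for t in tokens)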


def _value_line_tokenizer(func):
    # type: (Callable[[str], Iterable[Deb822Token]]) -> (Callable[[str], Iterable[Deb822Token]])
    def impl(v):
        # type: (str) -> Iterable[Deb822Token]
        first_line = True
        for no, line in enumerate(v.splitlines(keepends=True)):
            assert not v.isspace() or no == 0
            if line.startswith("#"):
                yield Deb822CommentToken(line)
                continue
            has_newline = False
            continuation_line_marker = None
            if not first_line:
                continuation_line_marker = line[0]
                line = line[1:]
            first_line = False
            if line.endswith("\n"):
                has_newline = True
                line = line[:-1]
            if continuation_line_marker is not None:
                yield Deb822ValueContinuationToken(sys.intern(continuation_line_marker))
            yield from func(line)
            if has_newline:
                yield Deb822NewlineAfterValueToken()

    return impl


@_value_line_tokenizer
def whitespace_split_tokenizer(v):
    # type: (str) -> Iterable[Deb822Token]
    assert "\n" not in v
    for match in _RE_WHITESPACE_SEPARATED_WORD_LIST.finditer(v):
        space_before, word, space_after = match.groups()
        if space_before:
            yield Deb822SpaceSeparatorToken(sys.intern(space_before))
        yield Deb822ValueToken(word)
        if space_after:
            yield Deb822SpaceSeparatorToken(sys.intern(space_after))
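

# Illustrative sketch (a hypothetical `_example_*` helper, not part of the vendored
# module): whitespace_split_tokenizer handles continuation lines via
# _value_line_tokenizer and keeps the value reconstructible.
def _example_whitespace_split_tokenizer():
    # type: () -> None
    value = "amd64 i386\n any\n"
    tokens = list(whitespace_split_tokenizer(value))
    assert "".join(t.text for t in tokens) == value
    words = [t.text for t in tokens if isinstance(t, Deb822ValueToken)]
    assert words == ["amd64", "i386", "any"]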


@_value_line_tokenizer
def comma_split_tokenizer(v):
    # type: (str) -> Iterable[Deb822Token]
    assert "\n" not in v
    for match in _RE_COMMA_SEPARATED_WORD_LIST.finditer(v):
        space_before_comma, comma, space_before_word, word, space_after_word = (
            match.groups()
        )
        if space_before_comma:
            yield Deb822WhitespaceToken(sys.intern(space_before_comma))
        if comma:
            yield Deb822CommaToken()
        if space_before_word:
            yield Deb822WhitespaceToken(sys.intern(space_before_word))
        if word:
            yield Deb822ValueToken(word)
        if space_after_word:
            yield Deb822WhitespaceToken(sys.intern(space_after_word))
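

# Illustrative sketch (a hypothetical `_example_*` helper, not part of the vendored
# module): comma_split_tokenizer keeps "words" containing spaces intact and copes
# with a trailing comma.
def _example_comma_split_tokenizer():
    # type: () -> None
    value = "foo (>= 1.0), bar,\n"
    tokens = list(comma_split_tokenizer(value))
    assert "".join(t.text for t in tokens) == value
    words = [t.text for t in tokens if isinstance(t, Deb822ValueToken)]
    assert words == ["foo (>= 1.0)", "bar"]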