Coverage for src/debputy/lsp/vendoring/_deb822_repro/tokens.py: 83%

230 statements  

coverage.py v7.2.7, created at 2024-04-07 12:14 +0200

1import re 

2import sys 

3import weakref 

4from weakref import ReferenceType 

5 

6from ._util import BufferingIterator 

7from .locatable import ( 

8 Locatable, 

9 START_POSITION, 

10 Range, 

11 ONE_CHAR_RANGE, 

12 ONE_LINE_RANGE, 

13 Position, 

14) 

15from debian._util import resolve_ref, _strI 

16 

17try: 

18 from typing import Optional, cast, TYPE_CHECKING, Iterable, Union, Dict, Callable 

19except ImportError: 

20 # pylint: disable=unnecessary-lambda-assignment 

21 TYPE_CHECKING = False 

22 cast = lambda t, v: v 

23 

24if TYPE_CHECKING: 

25 from .parsing import Deb822Element 

26 

27 

28# Consume whitespace and a single word. 

29_RE_WHITESPACE_SEPARATED_WORD_LIST = re.compile( 

30 r""" 

31 (?P<space_before>\s*) # Consume any whitespace before the word 

32 # The space only occurs in practice if the line starts 

33 # with space. 

34 

35 # Optionally consume a word (needed to handle the case 

36 # when there are no words left and someone applies this 

37 # pattern to the remaining text). This is mostly here as 

38 # a fail-safe. 

39 

40 (?P<word>\S+) # Consume the word (if present) 

41 (?P<trailing_whitespace>\s*) # Consume trailing whitespace 

42""", 

43 re.VERBOSE, 

44) 
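# Illustrative sketch (not part of the original module): how the pattern above
# behaves with finditer().  Each match captures one word plus the exact
# surrounding whitespace, which is what lets the tokenizer round-trip the input.
def _example_whitespace_word_list():  # pragma: no cover - illustration only
    parts = [
        (m.group("space_before"), m.group("word"), m.group("trailing_whitespace"))
        for m in _RE_WHITESPACE_SEPARATED_WORD_LIST.finditer(" amd64  i386")
    ]
    assert parts == [(" ", "amd64", "  "), ("", "i386", "")]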

45_RE_COMMA_SEPARATED_WORD_LIST = re.compile( 

46 r""" 

47 # This regex is slightly complicated by the fact that it should work with 

48 # finditer and consume the entire value. 

49 # 

50 # To do this, we structure the regex so it always starts on a comma (except 

51 # for the first iteration, where we permit the absence of a comma) 

52 

53 (?: # Optional space followed by a mandatory comma unless 

54 # it is the start of the "line" (in which case, we 

55 # allow the comma to be omitted) 

56 ^ 

57 | 

58 (?: 

59 (?P<space_before_comma>\s*) # This space only occurs in practice if the line 

60 # starts with space + comma. 

61 (?P<comma> ,) 

62 ) 

63 ) 

64 

65 # From here it is "optional space, maybe a word and then optional space" again. One reason why 

66 # all of it is optional is to gracefully cope with trailing commas. 

67 (?P<space_before_word>\s*) 

68 (?P<word> [^,\s] (?: [^,]*[^,\s])? )? # "Words" can contain spaces in a comma-separated list, 

69 # but the surrounding whitespace is ignored. 

70 (?P<space_after_word>\s*) 

71""", 

72 re.VERBOSE, 

73) 
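# Illustrative sketch (not part of the original module): with finditer(), the
# first match takes the "^" branch (no comma) and every later match starts at a
# comma; words in a comma-separated list may contain internal spaces.
def _example_comma_word_list():  # pragma: no cover - illustration only
    groups = [m.groups() for m in _RE_COMMA_SEPARATED_WORD_LIST.finditer("foo, bar baz")]
    assert groups == [
        (None, None, "", "foo", ""),    # "^" branch; the comma groups did not participate
        ("", ",", " ", "bar baz", ""),  # comma branch; the word keeps its internal space
    ]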

74 

75# From Policy 5.1: 

76# 

77# The field name is composed of US-ASCII characters excluding control 

78# characters, space, and colon (i.e., characters in the ranges U+0021 

79# (!) through U+0039 (9), and U+003B (;) through U+007E (~), 

80# inclusive). Field names must not begin with the comment character 

81# (U+0023 #), nor with the hyphen character (U+002D -). 

82# 

83# That combines to this regex of questionable readability 

84_RE_FIELD_LINE = re.compile( 

85 r""" 

86 ^ # Start of line 

87 (?P<field_name> # Capture group for the field name 

88 [\x21\x22\x24-\x2C\x2F-\x39\x3B-\x7F] # First character 

89 [\x21-\x39\x3B-\x7F]* # Subsequent characters (if any) 

90 ) 

91 (?P<separator> : ) 

92 (?P<space_before_value> \s* ) 

93 (?: # Field values are not mandatory on the same line 

94 # as the field name. 

95 

96 (?P<value> \S(?:.*\S)? ) # Values must start and end on a "non-space" 

97 (?P<space_after_value> \s* ) # We can have optional space after the value 

98 )? 

99""", 

100 re.VERBOSE, 

101) 
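# Illustrative sketch (not part of the original module): a well-formed field line
# is unpacked into five groups: field name, the ":" separator, optional leading
# whitespace, the value, and trailing whitespace (including the newline).
def _example_field_line():  # pragma: no cover - illustration only
    m = _RE_FIELD_LINE.match("Package: debputy\n")
    assert m is not None
    assert m.groups() == ("Package", ":", " ", "debputy", "\n")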

102 

103 

104class Deb822Token(Locatable): 

105 """A token is an atomic syntactical element from a deb822 file 

106 

107 A file is parsed into a series of tokens. If these tokens are converted to 

108 text in exactly the same order, you get exactly the same file - bit-for-bit. 

109 Accordingly, every bit of text in a file must be assigned to exactly one 

110 Deb822Token. 

111 """ 

112 

113 __slots__ = ("_text", "_parent_element", "_token_size", "__weakref__") 

114 

115 def __init__(self, text): 

116 # type: (str) -> None 

117 if text == "": # pragma: no cover 

118 raise ValueError("Tokens must have content") 

119 self._text = text # type: str 

120 self._parent_element = None # type: Optional[ReferenceType['Deb822Element']] 

121 self._token_size = None # type: Optional[Range] 

122 self._verify_token_text() 

123 

124 def __repr__(self): 

125 # type: () -> str 

126 return "{clsname}('{text}')".format( 

127 clsname=self.__class__.__name__, text=self._text.replace("\n", "\\n") 

128 ) 

129 

130 def _verify_token_text(self): 

131 # type: () -> None 

132 if "\n" in self._text: 

133 is_single_line_token = False 

134 if self.is_comment or self.is_error: 

135 is_single_line_token = True 

136 if not is_single_line_token and not self.is_whitespace:  # coverage: 136 ↛ 137 (condition on line 136 was never true)

137 raise ValueError( 

138 "Only whitespace, error and comment tokens may contain newlines" 

139 ) 

140 if not self.text.endswith("\n"):  # coverage: 140 ↛ 141 (condition on line 140 was never true)

141 raise ValueError("Tokens containing whitespace must end on a newline") 

142 if is_single_line_token and "\n" in self.text[:-1]:  # coverage: 142 ↛ 143 (condition on line 142 was never true)

143 raise ValueError( 

144 "Comments and error tokens must not contain embedded newlines" 

145 " (only end on one)" 

146 ) 

147 

148 @property 

149 def is_whitespace(self): 

150 # type: () -> bool 

151 return False 

152 

153 @property 

154 def is_comment(self): 

155 # type: () -> bool 

156 return False 

157 

158 @property 

159 def is_error(self): 

160 # type: () -> bool 

161 return False 

162 

163 @property 

164 def text(self): 

165 # type: () -> str 

166 return self._text 

167 

168 # To support callers that want a simple interface for converting tokens and elements to text 

169 def convert_to_text(self): 

170 # type: () -> str 

171 return self._text 

172 

173 def size(self, *, skip_leading_comments: bool = False) -> Range: 

174 # Tokens are atomic units, so the size is computed directly from the text and cached. 

175 token_size = self._token_size 

176 if token_size is not None: 

177 return token_size 

178 token_len = len(self._text) 

179 if token_len == 1: 

180 # Single-character tokens reuse the shared ONE_CHAR_RANGE / ONE_LINE_RANGE 

181 # constants instead of allocating a new Range for each token. 

182 token_size = ONE_CHAR_RANGE if self._text != "\n" else ONE_LINE_RANGE 

183 else: 

184 new_lines = self._text.count("\n") 

185 assert not new_lines or self._text[-1] == "\n" 

186 end_pos = Position(new_lines, 0) if new_lines else Position(0, token_len) 

187 token_size = Range(START_POSITION, end_pos) 

188 self._token_size = token_size 

189 return token_size 

190 

191 @property 

192 def parent_element(self): 

193 # type: () -> Optional[Deb822Element] 

194 return resolve_ref(self._parent_element) 

195 

196 @parent_element.setter 

197 def parent_element(self, new_parent): 

198 # type: (Optional[Deb822Element]) -> None 

199 self._parent_element = ( 

200 weakref.ref(new_parent) if new_parent is not None else None 

201 ) 

202 

203 def clear_parent_if_parent(self, parent): 

204 # type: (Deb822Element) -> None 

205 if parent is self.parent_element: 

206 self._parent_element = None 

207 

208 
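# Illustrative sketch (not part of the original module): what size() reports for a
# couple of tokens.  The equality check assumes Range and Position compare by value
# (NamedTuple-style), as they do in python-debian's locatable module.
def _example_token_size():  # pragma: no cover - illustration only
    assert Deb822Token(":").size() is ONE_CHAR_RANGE  # single character, not a newline
    assert Deb822Token("amd64").size() == Range(START_POSITION, Position(0, 5))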

209class Deb822WhitespaceToken(Deb822Token): 

210 """The token is a kind of whitespace. 

211 

212 Some whitespace tokens are critical for the format (such as the Deb822ValueContinuationToken, 

213 spaces that separate words in a space-separated list, or newlines), while other whitespace 

214 tokens are truly insignificant (space before a newline, space after a comma in a comma-separated 

215 list, etc.). 

216 """ 

217 

218 __slots__ = () 

219 

220 @property 

221 def is_whitespace(self): 

222 # type: () -> bool 

223 return True 

224 

225 

226class Deb822SemanticallySignificantWhiteSpace(Deb822WhitespaceToken): 

227 """Whitespace that (if removed) would change the meaning of the file (or cause syntax errors)""" 

228 

229 __slots__ = () 

230 

231 

232class Deb822NewlineAfterValueToken(Deb822SemanticallySignificantWhiteSpace): 

233 """The newline after a value token. 

234 

235 If not followed by a continuation token, this also marks the end of the field. 

236 """ 

237 

238 __slots__ = () 

239 

240 def __init__(self): 

241 # type: () -> None 

242 super().__init__("\n") 

243 

244 

245class Deb822ValueContinuationToken(Deb822SemanticallySignificantWhiteSpace): 

246 """The whitespace denoting a value spanning an additional line (the first space on a line)""" 

247 

248 __slots__ = () 

249 

250 

251class Deb822SpaceSeparatorToken(Deb822SemanticallySignificantWhiteSpace): 

252 """Whitespace between values in a space list (e.g. "Architectures")""" 

253 

254 __slots__ = () 

255 

256 

257class Deb822ErrorToken(Deb822Token): 

258 """Token that represents a syntactical error""" 

259 

260 __slots__ = () 

261 

262 @property 

263 def is_error(self): 

264 # type: () -> bool 

265 return True 

266 

267 

268class Deb822CommentToken(Deb822Token): 

269 

270 __slots__ = () 

271 

272 @property 

273 def is_comment(self): 

274 # type: () -> bool 

275 return True 

276 

277 

278class Deb822FieldNameToken(Deb822Token): 

279 

280 __slots__ = () 

281 

282 def __init__(self, text): 

283 # type: (str) -> None 

284 if not isinstance(text, _strI):  # coverage: 284 ↛ 285 (condition on line 284 was never true)

285 text = _strI(sys.intern(text)) 

286 super().__init__(text) 

287 

288 @property 

289 def text(self): 

290 # type: () -> _strI 

291 return cast("_strI", self._text) 

292 

293 

294# The colon after the field name, parentheses, etc. 

295class Deb822SeparatorToken(Deb822Token): 

296 

297 __slots__ = () 

298 

299 

300class Deb822FieldSeparatorToken(Deb822Token): 

301 

302 __slots__ = () 

303 

304 def __init__(self): 

305 # type: () -> None 

306 super().__init__(":") 

307 

308 

309class Deb822CommaToken(Deb822SeparatorToken): 

310 """Used by the comma-separated list value parsers to denote a comma between two value tokens.""" 

311 

312 __slots__ = () 

313 

314 def __init__(self): 

315 # type: () -> None 

316 super().__init__(",") 

317 

318 

319class Deb822PipeToken(Deb822SeparatorToken): 

320 """Used in some dependency fields as OR relation""" 

321 

322 __slots__ = () 

323 

324 def __init__(self): 

325 # type: () -> None 

326 super().__init__("|") 

327 

328 

329class Deb822ValueToken(Deb822Token): 

330 """A field value can be split into multi "Deb822ValueToken"s (as well as separator tokens)""" 

331 

332 __slots__ = () 

333 

334 

335class Deb822ValueDependencyToken(Deb822Token): 

336 """Package name, architecture name, a version number, or a profile name in a dependency field""" 

337 

338 __slots__ = () 

339 

340 

341class Deb822ValueDependencyVersionRelationOperatorToken(Deb822Token): 

342 

343 __slots__ = () 

344 

345 

346def tokenize_deb822_file(sequence, encoding="utf-8"): 

347 # type: (Iterable[Union[str, bytes]], str) -> Iterable[Deb822Token] 

348 """Tokenize a deb822 file 

349 

350 :param sequence: An iterable of lines (a file open for reading will do) 

351 :param encoding: The encoding to use (this is here to support Deb822-like 

352 APIs; new code should not use this parameter). 

353 """ 

354 current_field_name = None 

355 field_name_cache = {} # type: Dict[str, _strI] 

356 

357 def _normalize_input(s): 

358 # type: (Iterable[Union[str, bytes]]) -> Iterable[str] 

359 for x in s: 

360 if isinstance(x, bytes):  # coverage: 360 ↛ 361 (condition on line 360 was never true)

361 x = x.decode(encoding) 

362 if not x.endswith("\n"): 

363 # We always end on a newline because it makes a lot of code simpler. The pain 

364 # points relate to mutations that add content after the last field. Sadly, these 

365 # mutations can happen via adding fields, reordering fields, etc. and are too hard 

366 # to track to make it worth supporting the special case of a missing 

367 # newline at the end of the file. 

368 x += "\n" 

369 yield x 

370 

371 text_stream = BufferingIterator( 

372 _normalize_input(sequence) 

373 ) # type: BufferingIterator[str] 

374 

375 for line in text_stream: 

376 if line.isspace(): 

377 if current_field_name: 

378 # Blank lines terminate fields 

379 current_field_name = None 

380 

381 # If there are multiple whitespace-only lines, we combine them 

382 # into one token. 

383 r = list(text_stream.takewhile(str.isspace)) 

384 if r:  # coverage: 384 ↛ 385 (condition on line 384 was never true)

385 line += "".join(r) 

386 

387 # whitespace tokens are likely to have duplicate cases (like 

388 # single newline tokens), so we intern the strings there. 

389 yield Deb822WhitespaceToken(sys.intern(line)) 

390 continue 

391 

392 if line[0] == "#": 

393 yield Deb822CommentToken(line) 

394 continue 

395 

396 if line[0] in (" ", "\t"): 

397 if current_field_name is not None:  # coverage: 397 ↛ 407 (condition on line 397 was never false)

398 # We emit a separate whitespace token for the newline as it makes some 

399 # things easier later (see _build_value_line) 

400 leading = sys.intern(line[0]) 

401 # Pull out the leading space and newline 

402 line = line[1:-1] 

403 yield Deb822ValueContinuationToken(leading) 

404 yield Deb822ValueToken(line) 

405 yield Deb822NewlineAfterValueToken() 

406 else: 

407 yield Deb822ErrorToken(line) 

408 continue 

409 

410 field_line_match = _RE_FIELD_LINE.match(line) 

411 if field_line_match:  # coverage: 411 ↛ 455 (condition on line 411 was never false)

412 # The line is a field, which means there is a bit to unpack 

413 # - note that by definition, leading and trailing whitespace is insignificant 

414 # on the value part directly after the field separator 

415 (field_name, _, space_before, value, space_after) = ( 

416 field_line_match.groups() 

417 ) 

418 

419 current_field_name = field_name_cache.get(field_name) 

420 

421 if value is None or value == "":  # coverage: 421 ↛ 424 (line 421 didn't jump to line 424)

422 # If there is no value, then merge the two space elements into space_after 

423 # as it makes it easier to handle the newline. 

424 space_after = ( 

425 space_before + space_after if space_after else space_before 

426 ) 

427 space_before = "" 

428 

429 if space_after:  # coverage: 429 ↛ 435 (condition on line 429 was never false)

430 # We emit a separate whitespace token for the newline as it makes some 

431 # things easier later (see _build_value_line) 

432 if space_after.endswith("\n"):  # coverage: 432 ↛ 435 (condition on line 432 was never false)

433 space_after = space_after[:-1] 

434 

435 if current_field_name is None:  # coverage: 435 ↛ 443 (condition on line 435 was never false)

436 field_name = sys.intern(field_name) 

437 current_field_name = _strI(field_name) 

438 field_name_cache[field_name] = current_field_name 

439 

440 # We use current_field_name from here as it is a _strI. 

441 # Delete field_name to avoid accidentally using it and getting bugs 

442 # that should not happen. 

443 del field_name 

444 

445 yield Deb822FieldNameToken(current_field_name) 

446 yield Deb822FieldSeparatorToken() 

447 if space_before: 

448 yield Deb822WhitespaceToken(sys.intern(space_before)) 

449 if value:  # coverage: 449 ↛ 451 (condition on line 449 was never false)

450 yield Deb822ValueToken(value) 

451 if space_after: 

452 yield Deb822WhitespaceToken(sys.intern(space_after)) 

453 yield Deb822NewlineAfterValueToken() 

454 else: 

455 yield Deb822ErrorToken(line) 

456 
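# Illustrative sketch (not part of the original module): tokenization is lossless,
# so joining the token text reproduces the input bit-for-bit (cf. the Deb822Token
# docstring above).
def _example_tokenize_round_trip():  # pragma: no cover - illustration only
    lines = ["Package: debputy\n", "Architecture: all amd64\n"]
    tokens = list(tokenize_deb822_file(lines))
    assert "".join(t.convert_to_text() for t in tokens) == "".join(lines)
    assert isinstance(tokens[0], Deb822FieldNameToken) and tokens[0].text == "Package"
    assert isinstance(tokens[1], Deb822FieldSeparatorToken)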

457 

458def _value_line_tokenizer(func): 

459 # type: (Callable[[str], Iterable[Deb822Token]]) -> (Callable[[str], Iterable[Deb822Token]]) 

460 def impl(v): 

461 # type: (str) -> Iterable[Deb822Token] 

462 first_line = True 

463 for no, line in enumerate(v.splitlines(keepends=True)): 

464 assert not v.isspace() or no == 0 

465 if line.startswith("#"): 

466 yield Deb822CommentToken(line) 

467 continue 

468 has_newline = False 

469 continuation_line_marker = None 

470 if not first_line: 

471 continuation_line_marker = line[0] 

472 line = line[1:] 

473 first_line = False 

474 if line.endswith("\n"): 

475 has_newline = True 

476 line = line[:-1] 

477 if continuation_line_marker is not None: 

478 yield Deb822ValueContinuationToken(sys.intern(continuation_line_marker)) 

479 yield from func(line) 

480 if has_newline: 

481 yield Deb822NewlineAfterValueToken() 

482 

483 return impl 

484 
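# Illustrative sketch (not part of the original module): the decorator hands the
# wrapped function one line at a time with the continuation marker and trailing
# newline already stripped, and re-emits those (plus comment lines) as tokens.
@_value_line_tokenizer
def _example_verbatim_tokenizer(line):  # pragma: no cover - illustration only
    # type: (str) -> Iterable[Deb822Token]
    if line:
        yield Deb822ValueToken(line)
# _example_verbatim_tokenizer("foo\n bar\n") would yield a value token "foo", a
# newline token, a continuation token " ", a value token "bar", and a newline token.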

485 

486@_value_line_tokenizer 

487def whitespace_split_tokenizer(v): 

488 # type: (str) -> Iterable[Deb822Token] 

489 assert "\n" not in v 

490 for match in _RE_WHITESPACE_SEPARATED_WORD_LIST.finditer(v): 

491 space_before, word, space_after = match.groups() 

492 if space_before: 

493 yield Deb822SpaceSeparatorToken(sys.intern(space_before)) 

494 yield Deb822ValueToken(word) 

495 if space_after: 

496 yield Deb822SpaceSeparatorToken(sys.intern(space_after)) 

497 

498 

499@_value_line_tokenizer 

500def comma_split_tokenizer(v): 

501 # type: (str) -> Iterable[Deb822Token] 

502 assert "\n" not in v 

503 for match in _RE_COMMA_SEPARATED_WORD_LIST.finditer(v): 

504 space_before_comma, comma, space_before_word, word, space_after_word = ( 

505 match.groups() 

506 ) 

507 if space_before_comma: 

508 yield Deb822WhitespaceToken(sys.intern(space_before_comma)) 

509 if comma: 

510 yield Deb822CommaToken() 

511 if space_before_word: 

512 yield Deb822WhitespaceToken(sys.intern(space_before_word)) 

513 if word: 

514 yield Deb822ValueToken(word) 

515 if space_after_word: 

516 yield Deb822WhitespaceToken(sys.intern(space_after_word))
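# Illustrative sketch (not part of the original module): commas become
# Deb822CommaToken and the whitespace around each entry is preserved as tokens.
def _example_comma_split():  # pragma: no cover - illustration only
    value = "foo, bar\n"
    texts = [t.text for t in comma_split_tokenizer(value)]
    assert texts == ["foo", ",", " ", "bar", "\n"]
    assert "".join(texts) == value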