Coverage for src/debputy/lsp/vendoring/_deb822_repro/formatter.py: 80%

128 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2024-04-07 12:14 +0200

1import operator 

2 

3from ._util import BufferingIterator 

4from .tokens import Deb822Token 

5 

# Treat the following as opaque, enum-like markers for the kind of content a
# formatter token carries. The concrete strings were chosen to keep repr()
# simple to implement; they are subject to change.
_CONTENT_TYPE_VALUE, _CONTENT_TYPE_COMMENT, _CONTENT_TYPE_SEPARATOR = (
    "is_value",
    "is_comment",
    "is_separator",
)

11 

12try: 

13 from typing import Iterator, Union, Literal 

14 from .types import TokenOrElement, FormatterCallback 

15except ImportError: 

16 pass 

17 

18 

class FormatterContentToken(object):
    """Typed, tagged text for use with the formatting API

    The FormatterContentToken is used by the formatting API and provides the
    formatter callback with context about the textual tokens it is supposed
    to format.
    """

    __slots__ = ("_text", "_content_type")

    def __init__(self, text, content_type):
        # type: (str, object) -> None
        self._text = text
        self._content_type = content_type

    @classmethod
    def from_token_or_element(cls, token_or_element):
        # type: (TokenOrElement) -> FormatterContentToken
        """Wrap a parsed token or element in a formatter token

        Comment tokens become comment formatter tokens and all other
        (non-whitespace) tokens become value formatter tokens. Anything that
        is not a Deb822Token is converted via its convert_to_text() method
        and treated as a value.

        :raises ValueError: If given a whitespace token.
        """
        if isinstance(token_or_element, Deb822Token):
            if token_or_element.is_comment:
                return cls.comment_token(token_or_element.text)
            if token_or_element.is_whitespace:
                raise ValueError("FormatterContentType cannot be whitespace")
            return cls.value_token(token_or_element.text)
        # Elements are assumed to be content (this is specialized for the
        # interpretations where comments are always tokens).
        return cls.value_token(token_or_element.convert_to_text())

    @classmethod
    def separator_token(cls, text):
        # type: (str) -> FormatterContentToken
        """Provide a separator token with the provided text

        The single space and the comma separators are served from shared
        module-level instances; any other text gets a new token.
        """
        # Special-case separators as a minor memory optimization
        if text == " ":
            return SPACE_SEPARATOR_FT
        if text == ",":
            return COMMA_SEPARATOR_FT
        return cls(text, _CONTENT_TYPE_SEPARATOR)

    @classmethod
    def comment_token(cls, text):
        # type: (str) -> FormatterContentToken
        """Generates a single comment token with the provided text

        Mostly useful for creating test cases
        """
        return cls(text, _CONTENT_TYPE_COMMENT)

    @classmethod
    def value_token(cls, text):
        # type: (str) -> FormatterContentToken
        """Generates a single value token with the provided text

        Mostly useful for creating test cases
        """
        return cls(text, _CONTENT_TYPE_VALUE)

    @property
    def is_comment(self):
        # type: () -> bool
        """True if this formatter token represents a comment

        This should be used for determining whether the token is a comment
        or not. It might be tempting to check whether the text in the token
        starts with a "#" but that is insufficient because a value *can*
        start with that as well. Whether it is a comment or a value is
        based on the context (it is a comment if and only if the "#" was
        at the start of a line) but the formatter often does not have the
        context available to assert this.

        The formatter *should* preserve the order of comments and interleave
        them between the value tokens in the same order as it sees them.
        Failing to preserve the order of comments and values can cause
        confusing comments (such as associating the comment with a different
        value than it was written for).

        The formatter *may* discard comment tokens if it does not want to
        preserve them. If so, they would be omitted in the output, which
        may be acceptable in some cases. This is a lot better than
        re-ordering comments.

        Formatters must be aware of the following special cases for comments:
         * Comments *MUST* be emitted after a newline. If the very first token
           is a comment, the formatter is expected to emit a newline before it
           as well (Fields cannot start immediately on a comment).
        """
        # Compare by equality rather than identity: the tag is a plain str,
        # so an equal-but-distinct object must classify the same way.
        return self._content_type == _CONTENT_TYPE_COMMENT

    @property
    def is_value(self):
        # type: () -> bool
        """True if this formatter token represents a semantic value

        The formatter *MUST* preserve values as-is in its output. It may
        "unpack" it from the token (as in, return it as a part of a plain
        str) but the value content must not be changed nor re-ordered relative
        to other value tokens (as that could change the meaning of the field).
        """
        return self._content_type == _CONTENT_TYPE_VALUE

    @property
    def is_separator(self):
        # type: () -> bool
        """True if this formatter token represents a separator token

        The formatter is not required to preserve the provided separators but it
        is required to properly separate values. In fact, it is often a lot
        easier to discard existing separator tokens. As an example, in a
        whitespace separated list of values, space, tab and newline all count
        as separators. However, formatting-wise, there is a world of difference
        between a space, a tab and a newline. In particular, newlines must be
        followed by an additional space or tab (to act as a value continuation
        line) if there is a value following it (otherwise, the generated output
        is invalid).
        """
        return self._content_type == _CONTENT_TYPE_SEPARATOR

    @property
    def is_whitespace(self):
        # type: () -> bool
        """True if this formatter token represents a whitespace token"""
        return self._content_type == _CONTENT_TYPE_SEPARATOR and self._text.isspace()

    @property
    def text(self):
        # type: () -> str
        """The actual content of the token

        This field *must not* be used to determine the type of token. The
        formatter cannot reliably tell whether "#..." is a comment or a value
        (it can be both). Use is_value and is_comment instead for discriminating
        token types.

        For value tokens, this is the concrete value to be emitted.

        For comment tokens, this is the full comment text.

        This is the same as str(token).
        """
        return self._text

    def __str__(self):
        # type: () -> str
        return self._text

    def __repr__(self):
        # type: () -> str
        # The content type tags are spelled like the matching property name,
        # which makes the output read as e.g. "...('foo', is_value=True)".
        return "{}({!r}, {}=True)".format(
            self.__class__.__name__, self._text, self._content_type
        )

168 

169 

# Shared, pre-built separator tokens; FormatterContentToken.separator_token()
# hands these out for "," and " " instead of creating new instances.
COMMA_SEPARATOR_FT = FormatterContentToken(",", _CONTENT_TYPE_SEPARATOR)
SPACE_SEPARATOR_FT = FormatterContentToken(" ", _CONTENT_TYPE_SEPARATOR)

172 

173 

def one_value_per_line_formatter(
    indentation,  # type: Union[int, Literal["FIELD_NAME_LENGTH"]]
    trailing_separator=True,  # type: bool
    immediate_empty_line=False,  # type: bool
):
    # type: (...) -> FormatterCallback
    """Build a formatter callback that places every value on its own line

    The returned formatters handle indentation and (optionally) trailing
    separators. The one-value-per-line layout is the one commonly seen in
    the "Depends" field and similar fields of debian/control files.

    :param indentation: A positive integer or the literal string
      "FIELD_NAME_LENGTH". An integer gives a fixed indentation (1 being the
      shortest possible one). With "FIELD_NAME_LENGTH", the indentation is
      derived from the field name such that the values line up with the
      first value.
    :param trailing_separator: If True, the last value is also followed by
      the separator token (e.g., ","). Whitespace separators are never
      emitted.
    :param immediate_empty_line: If True, the value always starts on a new
      line, giving a result like "Field:\n value".
    :raises ValueError: If indentation is an integer below 1.
    """
    align_with_field_name = indentation == "FIELD_NAME_LENGTH"
    if not align_with_field_name and indentation < 1:
        raise ValueError('indentation must be at least 1 (or "FIELD_NAME_LENGTH")')

    def _formatter(
        name,  # type: str
        sep_token,  # type: FormatterContentToken
        formatter_tokens,  # type: Iterator[FormatterContentToken]
    ):
        # type: (...) -> Iterator[Union[FormatterContentToken, str]]
        # "+ 2" accounts for the ":" and the space that follow the field name.
        width = len(name) + 2 if align_with_field_name else indentation
        continuation_prefix = " " * width

        pending = BufferingIterator(formatter_tokens)
        has_value = operator.attrgetter("is_value")

        # Tracks whether we are still on the line holding the field name.
        on_first_line = not immediate_empty_line
        if immediate_empty_line:
            yield "\n"

        for tok in pending:
            if tok.is_comment:
                # Comments must start a line; break the field-name line first.
                if on_first_line:
                    yield "\n"
                yield tok
            elif tok.is_value:
                yield " " if on_first_line else continuation_prefix
                yield tok
                # Only look ahead for another value when the separator is
                # not forced by trailing_separator.
                if not sep_token.is_whitespace and (
                    trailing_separator or pending.peek_find(has_value)
                ):
                    yield sep_token
                yield "\n"
            else:
                # Existing separators (etc.) are dropped.
                continue
            on_first_line = False

    return _formatter

240 

241 

# Ready-made formatter: continuation lines are indented based on the field
# name and every value (including the last) is followed by the separator
# unless the separator is whitespace.
one_value_per_line_trailing_separator = one_value_per_line_formatter(
    indentation="FIELD_NAME_LENGTH",
    trailing_separator=True,
)

245 

246 

def format_field(
    formatter,  # type: FormatterCallback
    field_name,  # type: str
    separator_token,  # type: FormatterContentToken
    token_iter,  # type: Iterator[FormatterContentToken]
):
    # type: (...) -> str
    """Format a field using a provided formatter

    This function formats a series of tokens using the provided formatter.
    It can be used as a standalone formatter engine and can be used in test
    suites to validate third-party formatters (enabling them to test for
    corner cases without involving parsing logic).

    The formatter receives a series of FormatterContentTokens (via the
    token_iter) and is expected to yield one or more str or
    FormatterContentTokens. The calling function will combine all of
    these into a single string, which will be used as the value.

    The formatter is recommended to yield the provided value and comment
    tokens interleaved with text segments of whitespace and separators
    as part of its output. If it preserves comment and value tokens, the
    calling function can provide some runtime checks to catch bugs
    (like the formatter turning a comment into a value because it forgot
    to ensure that the comment was emitted directly after a newline
    character).

    When writing a formatter, please keep the following in mind:

     * The output of the formatter is appended directly after the ":" separator.
       Most formatters will want to emit either a space or a newline as the very
       first character for readability.
       (compare "Depends:foo\\n" to "Depends: foo\\n")

     * The formatter must always end its output on a newline. This is a design
       choice of how the round-trip safe parser represents values that is
       imposed on the formatter.

     * It is often easier to discard/ignore all separator tokens from the
       provided token sequence and instead just yield separator tokens/str
       where the formatter wants to place them.

         - The formatter is strongly recommended to special-case formatting
           for whitespace separators (check for `separator_token.is_whitespace`).

           This is because space, tab and newline all count as valid separators
           and can all appear in the token sequence. If the original field uses
           a mix of these separators it is likely to completely undermine the
           desired result. Not to mention the additional complexity of handling
           when a separator token happens to use the newline character which
           affects how the formatter is supposed to handle what comes after it
           (see the rules for comments, empty lines and continuation line
           markers).

     * The formatter must remember to emit a "continuation line" marker
       (typically a single space or tab) when emitting a value after
       a newline or a comment. A `yield " "` is sufficient.

         - The continuation line marker may be embedded inside a str
           with other whitespace (such as the newline coming before it
           or/and whitespace used for indentation purposes following
           the marker).

     * The formatter must not cause the output to contain completely
       empty/whitespace lines as these cause syntax errors. The first
       line never counts as an empty line (as it will be appended after
       the field name).

     * Tokens must be discriminated via the `token.is_value` (etc.)
       properties. Assuming that `token.text.startswith("#")` implies a
       comment and similar stunts are wrong. As an example, "#foo" is a
       perfectly valid value in some contexts.

     * Comment tokens *always* take up exactly one complete line including
       the newline character at the end of the line. They must be emitted
       directly after a newline character or another comment token.

     * Special cases that are rare but can happen:

         - Fields *can* start with comments and require a formatter provided
           newline. (Example: "Depends:\\n# Comment here\\n foo")

         - Fields *can* start on a separator or have two separators in a row.
           This is especially true for whitespace separated fields where every
           whitespace counts as a separator, but it can also happen with other
           separators (such as comma).

         - Value tokens can contain whitespace (for non-whitespace separators).
           When they do, the formatter must not attempt to change nor
           "normalize" the whitespace inside the value token as that might
           change how the value is interpreted. (If you want to normalize such
           whitespace, the formatter is at the wrong abstraction level.
           Instead, manipulate the values directly in the value interpretation
           layer)

    This function will provide *some* runtime checks of its input and the
    output from the formatter to detect some errors early and provide
    helpful diagnostics. If you use the function for testing, you are
    recommended to rely on verifying the output of the function rather than
    relying on the runtime checks (as these are subject to change).

    :param formatter: A formatter (see FormatterCallback for the type).
      Basic formatting is provided via one_value_per_line_trailing_separator
      (a formatter) or one_value_per_line_formatter (a formatter generator).
    :param field_name: The name of the field.
    :param separator_token: One of SPACE_SEPARATOR_FT and COMMA_SEPARATOR_FT
    :param token_iter: An iterable of tokens to be formatted.
    :returns: The field name, the ":" and the formatter output as one string.
    :raises ValueError: If one of the runtime checks rejects the input tokens
      or the output of the formatter (including output that does not end on
      a newline, such as the output for an empty token sequence).

    The following example shows how to define a formatter_callback along with
    a few verifications.

    >>> fmt_field_len_sep = one_value_per_line_trailing_separator
    >>> fmt_shortest = one_value_per_line_formatter(
    ...   1,
    ...   trailing_separator=False
    ... )
    >>> fmt_newline_first = one_value_per_line_formatter(
    ...   1,
    ...   trailing_separator=False,
    ...   immediate_empty_line=True
    ... )
    >>> # Omit separator tokens in the token list for simplicity (the formatter does
    >>> # not use them, and it enables us to keep the example simple by reusing the list)
    >>> tokens = [
    ...     FormatterContentToken.value_token("foo"),
    ...     FormatterContentToken.comment_token("# some comment about bar\\n"),
    ...     FormatterContentToken.value_token("bar"),
    ... ]
    >>> # Starting with fmt_field_len_sep
    >>> print(format_field(fmt_field_len_sep, "Depends", COMMA_SEPARATOR_FT, tokens), end='')
    Depends: foo,
    # some comment about bar
             bar,
    >>> print(format_field(fmt_field_len_sep, "Architecture", SPACE_SEPARATOR_FT, tokens), end='')
    Architecture: foo
    # some comment about bar
                  bar
    >>> # Control check for the special case where the field starts with a comment
    >>> print(format_field(fmt_field_len_sep, "Depends", COMMA_SEPARATOR_FT, tokens[1:]), end='')
    Depends:
    # some comment about bar
             bar,
    >>> # Also, check single line values (to ensure it ends on a newline)
    >>> print(format_field(fmt_field_len_sep, "Depends", COMMA_SEPARATOR_FT, tokens[2:]), end='')
    Depends: bar,
    >>> ### Changing format to the shortest length
    >>> print(format_field(fmt_shortest, "Depends", COMMA_SEPARATOR_FT, tokens), end='')
    Depends: foo,
    # some comment about bar
     bar
    >>> print(format_field(fmt_shortest, "Architecture", SPACE_SEPARATOR_FT, tokens), end='')
    Architecture: foo
    # some comment about bar
     bar
    >>> # Control check for the special case where the field starts with a comment
    >>> print(format_field(fmt_shortest, "Depends", COMMA_SEPARATOR_FT, tokens[1:]), end='')
    Depends:
    # some comment about bar
     bar
    >>> # Also, check single line values (to ensure it ends on a newline)
    >>> print(format_field(fmt_shortest, "Depends", COMMA_SEPARATOR_FT, tokens[2:]), end='')
    Depends: bar
    >>> ### Changing format to the newline first format
    >>> print(format_field(fmt_newline_first, "Depends", COMMA_SEPARATOR_FT, tokens), end='')
    Depends:
     foo,
    # some comment about bar
     bar
    >>> print(format_field(fmt_newline_first, "Architecture", SPACE_SEPARATOR_FT, tokens), end='')
    Architecture:
     foo
    # some comment about bar
     bar
    >>> # Control check for the special case where the field starts with a comment
    >>> print(format_field(fmt_newline_first, "Depends", COMMA_SEPARATOR_FT, tokens[1:]), end='')
    Depends:
    # some comment about bar
     bar
    >>> # Also, check single line values (to ensure it ends on a newline)
    >>> print(format_field(fmt_newline_first, "Depends", COMMA_SEPARATOR_FT, tokens[2:]), end='')
    Depends:
     bar
    """
    formatted_tokens = [field_name, ":"]
    just_after_newline = False
    last_was_value_token = False
    # The emptiness guard keeps an empty list from failing with an IndexError
    # here; it is rejected by the "must end on a newline" check instead.
    if isinstance(token_iter, list) and token_iter:
        # Stop people from using this to test known "invalid" cases.
        if token_iter[-1].is_comment:
            raise ValueError(
                "Invalid token_iter: Field values cannot end with comments"
            )
    for token in formatter(field_name, separator_token, token_iter):
        token_as_text = str(token)
        # If we are given formatter tokens, then use them to verify the output.
        if isinstance(token, FormatterContentToken):
            if token.is_comment:
                if not just_after_newline:
                    raise ValueError(
                        "Bad format: Comments must appear directly after a newline."
                    )
                # for the sake of ensuring people use proper test data.
                if not token_as_text.startswith("#"):
                    raise ValueError("Invalid Comment token: Must start with #")
                if not token_as_text.endswith("\n"):
                    raise ValueError("Invalid Comment token: Must end on a newline")
            elif token.is_value:
                # Checked first so the indexing below cannot fail.
                if not token_as_text:
                    raise ValueError("Invalid Value token: It cannot be empty")
                if token_as_text[0].isspace() or token_as_text[-1].isspace():
                    raise ValueError(
                        "Invalid Value token: It cannot start nor end on whitespace"
                    )
                if just_after_newline:
                    raise ValueError("Bad format: Missing continuation line marker")
                if last_was_value_token:
                    raise ValueError("Bad format: Formatter omitted a separator")

            last_was_value_token = token.is_value
        else:
            last_was_value_token = False

        if not token_as_text:
            # An empty fragment adds nothing to the output. Skip it so the
            # "just after newline" state carries over to the next fragment.
            continue

        if just_after_newline:
            first_char = token_as_text[0]
            if first_char in ("\r", "\n"):
                raise ValueError("Bad format: Saw completely empty line.")
            # A line must start with whitespace (continuation line marker)
            # or with "#" (comment).
            if not first_char.isspace() and not token_as_text.startswith("#"):
                raise ValueError("Bad format: Missing continuation line marker")
        formatted_tokens.append(token_as_text)
        just_after_newline = token_as_text.endswith("\n")

    formatted_text = "".join(formatted_tokens)
    if not formatted_text.endswith("\n"):
        raise ValueError("Bad format: The field value must end on a newline")
    return formatted_text