Coverage for src/debputy/lsp/vendoring/_deb822_repro/tokens.py: 83%

230 statements  

coverage.py v7.2.7, created at 2024-04-07 12:14 +0200

1import re 

2import sys 

3import weakref 

4from weakref import ReferenceType 

5 

6from ._util import BufferingIterator 

7from .locatable import ( 

8 Locatable, 

9 START_POSITION, 

10 Range, 

11 ONE_CHAR_RANGE, 

12 ONE_LINE_RANGE, 

13 Position, 

14) 

15from debian._util import resolve_ref, _strI 

16 

17try: 

18 from typing import Optional, cast, TYPE_CHECKING, Iterable, Union, Dict, Callable 

19except ImportError: 

20 # pylint: disable=unnecessary-lambda-assignment 

21 TYPE_CHECKING = False 

22 cast = lambda t, v: v 

23 

24if TYPE_CHECKING: 

25 from .parsing import Deb822Element 

26 

27 

28# Consume whitespace and a single word. 

29_RE_WHITESPACE_SEPARATED_WORD_LIST = re.compile( 

30 r""" 

31 (?P<space_before>\s*) # Consume any whitespace before the word 

32 # The space only occurs in practice if the line starts 

33 # with space. 

34 

35 # Optionally consume a word (needed to handle the case 

36 # when there are no words left and someone applies this 

37 # pattern to the remaining text). This is mostly here as 

38 # a fail-safe. 

39 

40 (?P<word>\S+) # Consume the word (if present) 

41 (?P<trailing_whitespace>\s*) # Consume trailing whitespace 

42""", 

43 re.VERBOSE, 

44) 
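# Illustrative sketch (not part of the original module): how the pattern above
# behaves with finditer().  Each match captures one word plus the exact
# surrounding whitespace, which is what lets the tokenizer round-trip the input.
def _example_whitespace_word_list():  # pragma: no cover - illustration only
    parts = [
        (m.group("space_before"), m.group("word"), m.group("trailing_whitespace"))
        for m in _RE_WHITESPACE_SEPARATED_WORD_LIST.finditer(" amd64  i386")
    ]
    assert parts == [(" ", "amd64", "  "), ("", "i386", "")]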

45_RE_COMMA_SEPARATED_WORD_LIST = re.compile( 

46 r""" 

47 # This regex is slightly complicated by the fact that it should work with 

48 # finditer and consume the entire value. 

49 # 

50 # To do this, we structure the regex so it always starts on a comma (except 

51 # for the first iteration, where we permit the absence of a comma) 

52 

53 (?: # Optional space followed by a mandatory comma unless 

54 # it is the start of the "line" (in which case, we 

55 # allow the comma to be omitted) 

56 ^ 

57 | 

58 (?: 

59 (?P<space_before_comma>\s*) # This space only occurs in practice if the line 

60 # starts with space + comma. 

61 (?P<comma> ,) 

62 ) 

63 ) 

64 

65 # From here it is "optional space, maybe a word and then optional space" again. One reason why 

66 # all of it is optional is to gracefully cope with trailing commas. 

67 (?P<space_before_word>\s*) 

68 (?P<word> [^,\s] (?: [^,]*[^,\s])? )? # "Words" can contain spaces in a comma-separated list, 

69 # but the surrounding whitespace is ignored. 

70 (?P<space_after_word>\s*) 

71""", 

72 re.VERBOSE, 

73) 
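# Illustrative sketch (not part of the original module): with finditer(), the
# first match takes the "^" branch (no comma) and every later match starts at a
# comma; words in a comma-separated list may contain internal spaces.
def _example_comma_word_list():  # pragma: no cover - illustration only
    groups = [m.groups() for m in _RE_COMMA_SEPARATED_WORD_LIST.finditer("foo, bar baz")]
    assert groups == [
        (None, None, "", "foo", ""),    # "^" branch; the comma groups did not participate
        ("", ",", " ", "bar baz", ""),  # comma branch; the word keeps its internal space
    ]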

74 

75# From Policy 5.1: 

76# 

77# The field name is composed of US-ASCII characters excluding control 

78# characters, space, and colon (i.e., characters in the ranges U+0021 

79# (!) through U+0039 (9), and U+003B (;) through U+007E (~), 

80# inclusive). Field names must not begin with the comment character 

81# (U+0023 #), nor with the hyphen character (U+002D -). 

82# 

83# That combines to this regex of questionable readability 

84_RE_FIELD_LINE = re.compile( 

85 r""" 

86 ^ # Start of line 

87 (?P<field_name> # Capture group for the field name 

88 [\x21\x22\x24-\x2C\x2F-\x39\x3B-\x7F] # First character 

89 [\x21-\x39\x3B-\x7F]* # Subsequent characters (if any) 

90 ) 

91 (?P<separator> : ) 

92 (?P<space_before_value> \s* ) 

93 (?: # Field values are not mandatory on the same line 

94 # as the field name. 

95 

96 (?P<value> \S(?:.*\S)? ) # Values must start and end on a "non-space" 

97 (?P<space_after_value> \s* ) # We can have optional space after the value 

98 )? 

99""", 

100 re.VERBOSE, 

101) 
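# Illustrative sketch (not part of the original module): a well-formed field line
# is unpacked into five groups: field name, the ":" separator, optional leading
# whitespace, the value, and trailing whitespace (including the newline).
def _example_field_line():  # pragma: no cover - illustration only
    m = _RE_FIELD_LINE.match("Package: debputy\n")
    assert m is not None
    assert m.groups() == ("Package", ":", " ", "debputy", "\n")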

102 

103 

104class Deb822Token(Locatable): 

105 """A token is an atomic syntactical element from a deb822 file 

106 

107 A file is parsed into a series of tokens. If these tokens are converted to 

108 text in exactly the same order, you get exactly the same file - bit-for-bit. 

109 Accordingly, every bit of text in a file must be assigned to exactly one 

110 Deb822Token. 

111 """ 

112 

113 __slots__ = ("_text", "_parent_element", "_token_size", "__weakref__") 

114 

115 def __init__(self, text): 

116 # type: (str) -> None 

117 if text == "": # pragma: no cover 

118 raise ValueError("Tokens must have content") 

119 self._text = text # type: str 

120 self._parent_element = None # type: Optional[ReferenceType['Deb822Element']] 

121 self._token_size = None # type: Optional[Range] 

122 self._verify_token_text() 

123 

124 def __repr__(self): 

125 # type: () -> str 

126 return "{clsname}('{text}')".format( 

127 clsname=self.__class__.__name__, text=self._text.replace("\n", "\\n") 

128 ) 

129 

130 def _verify_token_text(self): 

131 # type: () -> None 

132 if "\n" in self._text: 

133 is_single_line_token = False 

134 if self.is_comment or self.is_error: 

135 is_single_line_token = True 

136 if not is_single_line_token and not self.is_whitespace:  # coverage: 136 ↛ 137 (condition on line 136 was never true)

137 raise ValueError( 

138 "Only whitespace, error and comment tokens may contain newlines" 

139 ) 

140 if not self.text.endswith("\n"):  # coverage: 140 ↛ 141 (condition on line 140 was never true)

141 raise ValueError("Tokens containing whitespace must end on a newline") 

142 if is_single_line_token and "\n" in self.text[:-1]:  # coverage: 142 ↛ 143 (condition on line 142 was never true)

143 raise ValueError( 

144 "Comments and error tokens must not contain embedded newlines" 

145 " (only end on one)" 

146 ) 

147 

148 @property 

149 def is_whitespace(self): 

150 # type: () -> bool 

151 return False 

152 

153 @property 

154 def is_comment(self): 

155 # type: () -> bool 

156 return False 

157 

158 @property 

159 def is_error(self): 

160 # type: () -> bool 

161 return False 

162 

163 @property 

164 def text(self): 

165 # type: () -> str 

166 return self._text 

167 

168 # To support callers that want a simple interface for converting tokens and elements to text 

169 def convert_to_text(self): 

170 # type: () -> str 

171 return self._text 

172 

173 def size(self, *, skip_leading_comments: bool = False) -> Range: 

174 # Tokens are atomic units, so the size is computed directly from the text and cached. 

175 token_size = self._token_size 

176 if token_size is not None: 

177 return token_size 

178 token_len = len(self._text) 

179 if token_len == 1: 

180 # Single-character tokens reuse the shared ONE_CHAR_RANGE / ONE_LINE_RANGE 

181 # constants instead of allocating a new Range for each token. 

182 token_size = ONE_CHAR_RANGE if self._text != "\n" else ONE_LINE_RANGE 

183 else: 

184 new_lines = self._text.count("\n") 

185 assert not new_lines or self._text[-1] == "\n" 

186 end_pos = Position(new_lines, 0) if new_lines else Position(0, token_len) 

187 token_size = Range(START_POSITION, end_pos) 

188 self._token_size = token_size 

189 return token_size 

190 

191 @property 

192 def parent_element(self): 

193 # type: () -> Optional[Deb822Element] 

194 return resolve_ref(self._parent_element) 

195 

196 @parent_element.setter 

197 def parent_element(self, new_parent): 

198 # type: (Optional[Deb822Element]) -> None 

199 self._parent_element = ( 

200 weakref.ref(new_parent) if new_parent is not None else None 

201 ) 

202 

203 def clear_parent_if_parent(self, parent): 

204 # type: (Deb822Element) -> None 

205 if parent is self.parent_element: 

206 self._parent_element = None 

207 

208 
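# Illustrative sketch (not part of the original module): what size() reports for a
# couple of tokens.  The equality check assumes Range and Position compare by value
# (NamedTuple-style), as they do in python-debian's locatable module.
def _example_token_size():  # pragma: no cover - illustration only
    assert Deb822Token(":").size() is ONE_CHAR_RANGE  # single character, not a newline
    assert Deb822Token("amd64").size() == Range(START_POSITION, Position(0, 5))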

209class Deb822WhitespaceToken(Deb822Token): 

210 """The token is a kind of whitespace. 

211 

212 Some whitespace tokens are critical for the format (such as the Deb822ValueContinuationToken, 

213 spaces that separate words in a space-separated list, or newlines), while other whitespace 

214 tokens are truly insignificant (space before a newline, space after a comma in a comma-separated 

215 list, etc.). 

216 """ 

217 

218 __slots__ = () 

219 

220 @property 

221 def is_whitespace(self): 

222 # type: () -> bool 

223 return True 

224 

225 

226class Deb822SemanticallySignificantWhiteSpace(Deb822WhitespaceToken): 

227 """Whitespace that (if removed) would change the meaning of the file (or cause syntax errors)""" 

228 

229 __slots__ = () 

230 

231 

232class Deb822NewlineAfterValueToken(Deb822SemanticallySignificantWhiteSpace): 

233 """The newline after a value token. 

234 

235 If not followed by a continuation token, this also marks the end of the field. 

236 """ 

237 

238 __slots__ = () 

239 

240 def __init__(self): 

241 # type: () -> None 

242 super().__init__("\n") 

243 

244 

245class Deb822ValueContinuationToken(Deb822SemanticallySignificantWhiteSpace): 

246 """The whitespace denoting a value spanning an additional line (the first space on a line)""" 

247 

248 __slots__ = () 

249 

250 

251class Deb822SpaceSeparatorToken(Deb822SemanticallySignificantWhiteSpace): 

252 """Whitespace between values in a space list (e.g. "Architectures")""" 

253 

254 __slots__ = () 

255 

256 

257class Deb822ErrorToken(Deb822Token): 

258 """Token that represents a syntactical error""" 

259 

260 __slots__ = () 

261 

262 @property 

263 def is_error(self): 

264 # type: () -> bool 

265 return True 

266 

267 

268class Deb822CommentToken(Deb822Token): 

269 

270 __slots__ = () 

271 

272 @property 

273 def is_comment(self): 

274 # type: () -> bool 

275 return True 

276 

277 

278class Deb822FieldNameToken(Deb822Token): 

279 

280 __slots__ = () 

281 

282 def __init__(self, text): 

283 # type: (str) -> None 

284 if not isinstance(text, _strI):  # coverage: 284 ↛ 285 (condition on line 284 was never true)

285 text = _strI(sys.intern(text)) 

286 super().__init__(text) 

287 

288 @property 

289 def text(self): 

290 # type: () -> _strI 

291 return cast("_strI", self._text) 

292 

293 

294# The colon after the field name, parentheses, etc. 

295class Deb822SeparatorToken(Deb822Token): 

296 

297 __slots__ = () 

298 

299 

300class Deb822FieldSeparatorToken(Deb822Token): 

301 

302 __slots__ = () 

303 

304 def __init__(self): 

305 # type: () -> None 

306 super().__init__(":") 

307 

308 

309class Deb822CommaToken(Deb822SeparatorToken): 

310 """Used by the comma-separated list value parsers to denote a comma between two value tokens.""" 

311 

312 __slots__ = () 

313 

314 def __init__(self): 

315 # type: () -> None 

316 super().__init__(",") 

317 

318 

319class Deb822PipeToken(Deb822SeparatorToken): 

320 """Used in some dependency fields as OR relation""" 

321 

322 __slots__ = () 

323 

324 def __init__(self): 

325 # type: () -> None 

326 super().__init__("|") 

327 

328 

329class Deb822ValueToken(Deb822Token): 

330 """A field value can be split into multi "Deb822ValueToken"s (as well as separator tokens)""" 

331 

332 __slots__ = () 

333 

334 

335class Deb822ValueDependencyToken(Deb822Token): 

336 """Package name, architecture name, a version number, or a profile name in a dependency field""" 

337 

338 __slots__ = () 

339 

340 

341class Deb822ValueDependencyVersionRelationOperatorToken(Deb822Token): 

342 

343 __slots__ = () 

344 

345 

346def tokenize_deb822_file(sequence, encoding="utf-8"): 

347 # type: (Iterable[Union[str, bytes]], str) -> Iterable[Deb822Token] 

348 """Tokenize a deb822 file 

349 

350 :param sequence: An iterable of lines (a file open for reading will do) 

351 :param encoding: The encoding to use (this is here to support Deb822-like 

352 APIs; new code should not use this parameter). 

353 """ 

354 current_field_name = None 

355 field_name_cache = {} # type: Dict[str, _strI] 

356 

357 def _normalize_input(s): 

358 # type: (Iterable[Union[str, bytes]]) -> Iterable[str] 

359 for x in s: 

360 if isinstance(x, bytes):  # coverage: 360 ↛ 361 (condition on line 360 was never true)

361 x = x.decode(encoding) 

362 if not x.endswith("\n"): 

363 # We always end on a newline because it makes a lot of code simpler. The pain 

364 # points relate to mutations that add content after the last field. Sadly, these 

365 # mutations can happen via adding fields, reordering fields, etc. and are too hard 

366 # to track to make it worth supporting the special case of a missing 

367 # newline at the end of the file. 

368 x += "\n" 

369 yield x 

370 

371 text_stream = BufferingIterator( 

372 _normalize_input(sequence) 

373 ) # type: BufferingIterator[str] 

374 

375 for line in text_stream: 

376 if line.isspace(): 

377 if current_field_name: 

378 # Blank lines terminate fields 

379 current_field_name = None 

380 

381 # If there are multiple whitespace-only lines, we combine them 

382 # into one token. 

383 r = list(text_stream.takewhile(str.isspace)) 

384 if r:  # coverage: 384 ↛ 385 (condition on line 384 was never true)

385 line += "".join(r) 

386 

387 # whitespace tokens are likely to have duplicate cases (like 

388 # single newline tokens), so we intern the strings there. 

389 yield Deb822WhitespaceToken(sys.intern(line)) 

390 continue 

391 

392 if line[0] == "#": 

393 yield Deb822CommentToken(line) 

394 continue 

395 

396 if line[0] in (" ", "\t"): 

397 if current_field_name is not None:  # coverage: 397 ↛ 407 (condition on line 397 was never false)

398 # We emit a separate whitespace token for the newline as it makes some 

399 # things easier later (see _build_value_line) 

400 leading = sys.intern(line[0]) 

401 # Pull out the leading space and newline 

402 line = line[1:-1] 

403 yield Deb822ValueContinuationToken(leading) 

404 yield Deb822ValueToken(line) 

405 yield Deb822NewlineAfterValueToken() 

406 else: 

407 yield Deb822ErrorToken(line) 

408 continue 

409 

410 field_line_match = _RE_FIELD_LINE.match(line) 

411 if field_line_match:  # coverage: 411 ↛ 455 (condition on line 411 was never false)

412 # The line is a field, which means there is a bit to unpack 

413 # - note that by definition, leading and trailing whitespace is insignificant 

414 # on the value part directly after the field separator 

415 (field_name, _, space_before, value, space_after) = ( 

416 field_line_match.groups() 

417 ) 

418 

419 current_field_name = field_name_cache.get(field_name) 

420 

421 if value is None or value == "":  # coverage: 421 ↛ 424 (line 421 didn't jump to line 424)

422 # If there is no value, then merge the two space elements into space_after 

423 # as it makes it easier to handle the newline. 

424 space_after = ( 

425 space_before + space_after if space_after else space_before 

426 ) 

427 space_before = "" 

428 

429 if space_after:  # coverage: 429 ↛ 435 (condition on line 429 was never false)

430 # We emit a separate whitespace token for the newline as it makes some 

431 # things easier later (see _build_value_line) 

432 if space_after.endswith("\n"):  # coverage: 432 ↛ 435 (condition on line 432 was never false)

433 space_after = space_after[:-1] 

434 

435 if current_field_name is None:  # coverage: 435 ↛ 443 (condition on line 435 was never false)

436 field_name = sys.intern(field_name) 

437 current_field_name = _strI(field_name) 

438 field_name_cache[field_name] = current_field_name 

439 

440 # We use current_field_name from here as it is a _strI. 

441 # Delete field_name to avoid accidentally using it and getting bugs 

442 # that should not happen. 

443 del field_name 

444 

445 yield Deb822FieldNameToken(current_field_name) 

446 yield Deb822FieldSeparatorToken() 

447 if space_before: 

448 yield Deb822WhitespaceToken(sys.intern(space_before)) 

449 if value:  # coverage: 449 ↛ 451 (condition on line 449 was never false)

450 yield Deb822ValueToken(value) 

451 if space_after: 

452 yield Deb822WhitespaceToken(sys.intern(space_after)) 

453 yield Deb822NewlineAfterValueToken() 

454 else: 

455 yield Deb822ErrorToken(line) 

456 
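# Illustrative sketch (not part of the original module): tokenization is lossless,
# so joining the token text reproduces the input bit-for-bit (cf. the Deb822Token
# docstring above).
def _example_tokenize_round_trip():  # pragma: no cover - illustration only
    lines = ["Package: debputy\n", "Architecture: all amd64\n"]
    tokens = list(tokenize_deb822_file(lines))
    assert "".join(t.convert_to_text() for t in tokens) == "".join(lines)
    assert isinstance(tokens[0], Deb822FieldNameToken) and tokens[0].text == "Package"
    assert isinstance(tokens[1], Deb822FieldSeparatorToken)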

457 

458def _value_line_tokenizer(func): 

459 # type: (Callable[[str], Iterable[Deb822Token]]) -> (Callable[[str], Iterable[Deb822Token]]) 

460 def impl(v): 

461 # type: (str) -> Iterable[Deb822Token] 

462 first_line = True 

463 for no, line in enumerate(v.splitlines(keepends=True)): 

464 assert not v.isspace() or no == 0 

465 if line.startswith("#"): 

466 yield Deb822CommentToken(line) 

467 continue 

468 has_newline = False 

469 continuation_line_marker = None 

470 if not first_line: 

471 continuation_line_marker = line[0] 

472 line = line[1:] 

473 first_line = False 

474 if line.endswith("\n"): 

475 has_newline = True 

476 line = line[:-1] 

477 if continuation_line_marker is not None: 

478 yield Deb822ValueContinuationToken(sys.intern(continuation_line_marker)) 

479 yield from func(line) 

480 if has_newline: 

481 yield Deb822NewlineAfterValueToken() 

482 

483 return impl 

484 
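# Illustrative sketch (not part of the original module): the decorator hands the
# wrapped function one line at a time with the continuation marker and trailing
# newline already stripped, and re-emits those (plus comment lines) as tokens.
@_value_line_tokenizer
def _example_verbatim_tokenizer(line):  # pragma: no cover - illustration only
    # type: (str) -> Iterable[Deb822Token]
    if line:
        yield Deb822ValueToken(line)
# _example_verbatim_tokenizer("foo\n bar\n") would yield a value token "foo", a
# newline token, a continuation token " ", a value token "bar", and a newline token.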

485 

486@_value_line_tokenizer 

487def whitespace_split_tokenizer(v): 

488 # type: (str) -> Iterable[Deb822Token] 

489 assert "\n" not in v 

490 for match in _RE_WHITESPACE_SEPARATED_WORD_LIST.finditer(v): 

491 space_before, word, space_after = match.groups() 

492 if space_before: 

493 yield Deb822SpaceSeparatorToken(sys.intern(space_before)) 

494 yield Deb822ValueToken(word) 

495 if space_after: 

496 yield Deb822SpaceSeparatorToken(sys.intern(space_after)) 

497 

498 

499@_value_line_tokenizer 

500def comma_split_tokenizer(v): 

501 # type: (str) -> Iterable[Deb822Token] 

502 assert "\n" not in v 

503 for match in _RE_COMMA_SEPARATED_WORD_LIST.finditer(v): 

504 space_before_comma, comma, space_before_word, word, space_after_word = ( 

505 match.groups() 

506 ) 

507 if space_before_comma: 

508 yield Deb822WhitespaceToken(sys.intern(space_before_comma)) 

509 if comma: 

510 yield Deb822CommaToken() 

511 if space_before_word: 

512 yield Deb822WhitespaceToken(sys.intern(space_before_word)) 

513 if word: 

514 yield Deb822ValueToken(word) 

515 if space_after_word: 

516 yield Deb822WhitespaceToken(sys.intern(space_after_word))
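# Illustrative sketch (not part of the original module): commas become
# Deb822CommaToken and the whitespace around each entry is preserved as tokens.
def _example_comma_split():  # pragma: no cover - illustration only
    value = "foo, bar\n"
    texts = [t.text for t in comma_split_tokenizer(value)]
    assert texts == ["foo", ",", " ", "bar", "\n"]
    assert "".join(texts) == value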