Coverage for src/debputy/lsp/spellchecking.py: 71%

1import functools

2import itertools

3import os

4import re

5import subprocess

6from typing import Iterable, FrozenSet, Tuple, Optional, List

8from debian.debian_support import Release

9from lsprotocol.types import Diagnostic, Range, Position, DiagnosticSeverity

11from debputy.lsp.quickfixes import propose_correct_text_quick_fix

12from debputy.lsp.text_util import LintCapablePositionCodec

13from debputy.util import _info, _warn

# Locations of the hunspell en_US dictionary and its affix file.  Both must
# exist for the hunspell-backed spellchecker to be enabled.
_SPELL_CHECKER_DICT = "/usr/share/hunspell/en_US.dic"
_SPELL_CHECKER_AFF = "/usr/share/hunspell/en_US.aff"
# A "word part" is a maximal run of non-whitespace characters.
_WORD_PARTS = re.compile(r"(\S+)")
# Extracts the checkable words from a word part: a run of word characters,
# optionally followed by one "-suffix" or "'suffix".  Any other symbol is
# dropped.
_PRUNE_SYMBOLS_RE = re.compile(r"(\w+(?:-\w+|'\w+)?)")
# Characters that delimit a quoted span (double quote or backtick); quoted
# spans are not spellchecked.
_FIND_QUOTE_CHAR = re.compile(r'["`]')
# Word parts that start like a path or a filename with a short extension.
# Used with `.match`, i.e. anchored at the start of the word part only.
_LOOKS_LIKE_FILENAME = re.compile(
    r"""
      [.]{0,3}/[a-z0-9]+(/[a-z0-9]+)+/*
    | [a-z0-9-_]+(/[a-z0-9]+)+/*
    | [a-z0-9_]+(/[a-z0-9_]+){2,}/*
    | (?:\S+)?[.][a-z]{1,3}

""",
    re.VERBOSE,
)
# Word parts that start like identifiers, options, versions and similar
# technical tokens.  Used with `.match`, i.e. anchored at the start of the
# word part only.
_LOOKS_LIKE_PROGRAMMING_TERM = re.compile(
    r"""
   (
     # Java identifier Camel Case
      [a-z][a-z0-9]*(?:[A-Z]{1,3}[a-z0-9]+)+
     # Type name Camel Case
    | [A-Z]{1,3}[a-z0-9]+(?:[A-Z]{1,3}[a-z0-9]+)+
     # Type name Camel Case with underscore (seen in Dh_Lib.pm among other
    | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)+
     # Perl module
    | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*(::[A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*)+
     # Probably an abbreviation
    | [A-Z]{3,}
     # Perl/Python identifiers or Jinja templates
    | [$%&@_]?[{]?[{]?[a-z][a-z0-9]*(?:_[a-z0-9]+)+(?:(?:->)?[\[{]\S+|}}?)?
     # SCREAMING_SNAKE_CASE (environment variables plus -DVAR=B or $FOO)
    | [-$%&*_]{0,2}[A-Z][A-Z0-9]*(_[A-Z0-9]+)+(?:=\S+)?
    | \#[A-Z][A-Z0-9]*(_[A-Z0-9]+)+\#
     # Subcommand names. Require at least two "-" to avoid skipping hyphenated words
    | [a-z][a-z0-9]*(-[a-z0-9]+){2,}
     # Short args
    | -[a-z0-9]+
     # Things like 32bit
    | \d{2,}-?[a-z]+
     # Source package (we do not have a package without prefix/suffix because it covers 95% of all lowercase words)
    | src:[a-z0-9][-+.a-z0-9]+
    | [a-z0-9][-+.a-z0-9]+:(?:any|native)
     # Version
    | v\d+(?:[.]\S+)?
     # chmod symbolic mode or math
    | \S*=\S+
   )
""",
    re.VERBOSE,
)
# Word parts of the form <user@host>.
_LOOKS_LIKE_EMAIL = re.compile(
    r"""
     <[^>@\s]+@[^>@\s]+>
""",
    re.VERBOSE,
)
# Shared empty result meaning "nothing to correct".
_NO_CORRECTIONS = tuple()
# Word lists, looked up next to this module, whose entries are accepted as
# correctly spelled.
_WORDLISTS = [
    "debian-wordlist.dic",
]
# Like _WORDLISTS, but each entry is additionally accepted in its "<name>'s"
# form.
_NAMELISTS = [
    "logins-and-people.dic",
]
# Candidate personal dictionaries.  A leading "${VAR}" is replaced with the
# value of that environment variable when the dictionaries are loaded.
_PERSONAL_DICTS = [
    "${HOME}/.hunspell_default",
    "${HOME}/.hunspell_en_US",
]

# Hunspell support is optional: it needs both the en_US dictionary files on
# disk and the Python "hunspell" binding.  If either is missing, the module
# still imports and _HAS_HUNSPELL records that hunspell is unavailable.
_HAS_HUNSPELL = False
if os.path.lexists(_SPELL_CHECKER_DICT) and os.path.lexists(_SPELL_CHECKER_AFF):
    try:
        from hunspell import HunSpell
    except ImportError:
        # Binding not installed; keep _HAS_HUNSPELL False.
        pass
    else:
        _HAS_HUNSPELL = True

96def _read_wordlist(

97 base_dir: str, wordlist_name: str, *, namelist: bool = False

98) -> Iterable[str]:

99 with open(os.path.join(base_dir, wordlist_name)) as fd:

100 w = [w.strip() for w in fd]

101 yield from w

102 if namelist:

103 yield from (f"{n}'s" for n in w)

104

105

def _all_debian_archs() -> Iterable[str]:
    """Return the names printed by `dpkg-architecture -L`, one per line.

    If the command is missing or exits with an error, a warning is emitted
    and an empty result is returned.
    """
    command = ["dpkg-architecture", "-L"]
    try:
        raw_listing = subprocess.check_output(command)
    except (FileNotFoundError, subprocess.CalledProcessError) as e:
        _warn(f"dpkg-architecture -L failed: {e}")
        return ()

    listing = raw_listing.decode("utf-8")
    return map(str.strip, listing.splitlines())

114

115

@functools.lru_cache
def _builtin_exception_words() -> FrozenSet[str]:
    """Collect the words that are always accepted as correctly spelled.

    The result combines the entries of `_WORDLISTS`, the entries of
    `_NAMELISTS` (including their possessive forms), the names in
    `Release.releases` and the output of `_all_debian_archs()`.  The word
    lists are read from the directory containing this module.  The result
    is computed once and cached.
    """
    data_dir = os.path.dirname(__file__)
    release_names = Release.releases
    # Queried before any word list is read.
    arch_names = _all_debian_archs()

    accepted = set()
    for wordlist in _WORDLISTS:
        accepted.update(_read_wordlist(data_dir, wordlist))
    for namelist in _NAMELISTS:
        accepted.update(_read_wordlist(data_dir, namelist, namelist=True))
    accepted.update(release_names)
    accepted.update(arch_names)
    return frozenset(accepted)

132

133

# Process-wide spellchecker instance.  None until default_spellchecker()
# creates it or disable_spellchecking() installs the do-nothing checker.
_DEFAULT_SPELL_CHECKER: Optional["Spellchecker"] = None

135

136

def spellcheck_line(
    lines: List[str],
    position_codec: LintCapablePositionCodec,
    line_no: int,
    line: str,
) -> Iterable[Diagnostic]:
    """Yield a hint-level diagnostic for each word the spellchecker would correct.

    :param lines: Passed to `position_codec` for the range conversion
      (presumably the lines of the whole document; confirm against callers).
    :param position_codec: Converts the server-side word range into client
      units.
    :param line_no: Line number used for both ends of each reported range.
    :param line: The text to spellcheck.
    :return: One `Diagnostic` per word for which the default spellchecker
      offers corrections; each carries a quick fix per correction in `data`.
    """
    checker = default_spellchecker()
    for word, start, end in checker.iter_words(line):
        suggestions = checker.provide_corrections_for(word)
        if suggestions:
            server_range = Range(Position(line_no, start), Position(line_no, end))
            client_range = position_codec.range_to_client_units(lines, server_range)
            quick_fixes = [propose_correct_text_quick_fix(s) for s in suggestions]
            yield Diagnostic(
                client_range,
                f'Spelling "{word}"',
                severity=DiagnosticSeverity.Hint,
                source="debputy",
                data=quick_fixes,
            )

163

164

def default_spellchecker() -> "Spellchecker":
    """Return the shared spellchecker, creating it on first use.

    The hunspell-backed checker is used when hunspell is available;
    otherwise the do-nothing checker is used.
    """
    global _DEFAULT_SPELL_CHECKER
    existing = _DEFAULT_SPELL_CHECKER
    if existing is not None:
        return existing
    created = HunspellSpellchecker() if _HAS_HUNSPELL else _do_nothing_spellchecker()
    _DEFAULT_SPELL_CHECKER = created
    return created

175

176

@functools.lru_cache()
def _do_nothing_spellchecker() -> "Spellchecker":
    """Return the cached spellchecker that never reports a correction."""
    shared_instance = EverythingIsCorrectSpellchecker()
    return shared_instance

180

181

def disable_spellchecking() -> None:
    """Make the do-nothing spellchecker the default spellchecker."""
    global _DEFAULT_SPELL_CHECKER
    replacement = _do_nothing_spellchecker()
    _DEFAULT_SPELL_CHECKER = replacement

185

186

def _skip_quoted_parts(line: str) -> Iterable[Tuple[str, int]]:
    """Yield the parts of `line` that lie outside quoted spans.

    A quoted span starts at a character matched by `_FIND_QUOTE_CHAR` and
    ends at the next occurrence of that same character.  Each result is a
    `(text, offset)` pair where `offset` is the index of `text` in `line`.

    Text before a quoted span is skipped when it is whitespace only.  The
    remainder after the last complete quoted span is always yielded; it
    includes any unterminated quote character.
    """
    cursor = 0
    while True:
        opening = _FIND_QUOTE_CHAR.search(line, cursor)
        closing_at = -1
        if opening is not None:
            opening_at = opening.start()
            closing_at = line.find(opening.group(), opening_at + 1)
        if closing_at < 0:
            # No further quote, or the quote is never closed: the rest of
            # the line is treated as unquoted.
            yield line[cursor:], cursor
            return

        unquoted = line[cursor:opening_at]
        if not unquoted.isspace():
            yield unquoted, cursor
        cursor = closing_at + 1

210

211

def _split_line_to_words(line: str) -> Iterable[Tuple[str, int, int]]:
    """Yield `(word, start, end)` for each checkable word in `line`.

    Quoted spans are skipped.  So are whitespace-delimited parts that start
    with `--` or that match the programming term, filename or email
    patterns.  `start` and `end` are indexes into `line`.
    """
    skip_patterns = (
        _LOOKS_LIKE_PROGRAMMING_TERM,
        _LOOKS_LIKE_FILENAME,
        _LOOKS_LIKE_EMAIL,
    )
    for segment, segment_start in _skip_quoted_parts(line):
        for token in _WORD_PARTS.finditer(segment):
            text = token.group(1)
            # "--" marks a CLI argument.
            if text.startswith("--") or any(p.match(text) for p in skip_patterns):
                continue
            # Index in `line` at which this token begins.
            base = segment_start + token.start(1)
            for hit in _PRUNE_SYMBOLS_RE.finditer(text):
                yield hit.group(1), base + hit.start(1), base + hit.end(1)

230

231

class Spellchecker:
    """Base class for spellcheckers.

    Subclasses must implement `provide_corrections_for` and `ignore_word`.
    """

    @staticmethod
    def do_nothing_spellchecker() -> "Spellchecker":
        """Return a new spellchecker that accepts every word."""
        return EverythingIsCorrectSpellchecker()

    def iter_words(self, line: str) -> Iterable[Tuple[str, int, int]]:
        """Yield `(word, start, end)` for each checkable word in `line`."""
        for word_span in _split_line_to_words(line):
            yield word_span

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return suggested corrections for `word`; must be overridden."""
        raise NotImplementedError()

    def ignore_word(self, word: str) -> None:
        """Mark `word` as correctly spelled; must be overridden."""
        raise NotImplementedError()

246

247

class EverythingIsCorrectSpellchecker(Spellchecker):
    """Spellchecker that never reports a spelling problem."""

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return the shared empty result: no word needs correcting."""
        return _NO_CORRECTIONS

    def ignore_word(self, word: str) -> None:
        """Do nothing; no word is ever checked, so nothing needs ignoring."""
        return None

255

256

class HunspellSpellchecker(Spellchecker):
    """Spellchecker backed by hunspell and its en_US dictionary.

    On construction, the built-in exception words and any personal
    dictionaries listed in `_PERSONAL_DICTS` are added to the checker.
    """

    def __init__(self) -> None:
        self._checker = HunSpell(_SPELL_CHECKER_DICT, _SPELL_CHECKER_AFF)
        # The lookup cache is per instance so that it is released with the
        # instance and can be invalidated when this instance's dictionary
        # changes.  Decorating the method itself would create one class-level
        # cache that keeps every instance alive and is never invalidated.
        self._cached_lookup = functools.lru_cache(maxsize=128)(
            self._uncached_lookup
        )
        for w in _builtin_exception_words():
            self._checker.add(w)
        self._load_personal_exclusions()

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return suggested corrections for `word`, or nothing if it is accepted.

        Words with well-known Debian tool or variable prefixes, and words
        with certain deliberate suffixes, are accepted without asking
        hunspell.
        """
        if word.startswith(
            (
                "dpkg-",
                "dh-",
                "dh_",
                "debian-",
                "debconf-",
                "update-",
                "DEB_",
                "DPKG_",
            )
        ):
            return _NO_CORRECTIONS
        # 'ing is deliberately forcing a word into another word-class
        if word.endswith(("'ing", "-nss")):
            return _NO_CORRECTIONS
        return self._lookup(word)

    def _lookup(self, word: str) -> Iterable[str]:
        """Return hunspell's suggestions for `word` via the per-instance cache."""
        return self._cached_lookup(word)

    def _uncached_lookup(self, word: str) -> Tuple[str, ...]:
        """Ask hunspell about `word`, bypassing the cache."""
        if self._checker.spell(word):
            return _NO_CORRECTIONS
        # The result is cached and shared between callers, so return an
        # immutable copy of the suggestions.
        return tuple(self._checker.suggest(word))

    def ignore_word(self, word: str) -> None:
        """Add `word` to the dictionary so it is no longer reported."""
        self._checker.add(word)
        # Suggestions cached before the word was added are now stale.
        self._cached_lookup.cache_clear()

    def _load_personal_exclusions(self) -> None:
        """Load each personal dictionary from `_PERSONAL_DICTS` that exists.

        A leading `${VAR}` in an entry is replaced with the value of that
        environment variable.  Entries whose variable is unset are skipped.
        """
        for filename in _PERSONAL_DICTS:
            if filename.startswith("${"):
                end_index = filename.index("}")
                varname = filename[2:end_index]
                value = os.environ.get(varname)
                if value is None:
                    continue
                filename = value + filename[end_index + 1 :]
            if os.path.isfile(filename):
                _info(f"Loading personal spelling dictionary from {filename}")
                self._checker.add_dic(filename)