Coverage for src/debputy/lsp/spellchecking.py: 71%

152 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2024-04-07 12:14 +0200

1import functools 

2import itertools 

3import os 

4import re 

5import subprocess 

6from typing import Iterable, FrozenSet, Tuple, Optional, List 

7 

8from debian.debian_support import Release 

9from lsprotocol.types import Diagnostic, Range, Position, DiagnosticSeverity 

10 

11from debputy.lsp.quickfixes import propose_correct_text_quick_fix 

12from debputy.lsp.text_util import LintCapablePositionCodec 

13from debputy.util import _info, _warn 

14 

# hunspell's en_US dictionary and affix files; both must exist for hunspell
# based checking to be enabled.
_SPELL_CHECKER_DICT = "/usr/share/hunspell/en_US.dic"
_SPELL_CHECKER_AFF = "/usr/share/hunspell/en_US.aff"

# A candidate "word" is any run of non-whitespace characters.
_WORD_PARTS = re.compile(r"(\S+)")
# Word-like pieces inside a candidate: word characters with at most one
# "-suffix" or "'suffix" attached; surrounding punctuation is left out.
_PRUNE_SYMBOLS_RE = re.compile(r"(\w+(?:-\w+|'\w+)?)")
# Characters that delimit quoted spans (double quote or backtick).
_FIND_QUOTE_CHAR = re.compile(r'["`]')
# Candidates that look like a path or a file name with a short extension.
_LOOKS_LIKE_FILENAME = re.compile(
    r"""
      [.]{0,3}/[a-z0-9]+(/[a-z0-9]+)+/*
    | [a-z0-9-_]+(/[a-z0-9]+)+/*
    | [a-z0-9_]+(/[a-z0-9_]+){2,}/*
    | (?:\S+)?[.][a-z]{1,3}

""",
    re.VERBOSE,
)
# Candidates that look like identifiers, options, versions and similar
# technical tokens rather than prose.
_LOOKS_LIKE_PROGRAMMING_TERM = re.compile(
    r"""
    (
        # Java identifier Camel Case
          [a-z][a-z0-9]*(?:[A-Z]{1,3}[a-z0-9]+)+
        # Type name Camel Case
        | [A-Z]{1,3}[a-z0-9]+(?:[A-Z]{1,3}[a-z0-9]+)+
        # Type name Camel Case with underscore (seen in Dh_Lib.pm among other
        | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)+
        # Perl module
        | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*(::[A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*)+
        # Probably an abbreviation
        | [A-Z]{3,}
        # Perl/Python identifiers or Jinja templates
        | [$%&@_]?[{]?[{]?[a-z][a-z0-9]*(?:_[a-z0-9]+)+(?:(?:->)?[\[{]\S+|}}?)?
        # SCREAMING_SNAKE_CASE (environment variables plus -DVAR=B or $FOO)
        | [-$%&*_]{0,2}[A-Z][A-Z0-9]*(_[A-Z0-9]+)+(?:=\S+)?
        | \#[A-Z][A-Z0-9]*(_[A-Z0-9]+)+\#
        # Subcommand names. Require at least two "-" to avoid skipping hyphenated words
        | [a-z][a-z0-9]*(-[a-z0-9]+){2,}
        # Short args
        | -[a-z0-9]+
        # Things like 32bit
        | \d{2,}-?[a-z]+
        # Source package (we do not have a package without prefix/suffix because it covers 95% of all lowercase words)
        | src:[a-z0-9][-+.a-z0-9]+
        | [a-z0-9][-+.a-z0-9]+:(?:any|native)
        # Version
        | v\d+(?:[.]\S+)?
        # chmod symbolic mode or math
        | \S*=\S+
    )
""",
    re.VERBOSE,
)
# An email address wrapped in angle brackets.
_LOOKS_LIKE_EMAIL = re.compile(
    r"""
    <[^>@\s]+@[^>@\s]+>
""",
    re.VERBOSE,
)
# Shared empty result meaning "nothing to correct".
_NO_CORRECTIONS = ()
# Word lists read from the directory containing this module.
_WORDLISTS = ["debian-wordlist.dic"]
# Name lists read from the same directory; each name is also accepted with
# a trailing "'s".
_NAMELISTS = ["logins-and-people.dic"]
# Personal dictionaries; a leading "${VAR}" is replaced by the value of that
# environment variable when the dictionaries are loaded.
_PERSONAL_DICTS = [
    "${HOME}/.hunspell_default",
    "${HOME}/.hunspell_en_US",
]

82 

83 

# hunspell support is optional: it is enabled only when both en_US data files
# exist and the ``hunspell`` module can be imported.  A missing data file is
# reported the same way as a failed import.
try:
    if not (
        os.path.lexists(_SPELL_CHECKER_DICT) and os.path.lexists(_SPELL_CHECKER_AFF)
    ):
        raise ImportError
    from hunspell import HunSpell
except ImportError:
    _HAS_HUNSPELL = False
else:
    _HAS_HUNSPELL = True

94 

95 

def _read_wordlist(
    base_dir: str, wordlist_name: str, *, namelist: bool = False
) -> Iterable[str]:
    """Yield the entries of a word list file, one per non-blank line.

    The file is decoded as UTF-8 regardless of the process locale, so that
    non-ASCII entries load the same way in every environment.  Surrounding
    whitespace is stripped from each line and blank lines are skipped.  The
    file is read completely and closed before the first entry is yielded.

    :param base_dir: Directory containing the word list.
    :param wordlist_name: File name of the word list inside *base_dir*.
    :param namelist: When true, every entry is additionally yielded with a
      trailing ``'s`` (after all plain entries have been yielded).
    :return: An iterable of the entries.
    :raises OSError: If the file cannot be opened.
    """
    path = os.path.join(base_dir, wordlist_name)
    with open(path, encoding="utf-8") as fd:
        # Blank lines would otherwise turn into "" (and "'s" for name lists).
        words = [entry for entry in map(str.strip, fd) if entry]
    yield from words
    if namelist:
        yield from (f"{name}'s" for name in words)

104 

105 

def _all_debian_archs() -> Iterable[str]:
    """Return the lines printed by ``dpkg-architecture -L``, stripped.

    If the command is not installed or exits with an error, a warning is
    emitted and an empty tuple is returned instead.
    """
    command = ["dpkg-architecture", "-L"]
    try:
        raw_listing = subprocess.check_output(command)
    except (FileNotFoundError, subprocess.CalledProcessError) as e:
        _warn(f"dpkg-architecture -L failed: {e}")
        return ()

    listing = raw_listing.decode("utf-8")
    return (arch.strip() for arch in listing.splitlines())

114 

115 

@functools.lru_cache
def _builtin_exception_words() -> FrozenSet[str]:
    """Return the set of words that are always accepted (computed once).

    The set combines the word lists and name lists located next to this
    module, the entries obtained by iterating ``Release.releases`` and the
    output of ``_all_debian_archs()``.
    """
    module_dir = os.path.dirname(__file__)
    release_names = Release.releases
    # Queried before any list is read, matching the original evaluation order.
    arch_names = _all_debian_archs()

    accepted = set()
    for wordlist in _WORDLISTS:
        accepted.update(_read_wordlist(module_dir, wordlist))
    for names in _NAMELISTS:
        accepted.update(_read_wordlist(module_dir, names, namelist=True))
    accepted.update(release_names)
    accepted.update(arch_names)
    return frozenset(accepted)

132 

133 

# Process-wide spellchecker; stays None until default_spellchecker() creates
# one or disable_spellchecking() installs the no-op checker.
_DEFAULT_SPELL_CHECKER: Optional["Spellchecker"] = None

135 

136 

def spellcheck_line(
    lines: List[str],
    position_codec: LintCapablePositionCodec,
    line_no: int,
    line: str,
) -> Iterable[Diagnostic]:
    """Yield a hint-level diagnostic for each word in *line* that has corrections.

    :param lines: The document lines, passed to *position_codec* when
      converting ranges.
    :param position_codec: Converts the computed range into client units.
    :param line_no: Line number used for both ends of each reported range.
    :param line: The text to check.
    :return: Diagnostics carrying one quick fix per suggested correction.
    """
    checker = default_spellchecker()
    for word, start, end in checker.iter_words(line):
        suggestions = checker.provide_corrections_for(word)
        if suggestions:
            server_range = Range(
                Position(line_no, start),
                Position(line_no, end),
            )
            client_range = position_codec.range_to_client_units(lines, server_range)
            yield Diagnostic(
                client_range,
                f'Spelling "{word}"',
                severity=DiagnosticSeverity.Hint,
                source="debputy",
                data=[propose_correct_text_quick_fix(fix) for fix in suggestions],
            )

163 

164 

def default_spellchecker() -> "Spellchecker":
    """Return the process-wide spellchecker, creating it on first use.

    A hunspell-backed checker is created when hunspell is available;
    otherwise the shared no-op checker is used.
    """
    global _DEFAULT_SPELL_CHECKER
    if _DEFAULT_SPELL_CHECKER is None:
        _DEFAULT_SPELL_CHECKER = (
            HunspellSpellchecker() if _HAS_HUNSPELL else _do_nothing_spellchecker()
        )
    return _DEFAULT_SPELL_CHECKER

175 

176 

@functools.lru_cache()
def _do_nothing_spellchecker() -> "Spellchecker":
    """Return the shared no-op spellchecker (created once, then cached)."""
    noop_checker = EverythingIsCorrectSpellchecker()
    return noop_checker

180 

181 

def disable_spellchecking() -> None:
    """Make the no-op spellchecker the process-wide default."""
    global _DEFAULT_SPELL_CHECKER
    noop_checker = _do_nothing_spellchecker()
    _DEFAULT_SPELL_CHECKER = noop_checker

185 

186 

def _skip_quoted_parts(line: str) -> Iterable[Tuple[str, int]]:
    """Yield ``(text, offset)`` for the parts of *line* outside quoted spans.

    A quoted span starts at a double quote or backtick and ends at the next
    occurrence of the same character.  If an opening quote has no matching
    closing quote, the rest of the line (from the current position, so
    including that quote character) is yielded and iteration stops.  Parts
    consisting solely of whitespace are skipped; an empty part is still
    yielded.
    """
    cursor = 0
    while True:
        opening = _FIND_QUOTE_CHAR.search(line, cursor)
        if opening is None:
            # No further quotes: the remainder is all plain text.
            yield line[cursor:], cursor
            return

        open_at = opening.start()
        close_at = line.find(opening.group(), open_at + 1)
        if close_at == -1:
            # Unterminated quote: treat the remainder as plain text.
            yield line[cursor:], cursor
            return

        unquoted = line[cursor:open_at]
        if not unquoted.isspace():
            yield unquoted, cursor
        cursor = close_at + 1

210 

211 

def _split_line_to_words(line: str) -> Iterable[Tuple[str, int, int]]:
    """Yield ``(word, start, end)`` for each spellcheckable word in *line*.

    Quoted spans are excluded first.  Each remaining whitespace-delimited
    token is dropped when it starts with ``--`` or when its beginning matches
    the programming-term, file-name or email patterns.  Surviving tokens are
    reduced to their word-like pieces; ``start`` and ``end`` are offsets into
    *line*.
    """
    skip_patterns = (
        _LOOKS_LIKE_PROGRAMMING_TERM,
        _LOOKS_LIKE_FILENAME,
        _LOOKS_LIKE_EMAIL,
    )
    for fragment, fragment_start in _skip_quoted_parts(line):
        for token_match in _WORD_PARTS.finditer(fragment):
            token = token_match.group(1)
            if token.startswith("--"):
                # CLI arg
                continue
            if any(pattern.match(token) for pattern in skip_patterns):
                continue
            # Offset of the token within the full line.
            base = fragment_start + token_match.start(1)
            for piece in _PRUNE_SYMBOLS_RE.finditer(token):
                begin, end = piece.span(1)
                yield piece.group(1), base + begin, base + end

230 

231 

class Spellchecker:
    """Base interface for spellcheckers.

    Subclasses must implement ``provide_corrections_for`` and ``ignore_word``;
    word splitting is shared.
    """

    @staticmethod
    def do_nothing_spellchecker() -> "Spellchecker":
        """Return a new spellchecker that accepts every word."""
        return EverythingIsCorrectSpellchecker()

    def iter_words(self, line: str) -> Iterable[Tuple[str, int, int]]:
        """Yield ``(word, start, end)`` for each word in *line* to be checked."""
        yield from _split_line_to_words(line)

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return suggested corrections for *word*; empty when it is accepted."""
        raise NotImplementedError

    def ignore_word(self, word: str) -> None:
        """Accept *word* from now on."""
        raise NotImplementedError

246 

247 

class EverythingIsCorrectSpellchecker(Spellchecker):
    """Spellchecker that never reports anything."""

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return the shared empty result for every *word*."""
        return _NO_CORRECTIONS

    def ignore_word(self, word: str) -> None:
        """Do nothing."""
        # It is hard to ignore words, when you never check them in the first place.
        return None

255 

256 

class HunspellSpellchecker(Spellchecker):
    """Spellchecker backed by hunspell and its en_US dictionary.

    On construction the built-in exception words are added to the checker and
    any existing personal dictionaries from ``_PERSONAL_DICTS`` are loaded.
    """

    def __init__(self) -> None:
        self._checker = HunSpell(_SPELL_CHECKER_DICT, _SPELL_CHECKER_AFF)
        for w in _builtin_exception_words():
            self._checker.add(w)
        self._load_personal_exclusions()

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return suggested replacements for *word*; empty when it is accepted.

        Words with certain Debian tooling prefixes, and words ending in
        ``'ing`` or ``-nss``, are accepted without consulting hunspell.
        """
        if word.startswith(
            (
                "dpkg-",
                "dh-",
                "dh_",
                "debian-",
                "debconf-",
                "update-",
                "DEB_",
                "DPKG_",
            )
        ):
            return _NO_CORRECTIONS
        # 'ing is deliberately forcing a word into another word-class
        if word.endswith(("'ing", "-nss")):
            return _NO_CORRECTIONS
        return self._lookup(word)

    @functools.lru_cache(128)
    def _lookup(self, word: str) -> Iterable[str]:
        """Ask hunspell about *word*, caching the most recent answers.

        The result is an immutable tuple so that a cached answer cannot be
        altered by one caller and then observed by another.
        """
        if self._checker.spell(word):
            return _NO_CORRECTIONS
        return tuple(self._checker.suggest(word))

    def ignore_word(self, word: str) -> None:
        """Add *word* to the checker so it is accepted from now on."""
        self._checker.add(word)
        # Answers cached before the word was added would still report it as
        # misspelled; drop them so the next lookup sees the new state.
        HunspellSpellchecker._lookup.cache_clear()

    def _load_personal_exclusions(self) -> None:
        """Load each personal dictionary from ``_PERSONAL_DICTS`` that exists.

        A leading ``${VAR}`` in an entry is replaced by the value of that
        environment variable; the entry is skipped when the variable is unset.
        """
        for filename in _PERSONAL_DICTS:
            if filename.startswith("${"):
                end_index = filename.index("}")
                varname = filename[2:end_index]
                value = os.environ.get(varname)
                if value is None:
                    continue
                filename = value + filename[end_index + 1 :]
            if os.path.isfile(filename):
                _info(f"Loading personal spelling dictionary from {filename}")
                self._checker.add_dic(filename)