Coverage for src/debputy/lsp/spellchecking.py: 71%
152 statements
coverage.py v7.2.7, created at 2024-04-07 12:14 +0200

import functools
import itertools
import os
import re
import subprocess
from typing import Iterable, FrozenSet, Tuple, Optional, List

from debian.debian_support import Release
from lsprotocol.types import Diagnostic, Range, Position, DiagnosticSeverity

from debputy.lsp.quickfixes import propose_correct_text_quick_fix
from debputy.lsp.text_util import LintCapablePositionCodec
from debputy.util import _info, _warn

_SPELL_CHECKER_DICT = "/usr/share/hunspell/en_US.dic"
_SPELL_CHECKER_AFF = "/usr/share/hunspell/en_US.aff"
_WORD_PARTS = re.compile(r"(\S+)")
_PRUNE_SYMBOLS_RE = re.compile(r"(\w+(?:-\w+|'\w+)?)")
_FIND_QUOTE_CHAR = re.compile(r'["`]')
_LOOKS_LIKE_FILENAME = re.compile(
    r"""
    [.]{0,3}/[a-z0-9]+(/[a-z0-9]+)+/*
    | [a-z0-9-_]+(/[a-z0-9]+)+/*
    | [a-z0-9_]+(/[a-z0-9_]+){2,}/*
    | (?:\S+)?[.][a-z]{1,3}
""",
    re.VERBOSE,
)
_LOOKS_LIKE_PROGRAMMING_TERM = re.compile(
    r"""
    (
    # Java identifier Camel Case
    [a-z][a-z0-9]*(?:[A-Z]{1,3}[a-z0-9]+)+
    # Type name Camel Case
    | [A-Z]{1,3}[a-z0-9]+(?:[A-Z]{1,3}[a-z0-9]+)+
    # Type name Camel Case with underscore (seen in Dh_Lib.pm among others)
    | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)+
    # Perl module
    | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*(::[A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*)+
    # Probably an abbreviation
    | [A-Z]{3,}
    # Perl/Python identifiers or Jinja templates
    | [$%&@_]?[{]?[{]?[a-z][a-z0-9]*(?:_[a-z0-9]+)+(?:(?:->)?[\[{]\S+|}}?)?
    # SCREAMING_SNAKE_CASE (environment variables plus -DVAR=B or $FOO)
    | [-$%&*_]{0,2}[A-Z][A-Z0-9]*(_[A-Z0-9]+)+(?:=\S+)?
    | \#[A-Z][A-Z0-9]*(_[A-Z0-9]+)+\#
    # Subcommand names. Require at least two "-" to avoid skipping hyphenated words
    | [a-z][a-z0-9]*(-[a-z0-9]+){2,}
    # Short args
    | -[a-z0-9]+
    # Things like 32bit
    | \d{2,}-?[a-z]+
    # Source package (we do not match packages without a prefix/suffix, because that would cover 95% of all lowercase words)
    | src:[a-z0-9][-+.a-z0-9]+
    | [a-z0-9][-+.a-z0-9]+:(?:any|native)
    # Version
    | v\d+(?:[.]\S+)?
    # chmod symbolic mode or math
    | \S*=\S+
    )
""",
    re.VERBOSE,
)
_LOOKS_LIKE_EMAIL = re.compile(
    r"""
    <[^>@\s]+@[^>@\s]+>
""",
    re.VERBOSE,
)
_NO_CORRECTIONS = tuple()
_WORDLISTS = [
    "debian-wordlist.dic",
]
_NAMELISTS = [
    "logins-and-people.dic",
]
_PERSONAL_DICTS = [
    "${HOME}/.hunspell_default",
    "${HOME}/.hunspell_en_US",
]
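

# Illustrative sketch (not part of the original module): a few tokens that the
# heuristics above are intended to treat as "not prose", so they never reach
# the spell checker. The list is an assumption based on the regexes as written,
# not an authoritative or exhaustive set.
_EXAMPLE_SKIPPED_TOKENS = [
    "DEB_BUILD_OPTIONS",  # SCREAMING_SNAKE_CASE (environment variable)
    "dh_installdocs",  # snake_case identifier / helper name
    "src:debhelper",  # source package reference
    "/usr/share/doc",  # looks like a filename
    "<someone@example.org>",  # email address in angle brackets
]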


try:
    if not os.path.lexists(_SPELL_CHECKER_DICT) or not os.path.lexists(  # coverage: condition never true
        _SPELL_CHECKER_AFF
    ):
        raise ImportError
    from hunspell import HunSpell

    _HAS_HUNSPELL = True
except ImportError:
    _HAS_HUNSPELL = False


def _read_wordlist(
    base_dir: str, wordlist_name: str, *, namelist: bool = False
) -> Iterable[str]:
    with open(os.path.join(base_dir, wordlist_name)) as fd:
        w = [w.strip() for w in fd]
        yield from w
        if namelist:
            yield from (f"{n}'s" for n in w)


def _all_debian_archs() -> Iterable[str]:
    try:
        output = subprocess.check_output(["dpkg-architecture", "-L"])
    except (FileNotFoundError, subprocess.CalledProcessError) as e:
        _warn(f"dpkg-architecture -L failed: {e}")
        return tuple()

    return (x.strip() for x in output.decode("utf-8").splitlines())


@functools.lru_cache
def _builtin_exception_words() -> FrozenSet[str]:
    basedirs = os.path.dirname(__file__)
    release_names = (x for x in Release.releases)
    return frozenset(
        itertools.chain(
            itertools.chain.from_iterable(
                _read_wordlist(basedirs, wl) for wl in _WORDLISTS
            ),
            itertools.chain.from_iterable(
                _read_wordlist(basedirs, wl, namelist=True) for wl in _NAMELISTS
            ),
            release_names,
            _all_debian_archs(),
        )
    )
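

# Hedged illustration (not part of the original module): the builtin exception
# words combine the bundled wordlists, the Debian release names known to
# python-debian, and the architecture names reported by dpkg-architecture, so
# membership checks like the one below are assumed to be cheap after the first
# (cached) call.
def _example_is_known_word(word: str) -> bool:
    return word in _builtin_exception_words()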


_DEFAULT_SPELL_CHECKER: Optional["Spellchecker"] = None


def spellcheck_line(
    lines: List[str],
    position_codec: LintCapablePositionCodec,
    line_no: int,
    line: str,
) -> Iterable[Diagnostic]:
    spell_checker = default_spellchecker()
    for word, pos, endpos in spell_checker.iter_words(line):
        corrections = spell_checker.provide_corrections_for(word)
        if not corrections:
            continue
        word_range_server_units = Range(
            Position(line_no, pos),
            Position(line_no, endpos),
        )
        word_range = position_codec.range_to_client_units(
            lines,
            word_range_server_units,
        )
        yield Diagnostic(
            word_range,
            f'Spelling "{word}"',
            severity=DiagnosticSeverity.Hint,
            source="debputy",
            data=[propose_correct_text_quick_fix(c) for c in corrections],
        )
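

# Hedged usage sketch (illustration only, not part of the original module):
# how a lint pass might spell-check a whole document by calling
# spellcheck_line() once per line. The `lines` and `position_codec` values are
# assumed to come from the LSP server's document state; the helper name is
# hypothetical.
def _example_spellcheck_document(
    lines: List[str],
    position_codec: LintCapablePositionCodec,
) -> List[Diagnostic]:
    diagnostics: List[Diagnostic] = []
    for line_no, line in enumerate(lines):
        diagnostics.extend(spellcheck_line(lines, position_codec, line_no, line))
    return diagnostics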


def default_spellchecker() -> "Spellchecker":
    global _DEFAULT_SPELL_CHECKER
    spellchecker = _DEFAULT_SPELL_CHECKER
    if spellchecker is None:
        if _HAS_HUNSPELL:  # coverage: condition never false
            spellchecker = HunspellSpellchecker()
        else:
            spellchecker = _do_nothing_spellchecker()
        _DEFAULT_SPELL_CHECKER = spellchecker
    return spellchecker


@functools.lru_cache()
def _do_nothing_spellchecker() -> "Spellchecker":
    return EverythingIsCorrectSpellchecker()


def disable_spellchecking() -> None:
    global _DEFAULT_SPELL_CHECKER
    _DEFAULT_SPELL_CHECKER = _do_nothing_spellchecker()


def _skip_quoted_parts(line: str) -> Iterable[Tuple[str, int]]:
    current_pos = 0
    while True:
        try:
            m = _FIND_QUOTE_CHAR.search(line, current_pos)
            if m is None:  # coverage: condition never false
                if current_pos == 0:  # coverage: condition never false
                    yield line, 0
                else:
                    yield line[current_pos:], current_pos
                return
            starting_marker_pos = m.span()[0]
            quote_char = m.group()
            end_marker_pos = line.index(quote_char, starting_marker_pos + 1)
        except ValueError:
            yield line[current_pos:], current_pos
            return

        part = line[current_pos:starting_marker_pos]

        if not part.isspace():
            yield part, current_pos
        current_pos = end_marker_pos + 1
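

# Hedged illustration (not part of the original module): _skip_quoted_parts()
# yields (text, offset) pairs for the unquoted stretches of a line, skipping
# anything wrapped in double quotes or backticks. For the sample line below,
# the expectation is two parts: the text before the backtick-quoted option and
# the text after it, each with its starting offset in the original line.
def _example_skip_quoted() -> List[Tuple[str, int]]:
    return list(_skip_quoted_parts("Run the tool with `--no-act` before the real build"))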


def _split_line_to_words(line: str) -> Iterable[Tuple[str, int, int]]:
    for line_part, part_pos in _skip_quoted_parts(line):
        for m in _WORD_PARTS.finditer(line_part):
            fullword = m.group(1)
            if fullword.startswith("--"):  # coverage: condition never true
                # CLI arg
                continue
            if _LOOKS_LIKE_PROGRAMMING_TERM.match(fullword):  # coverage: condition never true
                continue
            if _LOOKS_LIKE_FILENAME.match(fullword):  # coverage: condition never true
                continue
            if _LOOKS_LIKE_EMAIL.match(fullword):  # coverage: condition never true
                continue
            mpos = m.span(1)[0]
            for sm in _PRUNE_SYMBOLS_RE.finditer(fullword):
                pos, endpos = sm.span(1)
                offset = part_pos + mpos
                yield sm.group(1), pos + offset, endpos + offset
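

# Hedged illustration (not part of the original module): _split_line_to_words()
# yields (word, start, end) tuples for the tokens that survive the quoting and
# "looks like code/filename/email" filters, with positions relative to the
# original line. The exact output depends on the regexes above; this only
# shows the shape of the result.
def _example_split_words() -> List[Tuple[str, int, int]]:
    return list(_split_line_to_words("Ensure the daemon restarts after an upgrade"))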


class Spellchecker:

    @staticmethod
    def do_nothing_spellchecker() -> "Spellchecker":
        return EverythingIsCorrectSpellchecker()

    def iter_words(self, line: str) -> Iterable[Tuple[str, int, int]]:
        yield from _split_line_to_words(line)

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        raise NotImplementedError

    def ignore_word(self, word: str) -> None:
        raise NotImplementedError


class EverythingIsCorrectSpellchecker(Spellchecker):
    def provide_corrections_for(self, word: str) -> Iterable[str]:
        return _NO_CORRECTIONS

    def ignore_word(self, word: str) -> None:
        # It is hard to ignore words when you never check them in the first place.
        pass


class HunspellSpellchecker(Spellchecker):

    def __init__(self) -> None:
        self._checker = HunSpell(_SPELL_CHECKER_DICT, _SPELL_CHECKER_AFF)
        for w in _builtin_exception_words():
            self._checker.add(w)
        self._load_personal_exclusions()

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        if word.startswith(  # coverage: condition never true
            (
                "dpkg-",
                "dh-",
                "dh_",
                "debian-",
                "debconf-",
                "update-",
                "DEB_",
                "DPKG_",
            )
        ):
            return _NO_CORRECTIONS
        # 'ing is deliberately forcing a word into another word-class
        if word.endswith(("'ing", "-nss")):  # coverage: condition never true
            return _NO_CORRECTIONS
        return self._lookup(word)

    @functools.lru_cache(128)
    def _lookup(self, word: str) -> Iterable[str]:
        if self._checker.spell(word):  # coverage: condition never false
            return _NO_CORRECTIONS
        return self._checker.suggest(word)

    def ignore_word(self, word: str) -> None:
        self._checker.add(word)

    def _load_personal_exclusions(self) -> None:
        for filename in _PERSONAL_DICTS:
            if filename.startswith("${"):  # coverage: condition never false
                end_index = filename.index("}")
                varname = filename[2:end_index]
                value = os.environ.get(varname)
                if value is None:  # coverage: condition never true
                    continue
                filename = value + filename[end_index + 1 :]
            if os.path.isfile(filename):  # coverage: condition never true
                _info(f"Loading personal spelling dictionary from {filename}")
                self._checker.add_dic(filename)
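

# Hedged usage sketch (illustration only, not part of the original module):
# asking the module-level spellchecker for corrections directly. With hunspell
# and the en_US dictionaries installed this goes through HunspellSpellchecker;
# otherwise the do-nothing checker is active and an empty sequence is returned.
def _example_corrections(word: str) -> List[str]:
    checker = default_spellchecker()
    return list(checker.provide_corrections_for(word))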