summaryrefslogtreecommitdiffstats
path: root/src/debputy/lsp/spellchecking.py
blob: f9027af63418d1e0fc8b7abc4662b78a73108a81 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
import functools
import itertools
import os
import re
import subprocess
from typing import Iterable, FrozenSet, Tuple, Optional, List

from debian.debian_support import Release
from lsprotocol.types import Diagnostic, Range, Position, DiagnosticSeverity

from debputy.lsp.quickfixes import propose_correct_text_quick_fix
from debputy.lsp.text_util import LintCapablePositionCodec
from debputy.util import _info, _warn

# Paths of the hunspell en_US dictionary and affix files. Both must exist for the
# hunspell-backed spellchecker to be enabled (see the import guard below).
_SPELL_CHECKER_DICT = "/usr/share/hunspell/en_US.dic"
_SPELL_CHECKER_AFF = "/usr/share/hunspell/en_US.aff"
# A "word part" is a maximal run of non-whitespace characters.
_WORD_PARTS = re.compile(r"(\S+)")
# Extracts the words inside a word part: a run of word characters optionally
# followed by a single "-suffix" or "'suffix"; surrounding symbols are dropped.
_PRUNE_SYMBOLS_RE = re.compile(r"(\w+(?:-\w+|'\w+)?)")
# Characters that open (and close) a quoted span. Note that "'" is not one of them.
_FIND_QUOTE_CHAR = re.compile(r'["`]')
# Heuristic for tokens that resemble paths or file names.
# NOTE: _split_line_to_words applies this with .match(), so it is anchored at the
# start of the token only (a matching prefix is enough).
_LOOKS_LIKE_FILENAME = re.compile(
    r"""
      [.]{0,3}/[a-z0-9]+(/[a-z0-9]+)+/*
    | [a-z0-9-_]+(/[a-z0-9]+)+/*
    | [a-z0-9_]+(/[a-z0-9_]+){2,}/*
    | (?:\S+)?[.][a-z]{1,3}

""",
    re.VERBOSE,
)
# Heuristic for tokens that resemble identifiers, options, versions and similar
# technical terms (each alternative is described inline). Like the file name
# pattern, it is applied with .match(), i.e. anchored at the start of the token only.
_LOOKS_LIKE_PROGRAMMING_TERM = re.compile(
    r"""
    (
        # Java identifier Camel Case
          [a-z][a-z0-9]*(?:[A-Z]{1,3}[a-z0-9]+)+
        # Type name Camel Case
        | [A-Z]{1,3}[a-z0-9]+(?:[A-Z]{1,3}[a-z0-9]+)+
        # Type name Camel Case with underscore (seen in Dh_Lib.pm among other
        | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)+
        # Perl module
        | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*(::[A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*)+
        # Probably an abbreviation
        | [A-Z]{3,}
        # Perl/Python identifiers or Jinja templates
        | [$%&@_]?[{]?[{]?[a-z][a-z0-9]*(?:_[a-z0-9]+)+(?:(?:->)?[\[{]\S+|}}?)?
        # SCREAMING_SNAKE_CASE (environment variables plus -DVAR=B or $FOO)
        | [-$%&*_]{0,2}[A-Z][A-Z0-9]*(_[A-Z0-9]+)+(?:=\S+)?
        | \#[A-Z][A-Z0-9]*(_[A-Z0-9]+)+\#
        # Subcommand names. Require at least two "-" to avoid skipping hyphenated words
        | [a-z][a-z0-9]*(-[a-z0-9]+){2,}
        # Short args
        | -[a-z0-9]+
        # Things like 32bit
        | \d{2,}-?[a-z]+
        # Source package (we do not have a package without prefix/suffix because it covers 95% of all lowercase words)
        | src:[a-z0-9][-+.a-z0-9]+
        | [a-z0-9][-+.a-z0-9]+:(?:any|native)
        # Version
        | v\d+(?:[.]\S+)?
        # chmod symbolic mode or math
        | \S*=\S+
    )
""",
    re.VERBOSE,
)
# Matches an email address wrapped in angle brackets, e.g. "<user@host>".
_LOOKS_LIKE_EMAIL = re.compile(
    r"""
    <[^>@\s]+@[^>@\s]+>
""",
    re.VERBOSE,
)
# Shared empty result meaning "nothing to correct".
_NO_CORRECTIONS = tuple()
# Word list files, looked up in the directory of this module, whose entries are
# always accepted as correctly spelled.
_WORDLISTS = [
    "debian-wordlist.dic",
]
# Like _WORDLISTS, but each entry is additionally accepted in its "<entry>'s" form.
_NAMELISTS = [
    "logins-and-people.dic",
]
# Personal hunspell dictionaries loaded when present. A leading "${VAR}" is replaced
# with the value of that environment variable; the entry is skipped if it is unset.
_PERSONAL_DICTS = [
    "${HOME}/.hunspell_default",
    "${HOME}/.hunspell_en_US",
]


# hunspell support is optional: it is only enabled when both en_US data files
# exist and the "hunspell" Python module can be imported.
_HAS_HUNSPELL = False
if os.path.lexists(_SPELL_CHECKER_DICT) and os.path.lexists(_SPELL_CHECKER_AFF):
    try:
        from hunspell import HunSpell
    except ImportError:
        pass
    else:
        _HAS_HUNSPELL = True


def _read_wordlist(
    base_dir: str, wordlist_name: str, *, namelist: bool = False
) -> Iterable[str]:
    with open(os.path.join(base_dir, wordlist_name)) as fd:
        w = [w.strip() for w in fd]
        yield from w
        if namelist:
            yield from (f"{n}'s" for n in w)


def _all_debian_archs() -> Iterable[str]:
    """Return the lines printed by `dpkg-architecture -L`, stripped of whitespace.

    If the command is not installed or exits with an error, a warning is emitted
    and an empty tuple is returned instead.
    """
    command = ["dpkg-architecture", "-L"]
    try:
        raw_output = subprocess.check_output(command)
    except (FileNotFoundError, subprocess.CalledProcessError) as e:
        _warn(f"dpkg-architecture -L failed: {e}")
        return tuple()

    listing = raw_output.decode("utf-8").splitlines()
    return (entry.strip() for entry in listing)


@functools.lru_cache
def _builtin_exception_words() -> FrozenSet[str]:
    """Return the set of words that are always accepted as correctly spelled.

    The set combines the bundled word lists, the bundled name lists (including
    their possessive forms), the entries of `Release.releases` and the output of
    `dpkg-architecture -L`. The result is computed once and then cached.
    """
    module_dir = os.path.dirname(__file__)
    known_archs = _all_debian_archs()

    words = set()
    for list_name in _WORDLISTS:
        words.update(_read_wordlist(module_dir, list_name))
    for list_name in _NAMELISTS:
        words.update(_read_wordlist(module_dir, list_name, namelist=True))
    words.update(Release.releases)
    words.update(known_archs)
    return frozenset(words)


# Process-wide spellchecker instance. It is created lazily by default_spellchecker()
# and replaced with a do-nothing checker by disable_spellchecking().
_DEFAULT_SPELL_CHECKER: Optional["Spellchecker"] = None


def spellcheck_line(
    lines: List[str],
    position_codec: LintCapablePositionCodec,
    line_no: int,
    line: str,
) -> Iterable[Diagnostic]:
    """Yield a hint-level diagnostic for every word in `line` that has corrections.

    :param lines: The lines of the document; passed on to the position codec
      when converting ranges.
    :param position_codec: Codec used to convert the word range (offsets into
      `line`) to client units.
    :param line_no: Line number used for both ends of each reported range.
    :param line: The text to spellcheck.
    :return: One diagnostic per flagged word, carrying a quick fix for each
      suggested correction in its `data`.
    """
    checker = default_spellchecker()
    for word, start, end in checker.iter_words(line):
        suggestions = checker.provide_corrections_for(word)
        if suggestions:
            server_range = Range(Position(line_no, start), Position(line_no, end))
            client_range = position_codec.range_to_client_units(lines, server_range)
            quick_fixes = [propose_correct_text_quick_fix(s) for s in suggestions]
            yield Diagnostic(
                client_range,
                f'Spelling "{word}"',
                severity=DiagnosticSeverity.Hint,
                source="debputy",
                data=quick_fixes,
            )


def default_spellchecker() -> "Spellchecker":
    """Return the process-wide spellchecker, creating it on first use.

    A hunspell-backed checker is created when hunspell support is available;
    otherwise the shared do-nothing checker is used.
    """
    global _DEFAULT_SPELL_CHECKER
    if _DEFAULT_SPELL_CHECKER is not None:
        return _DEFAULT_SPELL_CHECKER
    created = HunspellSpellchecker() if _HAS_HUNSPELL else _do_nothing_spellchecker()
    _DEFAULT_SPELL_CHECKER = created
    return created


@functools.lru_cache()
def _do_nothing_spellchecker() -> "Spellchecker":
    """Return the shared (cached) spellchecker that never reports anything."""
    noop_checker = EverythingIsCorrectSpellchecker()
    return noop_checker


def disable_spellchecking() -> None:
    """Make the do-nothing spellchecker the process-wide default."""
    global _DEFAULT_SPELL_CHECKER
    noop_checker = _do_nothing_spellchecker()
    _DEFAULT_SPELL_CHECKER = noop_checker


def _skip_quoted_parts(line: str) -> Iterable[Tuple[str, int]]:
    """Yield the parts of `line` that are outside quoted spans.

    A quoted span starts at a quote character matched by `_FIND_QUOTE_CHAR` and
    ends at the next occurrence of that same character. If a quote is never
    closed, the quote and everything after it are treated as ordinary text and
    returned as part of the final segment.

    Parts that are empty or consist solely of whitespace are not yielded, since
    they cannot contain any words.

    :param line: The text to split.
    :return: `(text, offset)` pairs where `offset` is the index of `text` in `line`.
    """
    current_pos = 0
    while True:
        m = _FIND_QUOTE_CHAR.search(line, current_pos)
        if m is None:
            break
        starting_marker_pos = m.start()
        end_marker_pos = line.find(m.group(), starting_marker_pos + 1)
        if end_marker_pos < 0:
            # Unterminated quote; the remainder is handled as plain text below.
            break

        part = line[current_pos:starting_marker_pos]
        # "".isspace() is False, so emptiness must be checked explicitly.
        if part and not part.isspace():
            yield part, current_pos
        current_pos = end_marker_pos + 1

    remainder = line[current_pos:]
    if remainder and not remainder.isspace():
        yield remainder, current_pos


def _split_line_to_words(line: str) -> Iterable[Tuple[str, int, int]]:
    """Yield the spellcheckable words of `line` along with their positions.

    Quoted spans are skipped first. The remaining text is split on whitespace,
    and tokens that start with "--" or that look like programming terms, file
    names or email addresses are ignored entirely. The words inside each
    remaining token are then extracted with `_PRUNE_SYMBOLS_RE`.

    :param line: The text to split.
    :return: `(word, start, end)` triples, with `start`/`end` being offsets
      into `line` (end exclusive).
    """
    ignored_token_patterns = (
        _LOOKS_LIKE_PROGRAMMING_TERM,
        _LOOKS_LIKE_FILENAME,
        _LOOKS_LIKE_EMAIL,
    )
    for segment, segment_start in _skip_quoted_parts(line):
        for token_match in _WORD_PARTS.finditer(segment):
            token = token_match.group(1)
            # A "--" prefix marks a CLI arg.
            if token.startswith("--"):
                continue
            if any(pattern.match(token) for pattern in ignored_token_patterns):
                continue
            # Offset of the token within the full line.
            token_start = segment_start + token_match.start(1)
            for word_match in _PRUNE_SYMBOLS_RE.finditer(token):
                yield (
                    word_match.group(1),
                    token_start + word_match.start(1),
                    token_start + word_match.end(1),
                )


class Spellchecker:
    """Interface shared by all spellchecker implementations.

    Subclasses must implement `provide_corrections_for` and `ignore_word`.
    """

    @staticmethod
    def do_nothing_spellchecker() -> "Spellchecker":
        """Return a new spellchecker that accepts every word."""
        noop_checker = EverythingIsCorrectSpellchecker()
        return noop_checker

    def iter_words(self, line: str) -> Iterable[Tuple[str, int, int]]:
        """Yield `(word, start, end)` for each spellcheckable word in `line`."""
        yield from _split_line_to_words(line)

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return suggested corrections for `word`; empty if there are none."""
        raise NotImplementedError()

    def ignore_word(self, word: str) -> None:
        """Make the checker accept `word` from now on."""
        raise NotImplementedError()


class EverythingIsCorrectSpellchecker(Spellchecker):
    """Spellchecker that never reports a spelling mistake."""

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return the shared empty result, whatever the word is."""
        return _NO_CORRECTIONS

    def ignore_word(self, word: str) -> None:
        """Accept the request without doing anything."""
        # It is hard to ignore words, when you never check them in the first place.
        return None


class HunspellSpellchecker(Spellchecker):
    """Spellchecker backed by hunspell using the en_US dictionary.

    On construction, the built-in exception words and any personal dictionaries
    that exist are added to the checker. Lookups are memoized per instance and
    the memo is dropped whenever a word is ignored, so newly accepted words are
    never reported from stale cache entries.
    """

    # Words with these prefixes are never reported.
    _NEVER_FLAGGED_PREFIXES = (
        "dpkg-",
        "dh-",
        "dh_",
        "debian-",
        "debconf-",
        "update-",
        "DEB_",
        "DPKG_",
    )
    # 'ing is deliberately forcing a word into another word-class
    _NEVER_FLAGGED_SUFFIXES = ("'ing", "-nss")

    def __init__(self) -> None:
        self._checker = HunSpell(_SPELL_CHECKER_DICT, _SPELL_CHECKER_AFF)
        for w in _builtin_exception_words():
            self._checker.add(w)
        self._load_personal_exclusions()
        # The cache lives on the instance (not on the class-level method) so it
        # can be invalidated by ignore_word() and does not outlive the instance.
        self._cached_lookup = functools.lru_cache(128)(self._lookup_uncached)

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return suggested corrections for `word`, or an empty result if it is fine.

        Words with one of the never-flagged prefixes or suffixes are accepted
        without consulting hunspell.
        """
        if word.startswith(self._NEVER_FLAGGED_PREFIXES):
            return _NO_CORRECTIONS
        if word.endswith(self._NEVER_FLAGGED_SUFFIXES):
            return _NO_CORRECTIONS
        return self._lookup(word)

    def _lookup(self, word: str) -> Iterable[str]:
        """Memoized variant of `_lookup_uncached`."""
        return self._cached_lookup(word)

    def _lookup_uncached(self, word: str) -> Iterable[str]:
        """Ask hunspell about `word`; return its suggestions if it is misspelled."""
        if self._checker.spell(word):
            return _NO_CORRECTIONS
        # Return an immutable copy: the value is cached and shared between callers.
        return tuple(self._checker.suggest(word))

    def ignore_word(self, word: str) -> None:
        """Add `word` to the checker so it is accepted from now on."""
        self._checker.add(word)
        # Previously cached suggestions for this word would otherwise keep
        # flagging it.
        self._cached_lookup.cache_clear()

    def _load_personal_exclusions(self) -> None:
        """Load every personal dictionary from `_PERSONAL_DICTS` that exists.

        A leading `${VAR}` in an entry is replaced by the value of that environment
        variable; entries whose variable is unset are skipped.
        """
        for filename in _PERSONAL_DICTS:
            if filename.startswith("${"):
                end_index = filename.index("}")
                varname = filename[2:end_index]
                value = os.environ.get(varname)
                if value is None:
                    continue
                filename = value + filename[end_index + 1 :]
            if os.path.isfile(filename):
                _info(f"Loading personal spelling dictionary from {filename}")
                self._checker.add_dic(filename)