summaryrefslogtreecommitdiffstats
path: root/src/debputy/lsp/spellchecking.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/debputy/lsp/spellchecking.py')
-rw-r--r--src/debputy/lsp/spellchecking.py304
1 files changed, 304 insertions, 0 deletions
diff --git a/src/debputy/lsp/spellchecking.py b/src/debputy/lsp/spellchecking.py
new file mode 100644
index 0000000..69dd119
--- /dev/null
+++ b/src/debputy/lsp/spellchecking.py
@@ -0,0 +1,304 @@
+import functools
+import itertools
+import os
+import re
+import subprocess
+from typing import Iterable, FrozenSet, Tuple, Optional, List
+
+from debian.debian_support import Release
+from lsprotocol.types import Diagnostic, Range, Position, DiagnosticSeverity
+
+from debputy.lsp.quickfixes import propose_correct_text_quick_fix
+from debputy.lsp.text_util import LintCapablePositionCodec
+from debputy.util import _info, _warn
+
# System-wide hunspell en_US dictionary and its matching affix file. Both
# must exist for the hunspell-backed checker to be enabled.
_SPELL_CHECKER_DICT = "/usr/share/hunspell/en_US.dic"
_SPELL_CHECKER_AFF = "/usr/share/hunspell/en_US.aff"

# A "word part" is a maximal run of non-whitespace characters.
_WORD_PARTS = re.compile(r"(\S+)")

# Extracts the checkable core of a word part: word characters, optionally
# followed by ONE hyphenated or apostrophe suffix ("well-known", "don't").
# Surrounding punctuation such as "(" or "," is left out of the match.
_PRUNE_SYMBOLS_RE = re.compile(r"(\w+(?:-\w+|'\w+)?)")

# Characters that delimit quoted sections (double quote and backtick).
_FIND_QUOTE_CHAR = re.compile(r'["`]')
# Heuristic for word parts that look like file names or paths (lower-case
# only). The alternatives are, in order:
#   1. up to three leading dots, then "/" and at least two path segments
#   2. a relative path with at least two segments (first may contain "-"/"_")
#   3. a relative path with at least three segments, underscores allowed
#   4. optional leading text, a dot and one to three lower-case letters
_LOOKS_LIKE_FILENAME = re.compile(
    r"""
      [.]{0,3}/[a-z0-9]+(/[a-z0-9]+)+/*
    | [a-z0-9-_]+(/[a-z0-9]+)+/*
    | [a-z0-9_]+(/[a-z0-9_]+){2,}/*
    | (?:\S+)?[.][a-z]{1,3}

""",
    re.VERBOSE,
)
# Heuristic for word parts that look like identifiers, options, versions or
# other programming terms. Each alternative is annotated inside the verbose
# pattern itself (those annotations are regex comments, not Python ones).
_LOOKS_LIKE_PROGRAMMING_TERM = re.compile(
    r"""
    (
    # Java identifier Camel Case
      [a-z][a-z0-9]*(?:[A-Z]{1,3}[a-z0-9]+)+
    # Type name Camel Case
    | [A-Z]{1,3}[a-z0-9]+(?:[A-Z]{1,3}[a-z0-9]+)+
    # Type name Camel Case with underscore (seen in Dh_Lib.pm among other
    | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)+
    # Perl module
    | [A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*(::[A-Z]{1,3}[a-z0-9]+(?:_[A-Z]{1,3}[a-z0-9]+)*)+
    # Probably an abbreviation
    | [A-Z]{3,}
    # Perl/Python identifiers or Jinja templates
    | [$%&@_]?[{]?[{]?[a-z][a-z0-9]*(?:_[a-z0-9]+)+(?:(?:->)?[\[{]\S+|}}?)?
    # SCREAMING_SNAKE_CASE (environment variables plus -DVAR=B or $FOO)
    | [-$%&*_]{0,2}[A-Z][A-Z0-9]*(_[A-Z0-9]+)+(?:=\S+)?
    | \#[A-Z][A-Z0-9]*(_[A-Z0-9]+)+\#
    # Subcommand names. Require at least two "-" to avoid skipping hypenated words
    | [a-z][a-z0-9]*(-[a-z0-9]+){2,}
    # Short args
    | -[a-z0-9]+
    # Things like 32bit
    | \d{2,}-?[a-z]+
    # Source package (we do not have a package without prefix/suffix because it covers 95% of all lowercase words)
    | src:[a-z0-9][-+.a-z0-9]+
    | [a-z0-9][-+.a-z0-9]+:(?:any|native)
    # Version
    | v\d+(?:[.]\S+)?
    # chmod symbolic mode or math
    | \S*=\S+
    )
""",
    re.VERBOSE,
)
# An email address wrapped in angle brackets, e.g. "<user@example.org>":
# exactly one "@" with no whitespace or nested brackets on either side.
_LOOKS_LIKE_EMAIL = re.compile(
    r"""
    <[^>@\s]+@[^>@\s]+>
""",
    re.VERBOSE,
)
# Shared empty result meaning "the word is fine / nothing to suggest".
_NO_CORRECTIONS = ()

# Word lists that are read from the directory containing this module.
_WORDLISTS = ["debian-wordlist.dic"]

# Name lists; each entry is additionally accepted in its possessive form.
_NAMELISTS = ["logins-and-people.dic"]

# Personal hunspell dictionaries. A leading "${VAR}" is replaced with the
# value of that environment variable when the dictionaries are loaded.
_PERSONAL_DICTS = [
    "${HOME}/.hunspell_default",
    "${HOME}/.hunspell_en_US",
]
+
+
# hunspell support is optional. It is enabled only when both the en_US
# dictionary and affix files exist AND the `hunspell` Python module can be
# imported; otherwise spell checking silently degrades to a no-op.
_HAS_HUNSPELL = False
if os.path.lexists(_SPELL_CHECKER_DICT) and os.path.lexists(_SPELL_CHECKER_AFF):
    try:
        from hunspell import HunSpell
    except ImportError:
        pass
    else:
        _HAS_HUNSPELL = True
+
+
def _read_wordlist(
    base_dir: str, wordlist_name: str, *, namelist: bool = False
) -> Iterable[str]:
    """Yield the words listed in a word list file, one word per line.

    :param base_dir: Directory containing the word list.
    :param wordlist_name: File name of the word list inside `base_dir`.
    :param namelist: When true, every word is treated as a name and its
      possessive form (`<name>'s`) is yielded after all the plain words.
    :return: The stripped, non-empty lines of the file, followed by the
      possessive forms when `namelist` is set.
    :raises OSError: If the word list cannot be opened.
    """
    path = os.path.join(base_dir, wordlist_name)
    # Decode explicitly as UTF-8 so the result does not depend on the locale
    # the process happens to run under (assumes the word lists are stored as
    # UTF-8 - TODO confirm for any list containing non-ASCII names).
    with open(path, encoding="utf-8") as fd:
        # Blank lines are dropped; otherwise "" (and "'s" for name lists)
        # would be handed to the spell checker as accepted words.
        words = [word for word in (raw.strip() for raw in fd) if word]
    yield from words
    if namelist:
        yield from (f"{name}'s" for name in words)
+
+
def _all_debian_archs() -> Iterable[str]:
    """Return the entries printed by `dpkg-architecture -L`.

    The command output is decoded as UTF-8 and split into lines; each line
    is stripped of surrounding whitespace. If the command is missing or
    exits with an error, a warning is emitted and an empty tuple is
    returned instead.
    """
    command = ["dpkg-architecture", "-L"]
    try:
        raw_output = subprocess.check_output(command)
    except (FileNotFoundError, subprocess.CalledProcessError) as e:
        _warn(f"dpkg-architecture -L failed: {e}")
        return ()
    listing = raw_output.decode("utf-8")
    return map(str.strip, listing.splitlines())
+
+
@functools.lru_cache
def _builtin_exception_words() -> FrozenSet[str]:
    """Collect the built-in words that must never be flagged as misspelled.

    The set is the union of:

     * the word lists in `_WORDLISTS`,
     * the name lists in `_NAMELISTS` (including possessive forms),
     * the entries of `Release.releases`, and
     * the output of `_all_debian_archs()`.

    The lists are read from the directory containing this module. The
    result is cached, so the files and the subprocess are only consulted
    on the first call.
    """
    module_dir = os.path.dirname(__file__)
    releases = Release.releases
    # Queried up front, before any word list file is read.
    architectures = _all_debian_archs()

    words = set()
    for wordlist in _WORDLISTS:
        words.update(_read_wordlist(module_dir, wordlist))
    for names in _NAMELISTS:
        words.update(_read_wordlist(module_dir, names, namelist=True))
    words.update(releases)
    words.update(architectures)
    return frozenset(words)
+
+
# Process-wide spell checker instance. It stays `None` until
# `default_spellchecker()` creates it on first use, or until
# `disable_spellchecking()` replaces it with the do-nothing checker.
_DEFAULT_SPELL_CHECKER: Optional["Spellchecker"] = None
+
+
def spellcheck_line(
    lines: List[str],
    position_codec: LintCapablePositionCodec,
    line_no: int,
    line: str,
) -> Iterable[Diagnostic]:
    """Spell check one line and yield a diagnostic per misspelled word.

    :param lines: All lines of the document; only passed on to
      `position_codec.range_to_client_units`.
    :param position_codec: Codec used to translate the word range into
      client units.
    :param line_no: Line number to use in the reported positions.
    :param line: The text to spell check.
    :return: One hint-severity `Diagnostic` for every word that the default
      spell checker has corrections for. Each correction is attached to the
      diagnostic as a quick fix.
    """
    checker = default_spellchecker()
    for word, start, end in checker.iter_words(line):
        suggestions = checker.provide_corrections_for(word)
        if suggestions:
            server_range = Range(
                Position(line_no, start),
                Position(line_no, end),
            )
            client_range = position_codec.range_to_client_units(lines, server_range)
            quick_fixes = [propose_correct_text_quick_fix(s) for s in suggestions]
            yield Diagnostic(
                client_range,
                f'Spelling "{word}"',
                severity=DiagnosticSeverity.Hint,
                source="debputy",
                data=quick_fixes,
            )
+
+
def default_spellchecker() -> "Spellchecker":
    """Return the process-wide spell checker, creating it on first use.

    A `HunspellSpellchecker` is created when hunspell support is available;
    otherwise the shared do-nothing spell checker is used. The chosen
    instance is remembered in `_DEFAULT_SPELL_CHECKER` for later calls.
    """
    global _DEFAULT_SPELL_CHECKER
    if _DEFAULT_SPELL_CHECKER is not None:
        return _DEFAULT_SPELL_CHECKER
    instance = HunspellSpellchecker() if _HAS_HUNSPELL else _do_nothing_spellchecker()
    _DEFAULT_SPELL_CHECKER = instance
    return instance
+
+
@functools.lru_cache()
def _do_nothing_spellchecker() -> "Spellchecker":
    """Return the shared spell checker that accepts every word.

    The result is cached, so every call returns the same instance.
    """
    noop_checker = EverythingIsCorrectSpellchecker()
    return noop_checker
+
+
def disable_spellchecking() -> None:
    """Make the do-nothing spell checker the process-wide default.

    After this call `default_spellchecker()` returns a checker that never
    reports any corrections.
    """
    global _DEFAULT_SPELL_CHECKER
    noop_checker = _do_nothing_spellchecker()
    _DEFAULT_SPELL_CHECKER = noop_checker
+
+
def _skip_quoted_parts(line: str) -> Iterable[Tuple[str, int]]:
    """Yield the parts of `line` that are outside quoted sections.

    A quoted section starts at a character matched by `_FIND_QUOTE_CHAR`
    and ends at the next occurrence of that same character. Such sections,
    including their quote characters, are left out.

    :param line: The line to split.
    :return: `(text, offset)` pairs where `offset` is the index of `text`
      within `line`. Parts between quoted sections are only yielded when
      they contain something other than whitespace. The remainder of the
      line after the last quoted section is always yielded, even if blank.
      When a quote is never closed, everything from the current position
      onward (the quote character included) is yielded as the final part.
    """
    current_pos = 0
    while True:
        m = _FIND_QUOTE_CHAR.search(line, current_pos)
        if m is None:
            # No (more) quotes; the rest of the line is plain text.
            yield line[current_pos:], current_pos
            return
        starting_marker_pos = m.start()
        end_marker_pos = line.find(m.group(), starting_marker_pos + 1)
        if end_marker_pos < 0:
            # Unterminated quote; treat the remainder as plain text.
            yield line[current_pos:], current_pos
            return

        part = line[current_pos:starting_marker_pos]
        # `"".isspace()` is False, so the emptiness check must be explicit
        # to avoid yielding empty parts (for example when the line starts
        # with a quote or two quoted sections are adjacent).
        if part and not part.isspace():
            yield part, current_pos
        current_pos = end_marker_pos + 1
+
+
def _split_line_to_words(line: str) -> Iterable[Tuple[str, int, int]]:
    """Yield the spell-checkable words of `line` with their positions.

    Quoted sections are skipped first. The remaining text is split on
    whitespace, and a whitespace-delimited token is dropped entirely when it
    starts with `--` (command line option) or when its beginning matches
    the programming term, file name or email heuristics. What is left is
    reduced to its word-like pieces via `_PRUNE_SYMBOLS_RE`.

    :param line: The line to split.
    :return: `(word, start, end)` triples where `start` and `end` are the
      offsets of `word` within `line` (`end` is exclusive).
    """
    skip_patterns = (
        _LOOKS_LIKE_PROGRAMMING_TERM,
        _LOOKS_LIKE_FILENAME,
        _LOOKS_LIKE_EMAIL,
    )
    for segment, segment_start in _skip_quoted_parts(line):
        for token_match in _WORD_PARTS.finditer(segment):
            token = token_match.group(1)
            if token.startswith("--") or any(p.match(token) for p in skip_patterns):
                continue
            # Offset of the token within the full line.
            base = segment_start + token_match.start(1)
            for piece in _PRUNE_SYMBOLS_RE.finditer(token):
                yield piece.group(1), base + piece.start(1), base + piece.end(1)
+
+
class Spellchecker:
    """Base class for spell checkers.

    Subclasses must implement `provide_corrections_for` and `ignore_word`;
    the splitting of a line into words is shared.
    """

    @staticmethod
    def do_nothing_spellchecker() -> "Spellchecker":
        """Create a spell checker that considers every word correct."""
        noop_checker = EverythingIsCorrectSpellchecker()
        return noop_checker

    def iter_words(self, line: str) -> Iterable[Tuple[str, int, int]]:
        """Yield `(word, start, end)` for each checkable word in `line`."""
        for word_info in _split_line_to_words(line):
            yield word_info

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return the suggested corrections for `word`.

        An empty result means there is nothing to report for the word.

        :raises NotImplementedError: Always; subclasses must override this.
        """
        raise NotImplementedError

    def ignore_word(self, word: str) -> None:
        """Stop reporting `word` as misspelled.

        :raises NotImplementedError: Always; subclasses must override this.
        """
        raise NotImplementedError
+
+
class EverythingIsCorrectSpellchecker(Spellchecker):
    """Spell checker that never reports anything."""

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return the shared empty result; no word ever has corrections."""
        return _NO_CORRECTIONS

    def ignore_word(self, word: str) -> None:
        """Accept the request without doing anything."""
        # It is hard to ignore words when you never check them in the first
        # place.
        return None
+
+
class HunspellSpellchecker(Spellchecker):
    """Spell checker backed by hunspell and its en_US dictionary.

    On creation, the built-in exception words and any personal dictionaries
    listed in `_PERSONAL_DICTS` are added to the hunspell instance.
    """

    def __init__(self) -> None:
        self._checker = HunSpell(_SPELL_CHECKER_DICT, _SPELL_CHECKER_AFF)
        # The lookup cache is created per instance (rather than decorating
        # the method) so that it does not keep instances alive, is not shared
        # between instances, and can be invalidated by `ignore_word`.
        self._cached_lookup = functools.lru_cache(maxsize=128)(self._uncached_lookup)
        for w in _builtin_exception_words():
            self._checker.add(w)
        self._load_personal_exclusions()

    def provide_corrections_for(self, word: str) -> Iterable[str]:
        """Return suggested corrections for `word`.

        :param word: The word to check.
        :return: An empty result when the word is accepted, or when it is
          exempt because of its prefix or suffix; otherwise the suggestions
          provided by hunspell.
        """
        if word.startswith(
            (
                "dpkg-",
                "dh-",
                "dh_",
                "debian-",
                "debconf-",
                "update-",
                "DEB_",
                "DPKG_",
            )
        ):
            return _NO_CORRECTIONS
        # 'ing is deliberately forcing a word into another word-class
        if word.endswith(("'ing", "-nss")):
            return _NO_CORRECTIONS
        return self._lookup(word)

    def _lookup(self, word: str) -> Iterable[str]:
        """Look `word` up via the per-instance cache."""
        return self._cached_lookup(word)

    def _uncached_lookup(self, word: str) -> Iterable[str]:
        """Ask hunspell about `word` without consulting the cache."""
        if self._checker.spell(word):
            return _NO_CORRECTIONS
        # Stored as a tuple so callers cannot mutate the cached value.
        return tuple(self._checker.suggest(word))

    def ignore_word(self, word: str) -> None:
        """Add `word` to the hunspell instance so it is accepted from now on."""
        self._checker.add(word)
        # Results cached before the word was added would otherwise keep
        # reporting it as misspelled.
        self._cached_lookup.cache_clear()

    def _load_personal_exclusions(self) -> None:
        """Load the personal dictionaries from `_PERSONAL_DICTS` that exist.

        A leading `${VAR}` in an entry is replaced with the value of the
        environment variable `VAR`; the entry is skipped when the variable
        is not set. Entries that are not existing files are skipped too.
        """
        for filename in _PERSONAL_DICTS:
            if filename.startswith("${"):
                end_index = filename.index("}")
                varname = filename[2:end_index]
                value = os.environ.get(varname)
                if value is None:
                    continue
                filename = value + filename[end_index + 1 :]
            if os.path.isfile(filename):
                _info(f"Loading personal spelling dictionary from {filename}")
                self._checker.add_dic(filename)