summaryrefslogtreecommitdiffstats
path: root/lib/Lintian/Spelling.pm
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Lintian/Spelling.pm')
-rw-r--r--lib/Lintian/Spelling.pm295
1 files changed, 295 insertions, 0 deletions
diff --git a/lib/Lintian/Spelling.pm b/lib/Lintian/Spelling.pm
new file mode 100644
index 0000000..1a8d7fc
--- /dev/null
+++ b/lib/Lintian/Spelling.pm
@@ -0,0 +1,295 @@
+# -*- perl -*-
+# Lintian::Spelling -- Lintian spelling checks shared between multiple scripts
+
+# Copyright (C) 2009 Russ Allbery
+# Copyright (C) 2004 Marc Brockschmidt
+# Copyright (C) 1998 Richard Braakman
+# Copyright (C) 2023 Axel Beckert
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 2 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+# more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program. If not, see <http://www.gnu.org/licenses/>.
+
+package Lintian::Spelling;
+
+use v5.20;
+use warnings;
+use utf8;
+
+use Exporter qw(import);
+
+our @EXPORT_OK = qw(
+ check_spelling
+ check_spelling_picky
+);
+
+use Carp qw(croak);
+use Const::Fast;
+use Unicode::UTF8 qw(encode_utf8);
+
+const my $SPACE => q{ };
+const my $DOUBLE_QUOTE => q{"};
+
+=head1 NAME
+
+Lintian::Spelling -- Lintian spell checks shared between multiple scripts
+
+=head1 SYNOPSIS
+
+ use Lintian::Spelling qw(check_spelling);
+
+=head1 DESCRIPTION
+
+This module provides functions to do some Lintian checks that need to be
+done in multiple places. There are certain low-level checks, such as
+validating a maintainer name and e-mail address or checking spelling,
+which apply in multiple situations and should be done in multiple checks
+scripts or in checks scripts and the Lintian front-end.
+
+The functions provided by this module issue tags directly, usually either
+taking the tag name to issue as an argument or dynamically constructing
+the tag name based on function parameters. The caller is responsible for
+ensuring that all tags are declared in the relevant *.desc file with
+proper descriptions and other metadata. The possible tags issued by each
+function are described in the documentation for that function.
+
+=head1 FUNCTIONS
+
+=over 4
+
+=item check_spelling(TEXT,[ EXCEPTIONS,] CODEREF)
+
+Performs a spelling check of TEXT. Call CODEREF once for each unique
+misspelling with the following arguments:
+
+=over 4
+
+=item The misspelled word/phrase
+
+=item The correct word/phrase
+
+=back
+
+If EXCEPTIONS is given, it will be used as an array ref of exceptions.
+Any lowercase word appearing as a key of that array will never be
+considered a spelling mistake (exception being if it is a part of a
+multiword misspelling).
+
+Returns the number of spelling mistakes found in TEXT.
+
+=cut
+
+my (%CORRECTIONS, @CORRECTIONS_MULTIWORD);
+
+sub check_spelling {
+ my ($data, $text, $acceptable, $code_ref, $duplicate_check) = @_;
+
+ croak encode_utf8('No spelling data')
+ unless defined $data;
+
+ return 0
+ unless $text;
+
+ if ( !defined $code_ref
+ && defined $acceptable
+ && ref($acceptable) eq 'CODE') {
+ $code_ref = $acceptable;
+ $acceptable = [];
+ }
+
+ $acceptable //= [];
+ $duplicate_check //= 1;
+
+ my %exceptions = map { $_ => 1 } @{$acceptable};
+
+ my (%seen, %duplicates, $last_word, $quoted);
+ my $counter = 0;
+ my $text_orig = $text;
+
+ if (!%CORRECTIONS) {
+ my $corrections_multiword
+ = $data->load('spelling/corrections-multiword', '\|\|');
+ my $corrections = $data->load('spelling/corrections', '\|\|');
+ for my $misspelled ($corrections->all) {
+ $CORRECTIONS{$misspelled} = $corrections->value($misspelled);
+ }
+ for my $misspelled_regex ($corrections_multiword->all) {
+ my $correct = $corrections_multiword->value($misspelled_regex);
+ push(@CORRECTIONS_MULTIWORD,
+ [qr/\b($misspelled_regex)\b/, $correct]);
+ }
+ }
+
+ $text =~ tr/[]//d;
+ # Strip () except for "(s)" suffixes.
+ $text =~ s/(\((?!s\))|(?<!\(s)\))//gi;
+ $text =~ s/(\w-)\s*\n\s*/$1/;
+ $text =~ tr/\r\n \t/ /s;
+ $text =~ s/\s++/ /g;
+
+ # trim both ends
+ $text =~ s/^\s+|\s+$//g;
+
+ for my $word (split($SPACE, $text)) {
+ my $ends_with_punct = 0;
+ my $q = $word =~ tr/"/"/;
+ # Change quoting on "foo or foo" but not "foo".
+ if ($q & 1) {
+ $quoted = not $quoted;
+ }
+ $ends_with_punct = 1 if $word =~ s/[.,;:?!]+$//;
+
+ if ($duplicate_check and defined($last_word) and $last_word eq $word) {
+ # Avoid flagging words inside quoted text.
+ $code_ref->("$word $word (duplicate word)", $word)
+ if not $quoted
+ and not $duplicates{$word}++
+ and not $ends_with_punct
+ and $text_orig !~ /\b$word\s*\($word\b/
+ and $text_orig !~ /\b$word\)\s*$word\b/;
+ }
+
+ if ($word =~ m/^[A-Za-z]+$/ and not $ends_with_punct) {
+ $last_word = $word;
+ } else {
+ $last_word = undef;
+ }
+
+ next
+ if $word =~ /^[A-Z]{1,5}\z/;
+
+ # Some exceptions are based on case (e.g. "teH").
+ next
+ if exists $exceptions{$word};
+
+ my $lcword = lc $word;
+ if (exists $CORRECTIONS{$lcword}
+ && !exists $exceptions{$lcword}) {
+
+ $counter++;
+ my $correction = $CORRECTIONS{$lcword};
+
+ if ($word =~ /^[A-Z]+$/) {
+ $correction = uc $correction;
+ } elsif ($word =~ /^[A-Z]/) {
+ $correction = ucfirst $correction;
+ }
+
+ next
+ if $seen{$lcword}++;
+
+ $code_ref->($word, $correction);
+ }
+ }
+
+ # Special case for correcting multi-word strings.
+ for my $cm (@CORRECTIONS_MULTIWORD) {
+ my ($oregex, $correction) = @{$cm};
+ if ($text =~ $oregex) {
+ my $word = $1;
+ if ($word =~ /^[A-Z]+$/) {
+ $correction = uc $correction;
+ } elsif ($word =~ /^[A-Z]/) {
+ $correction = ucfirst $correction;
+ }
+ $counter++;
+ next if $seen{lc $word}++;
+ $code_ref->(
+ $DOUBLE_QUOTE . $word . $DOUBLE_QUOTE,
+ $DOUBLE_QUOTE . $correction . $DOUBLE_QUOTE
+ );
+ }
+ }
+
+ return $counter;
+}
+
+=item check_spelling_picky(TEXT, CODEREF)
+
+Performs a spelling check of TEXT. Call CODEREF once for each unique
+misspelling with the following arguments:
+
+=over 4
+
+=item The misspelled word/phrase
+
+=item The correct word/phrase
+
+=back
+
+This method performs some pickier corrections - such as checking for common
+capitalization mistakes - which would are not included in check_spelling as
+they are not appropriate for some files, such as changelogs.
+
+Returns the number of spelling mistakes found in TEXT.
+
+=cut
+
+sub check_spelling_picky {
+ my ($data, $text, $code_ref) = @_;
+
+ croak encode_utf8('No spelling data')
+ unless defined $data;
+
+ my %seen;
+ my $counter = 0;
+ my $corrections_case= $data->load('spelling/corrections-case', '\|\|');
+
+ # Check this first in case it's contained in square brackets and
+ # removed below.
+ if ($text =~ /meta\s+package/) {
+ $counter++;
+ $code_ref->('meta package', 'metapackage');
+ }
+
+ # Exclude text enclosed in square brackets as it could be a package list
+ # or similar which may legitimately contain lower-cased versions of
+ # the words.
+ $text =~ s/\[.+?\]//sg;
+ $text =~ tr/\r\n \t/ /s;
+ $text =~ s/\s++/ /g;
+
+ # trim both ends
+ $text =~ s/^\s+|\s+$//g;
+
+ for my $word (split(/\s+/, $text)) {
+ $word =~ s/^\(|[).,?!:;]+$//g;
+ if ($corrections_case->recognizes($word)) {
+ $counter++;
+ next if $seen{$word}++;
+ $code_ref->($word, $corrections_case->value($word));
+ }
+ }
+
+ return $counter;
+}
+
+=back
+
+=head1 AUTHOR
+
+Originally written by Russ Allbery <rra@debian.org> for Lintian. Based on
+code from checks scripts by Marc Brockschmidt and Richard Braakman.
+
+=head1 SEE ALSO
+
+lintian(1)
+
+=cut
+
+1;
+
+# Local Variables:
+# indent-tabs-mode: nil
+# cperl-indent-level: 4
+# End:
+# vim: syntax=perl sw=4 sts=4 sr et