diff options
Diffstat (limited to 'lib/Lintian/Spelling.pm')
-rw-r--r-- | lib/Lintian/Spelling.pm | 295 |
1 files changed, 295 insertions, 0 deletions
diff --git a/lib/Lintian/Spelling.pm b/lib/Lintian/Spelling.pm new file mode 100644 index 0000000..1a8d7fc --- /dev/null +++ b/lib/Lintian/Spelling.pm @@ -0,0 +1,295 @@ +# -*- perl -*- +# Lintian::Spelling -- Lintian spelling checks shared between multiple scripts + +# Copyright (C) 2009 Russ Allbery +# Copyright (C) 2004 Marc Brockschmidt +# Copyright (C) 1998 Richard Braakman +# Copyright (C) 2023 Axel Beckert +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the Free +# Software Foundation; either version 2 of the License, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. +# +# You should have received a copy of the GNU General Public License along with +# this program. If not, see <http://www.gnu.org/licenses/>. + +package Lintian::Spelling; + +use v5.20; +use warnings; +use utf8; + +use Exporter qw(import); + +our @EXPORT_OK = qw( + check_spelling + check_spelling_picky +); + +use Carp qw(croak); +use Const::Fast; +use Unicode::UTF8 qw(encode_utf8); + +const my $SPACE => q{ }; +const my $DOUBLE_QUOTE => q{"}; + +=head1 NAME + +Lintian::Spelling -- Lintian spell checks shared between multiple scripts + +=head1 SYNOPSIS + + use Lintian::Spelling qw(check_spelling); + +=head1 DESCRIPTION + +This module provides functions to do some Lintian checks that need to be +done in multiple places. There are certain low-level checks, such as +validating a maintainer name and e-mail address or checking spelling, +which apply in multiple situations and should be done in multiple checks +scripts or in checks scripts and the Lintian front-end. + +The functions provided by this module issue tags directly, usually either +taking the tag name to issue as an argument or dynamically constructing +the tag name based on function parameters. The caller is responsible for +ensuring that all tags are declared in the relevant *.desc file with +proper descriptions and other metadata. The possible tags issued by each +function are described in the documentation for that function. + +=head1 FUNCTIONS + +=over 4 + +=item check_spelling(TEXT,[ EXCEPTIONS,] CODEREF) + +Performs a spelling check of TEXT. Call CODEREF once for each unique +misspelling with the following arguments: + +=over 4 + +=item The misspelled word/phrase + +=item The correct word/phrase + +=back + +If EXCEPTIONS is given, it will be used as an array ref of exceptions. +Any lowercase word appearing as a key of that array will never be +considered a spelling mistake (exception being if it is a part of a +multiword misspelling). + +Returns the number of spelling mistakes found in TEXT. + +=cut + +my (%CORRECTIONS, @CORRECTIONS_MULTIWORD); + +sub check_spelling { + my ($data, $text, $acceptable, $code_ref, $duplicate_check) = @_; + + croak encode_utf8('No spelling data') + unless defined $data; + + return 0 + unless $text; + + if ( !defined $code_ref + && defined $acceptable + && ref($acceptable) eq 'CODE') { + $code_ref = $acceptable; + $acceptable = []; + } + + $acceptable //= []; + $duplicate_check //= 1; + + my %exceptions = map { $_ => 1 } @{$acceptable}; + + my (%seen, %duplicates, $last_word, $quoted); + my $counter = 0; + my $text_orig = $text; + + if (!%CORRECTIONS) { + my $corrections_multiword + = $data->load('spelling/corrections-multiword', '\|\|'); + my $corrections = $data->load('spelling/corrections', '\|\|'); + for my $misspelled ($corrections->all) { + $CORRECTIONS{$misspelled} = $corrections->value($misspelled); + } + for my $misspelled_regex ($corrections_multiword->all) { + my $correct = $corrections_multiword->value($misspelled_regex); + push(@CORRECTIONS_MULTIWORD, + [qr/\b($misspelled_regex)\b/, $correct]); + } + } + + $text =~ tr/[]//d; + # Strip () except for "(s)" suffixes. + $text =~ s/(\((?!s\))|(?<!\(s)\))//gi; + $text =~ s/(\w-)\s*\n\s*/$1/; + $text =~ tr/\r\n \t/ /s; + $text =~ s/\s++/ /g; + + # trim both ends + $text =~ s/^\s+|\s+$//g; + + for my $word (split($SPACE, $text)) { + my $ends_with_punct = 0; + my $q = $word =~ tr/"/"/; + # Change quoting on "foo or foo" but not "foo". + if ($q & 1) { + $quoted = not $quoted; + } + $ends_with_punct = 1 if $word =~ s/[.,;:?!]+$//; + + if ($duplicate_check and defined($last_word) and $last_word eq $word) { + # Avoid flagging words inside quoted text. + $code_ref->("$word $word (duplicate word)", $word) + if not $quoted + and not $duplicates{$word}++ + and not $ends_with_punct + and $text_orig !~ /\b$word\s*\($word\b/ + and $text_orig !~ /\b$word\)\s*$word\b/; + } + + if ($word =~ m/^[A-Za-z]+$/ and not $ends_with_punct) { + $last_word = $word; + } else { + $last_word = undef; + } + + next + if $word =~ /^[A-Z]{1,5}\z/; + + # Some exceptions are based on case (e.g. "teH"). + next + if exists $exceptions{$word}; + + my $lcword = lc $word; + if (exists $CORRECTIONS{$lcword} + && !exists $exceptions{$lcword}) { + + $counter++; + my $correction = $CORRECTIONS{$lcword}; + + if ($word =~ /^[A-Z]+$/) { + $correction = uc $correction; + } elsif ($word =~ /^[A-Z]/) { + $correction = ucfirst $correction; + } + + next + if $seen{$lcword}++; + + $code_ref->($word, $correction); + } + } + + # Special case for correcting multi-word strings. + for my $cm (@CORRECTIONS_MULTIWORD) { + my ($oregex, $correction) = @{$cm}; + if ($text =~ $oregex) { + my $word = $1; + if ($word =~ /^[A-Z]+$/) { + $correction = uc $correction; + } elsif ($word =~ /^[A-Z]/) { + $correction = ucfirst $correction; + } + $counter++; + next if $seen{lc $word}++; + $code_ref->( + $DOUBLE_QUOTE . $word . $DOUBLE_QUOTE, + $DOUBLE_QUOTE . $correction . $DOUBLE_QUOTE + ); + } + } + + return $counter; +} + +=item check_spelling_picky(TEXT, CODEREF) + +Performs a spelling check of TEXT. Call CODEREF once for each unique +misspelling with the following arguments: + +=over 4 + +=item The misspelled word/phrase + +=item The correct word/phrase + +=back + +This method performs some pickier corrections - such as checking for common +capitalization mistakes - which would are not included in check_spelling as +they are not appropriate for some files, such as changelogs. + +Returns the number of spelling mistakes found in TEXT. + +=cut + +sub check_spelling_picky { + my ($data, $text, $code_ref) = @_; + + croak encode_utf8('No spelling data') + unless defined $data; + + my %seen; + my $counter = 0; + my $corrections_case= $data->load('spelling/corrections-case', '\|\|'); + + # Check this first in case it's contained in square brackets and + # removed below. + if ($text =~ /meta\s+package/) { + $counter++; + $code_ref->('meta package', 'metapackage'); + } + + # Exclude text enclosed in square brackets as it could be a package list + # or similar which may legitimately contain lower-cased versions of + # the words. + $text =~ s/\[.+?\]//sg; + $text =~ tr/\r\n \t/ /s; + $text =~ s/\s++/ /g; + + # trim both ends + $text =~ s/^\s+|\s+$//g; + + for my $word (split(/\s+/, $text)) { + $word =~ s/^\(|[).,?!:;]+$//g; + if ($corrections_case->recognizes($word)) { + $counter++; + next if $seen{$word}++; + $code_ref->($word, $corrections_case->value($word)); + } + } + + return $counter; +} + +=back + +=head1 AUTHOR + +Originally written by Russ Allbery <rra@debian.org> for Lintian. Based on +code from checks scripts by Marc Brockschmidt and Richard Braakman. + +=head1 SEE ALSO + +lintian(1) + +=cut + +1; + +# Local Variables: +# indent-tabs-mode: nil +# cperl-indent-level: 4 +# End: +# vim: syntax=perl sw=4 sts=4 sr et |