diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-14 13:42:30 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-14 13:42:30 +0000 |
commit | 75808db17caf8b960b351e3408e74142f4c85aac (patch) | |
tree | 7989e9c09a4240248bf4658a22208a0a52d991c4 /lib/Lintian/Check/Cruft.pm | |
parent | Initial commit. (diff) | |
download | lintian-75808db17caf8b960b351e3408e74142f4c85aac.tar.xz lintian-75808db17caf8b960b351e3408e74142f4c85aac.zip |
Adding upstream version 2.117.0.upstream/2.117.0upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'lib/Lintian/Check/Cruft.pm')
-rw-r--r-- | lib/Lintian/Check/Cruft.pm | 836 |
1 files changed, 836 insertions, 0 deletions
diff --git a/lib/Lintian/Check/Cruft.pm b/lib/Lintian/Check/Cruft.pm new file mode 100644 index 0000000..1a402c6 --- /dev/null +++ b/lib/Lintian/Check/Cruft.pm @@ -0,0 +1,836 @@ +# cruft -- lintian check script -*- perl -*- +# +# based on debhelper check, +# Copyright (C) 1999 Joey Hess +# Copyright (C) 2000 Sean 'Shaleh' Perry +# Copyright (C) 2002 Josip Rodin +# Copyright (C) 2007 Russ Allbery +# Copyright (C) 2013-2018 Bastien ROUCARIES +# Copyright (C) 2017-2020 Chris Lamb <lamby@debian.org> +# Copyright (C) 2020-2021 Felix Lechner +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, you can find it on the World Wide +# Web at https://www.gnu.org/copyleft/gpl.html, or write to the Free +# Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, +# MA 02110-1301, USA. + +package Lintian::Check::Cruft; + +use v5.20; +use warnings; +use utf8; + +use Const::Fast; +use List::SomeUtils qw(any none); + +const my $EMPTY => q{}; +const my $ASTERISK => q{*}; +const my $DOT => q{.}; + +const my $ITEM_NOT_FOUND => -1; + +use Moo; +use namespace::clean; + +with 'Lintian::Check'; + +# Half of the size used in the "sliding window" for detecting bad +# licenses like GFDL with invariant sections. +# NB: Keep in sync cruft-gfdl-fp-sliding-win/pre_build. +# not less than 8192 for source missing +use constant BLOCKSIZE => 16_384; +use Lintian::SlidingWindow; + +my %NVIDIA_LICENSE = ( + keywords => [qw{license intellectual retain property}], + sentences =>[ +'retain all intellectual property and proprietary rights in and to this software and related documentation' + ] +); + +my %NON_FREE_LICENSES = ( +# first field is tag +# second field is a list of keywords in lower case +# third field are lower case sentences to match the license. Notes that space are normalized before and formatting removed +# fourth field is a regex to use to match the license, use lower case and [ ] for space. +# 5th field is a function to call if the field 2th to 5th match. +# (see dispatch table %LICENSE_CHECK_DISPATCH_TABLE + + # json license + 'license-problem-json-evil' => { + keywords => [qw{software evil good}], + sentences => ['software shall be used for good'], + regex => +qr{software [ ] shall [ ] be [ ] used [ ] for [ ] good [ ]? ,? [ ]? not [ ] evil}msx + }, + # non free RFC old version + 'license-problem-non-free-RFC' => { + keywords => [qw{document purpose translate language}], + sentences => ['this document itself may not be modified in any way'], + regex => +qr{this [ ] document [ ] itself [ ] may [ ] not [ ] be [ ] modified [ ] in [ ] any [ ] way [ ]?, + [ ]? such [ ] as [ ] by [ ] removing [ ] the [ ] copyright [ ] notice [ ] or [ ] references + [ ] to [ ] .{0,256} [ ]? except [ ] as [ ] needed [ ] for [ ] the [ ] purpose [ ] of [ ] developing + [ ] .{0,128} [ ]? in [ ] which [ ] case [ ] the [ ] procedures [ ] for [ ] copyrights [ ] defined + [ ] in [ ] the [ ] .{0,128} [ ]? process [ ] must [ ] be [ ] followed[ ]?,[ ]? + or [ ] as [ ] required [ ] to [ ] translate [ ] it [ ] into [ ] languages [ ]}msx, + callsub => 'rfc_whitelist_filename' + }, + 'license-problem-non-free-RFC-BCP78' => { + keywords => [qw{license document bcp restriction}], + sentences => ['bcp 78'], + regex => +qr{this [ ] document [ ] is [ ] subject [ ] to [ ] (?:the [ ] rights [ ]?, [ ] licenses [ ] and [ ]restrictions [ ] contained [ ] in [ ])? bcp [ ] 78}msx, + callsub => 'rfc_whitelist_filename' + }, +# check GFDL block - The ".{0,1024}"-part in the regex +# will contain the "no invariants etc." part if +# it is a good use of the license. We include it +# here to ensure that we do not emit a false positive +# if the "redeeming" part is in the next block +# keyword document is here in order to benefit for other license keyword and a shortcut for documentation + 'license-problem-gfdl-invariants' => { + keywords => [qw{license document gnu copy documentation}], + sentences => ['gnu free documentation license'], + regex => +qr{(?'rawcontextbefore'(?:(?:(?!a [ ] copy [ ] of [ ] the [ ] license [ ] is).){1024}| +\A(?:(?!a [ ] copy [ ] of [ ] the [ ] license [ ] is).){0,1024}| +(?:[ ] copy [ ] of [ ] the [ ] license [ ] is.{0,1024}?))) gnu [ ] free [ ] +documentation [ ] license (?'rawgfdlsections'(?:(?!gnu [ ] free [ ] documentation +[ ] license).){0,1024}?) (?:a [ ] copy [ ] of [ ] the [ ] license [ ] is| +this [ ] document [ ] is [ ] distributed)}msx, + callsub => 'check_gfdl_license_problem' + }, + # php license + 'license-problem-php-license' => { + keywords => [qw{www.php.net group\@php.net phpfoo conjunction php}], + sentences => ['this product includes php'], + regex => qr{php [ ] license [ ]?[,;][ ]? version [ ] 3(?:\.\d+)?}msx, + callsub => 'php_source_whitelist' + }, + 'license-problem-bad-php-license' => { + keywords => [qw{www.php.net add-on conjunction}], + sentences => ['this product includes php'], + regex => qr{php [ ] license [ ]?[,;][ ]? version [ ] 2(?:\.\d+)?}msx, + callsub => 'php_source_whitelist' + }, + # cc by nc sa note that " is replaced by [ ] + 'license-problem-cc-by-nc-sa' => { + keywords => [qw{license by-nc-sa creativecommons.org}], + sentences => [ + '://creativecommons.org/licenses/by-nc-sa', + 'under attribution-noncommercial' + ], + regex => +qr{(?:license [ ] rdf:[^=:]+=[ ]* (?:ht|f)tps?://(?:[^/.]\.)??creativecommons\.org/licenses/by-nc-sa/\d+(?:\.\d+)?(?:/[[:alpha:]]+)?/? [ ]* >|available [ ] under [ ] attribution-noncommercial)}msx + }, + # not really a license but warn it: visual c++ generated file + 'source-contains-autogenerated-visual-c++-file' => { + keywords => [qw{microsoft visual generated}], + sentences => ['microsoft visual c++ generated'], + regex => +qr{microsoft [ ] visual [ ] c[+][+] [ ] generated (?![ ] by [ ] freeze\.py)}msx + }, + # not really a license but warn about it: gperf generated file + 'source-contains-autogenerated-gperf-data' => { + keywords => [qw{code produced gperf version}], + sentences => ['code produced by gperf version'], + regex => + qr{code [ ] produced [ ] by [ ] gperf [ ] version [ ] \d+\.\d+}msx + }, + # warn about copy of ieee-data + 'source-contains-data-from-ieee-data-oui-db' => { + keywords => [qw{struck scitex racore}], + sentences => ['dr. b. struck'], + regex => qr{dr. [ ] b. [ ] struck}msx + }, + # warn about unicode license for utf for convert utf + 'license-problem-convert-utf-code' => { + keywords => [qw{fall-through bytestowrite utf-8}], + sentences => ['the fall-through switches in utf-8 reading'], + regex => +qr{the [ ] fall-through [ ] switches [ ] in [ ] utf-8 [ ] reading [ ] code [ ] save}msx + } +); + +# get usual data about admissible/not admissible GFDL invariant part of license +has GFDL_FRAGMENTS => ( + is => 'rw', + lazy => 1, + default => sub { + my ($self) = @_; + + my %gfdl_fragments; + + my $data = $self->data->load('cruft/gfdl-license-fragments-checks', + qr/\s*\~\~\s*/); + + for my $gfdlsectionsregex ($data->all) { + + my $secondpart = $data->value($gfdlsectionsregex); + + # allow empty parameters + $secondpart //= $EMPTY; + my ($acceptonlyinfile,$applytag) + = split(/\s*\~\~\s*/, $secondpart, 2); + + $acceptonlyinfile //= $EMPTY; + $applytag //= $EMPTY; + + # trim both ends + $acceptonlyinfile =~ s/^\s+|\s+$//g; + $applytag =~ s/^\s+|\s+$//g; + + # accept all files if empty + $acceptonlyinfile ||= $DOT . $ASTERISK; + + my %ret = ( + 'gfdlsectionsregex' => qr/$gfdlsectionsregex/xis, + 'acceptonlyinfile' => qr/$acceptonlyinfile/xs, + ); + + $ret{'tag'} = $applytag + if length $applytag; + + $gfdl_fragments{$gfdlsectionsregex} = \%ret; + } + + return \%gfdl_fragments; + } +); + +sub visit_patched_files { + my ($self, $item) = @_; + + return + unless $item->is_file; + + # license string in debian/changelog are probably just change + # Ignore these strings in d/README.{Debian,source}. If they + # appear there it is probably just "file XXX got removed + # because of license Y". + $self->full_text_check($item) + unless $item->name eq 'debian/changelog' + && $item->name eq 'debian/README.Debian' + && $item->name eq 'debian/README.source'; + + return; +} + +# do basic license check against well known offender +# note that it does not replace licensecheck(1) +# and is only used for autoreject by ftp-master +sub full_text_check { + my ($self, $item) = @_; + + return undef + unless $item ->is_regular_file; + + open(my $fd, '<:raw', $item->unpacked_path) + or die encode_utf8('Cannot open ' . $item->unpacked_path); + + my $sfd = Lintian::SlidingWindow->new; + $sfd->handle($fd); + $sfd->blocksize(BLOCKSIZE); + $sfd->blocksub(sub { $_ = lc; }); + + unless (-T $fd) { + close($fd); + return undef; + } + + # we try to read this file in block and use a sliding window + # for efficiency. We store two blocks in @queue and the whole + # string to match in $block. Please emit license tags only once + # per file + BLOCK: + while (my $lowercase = $sfd->readwindow()) { + + my $blocknumber = $sfd->blocknumber(); + + my $clean = clean_text($lowercase); + + # Check for non-distributable files - this + # applies even to non-free, as we still need + # permission to distribute those. + # nvdia opencv infamous license + last BLOCK + if $self->check_for_single_bad_license($item, $lowercase, $clean, + 'license-problem-nvidia-intellectual', + \%NVIDIA_LICENSE); + + unless ($self->processable->is_non_free) { + + for my $tag_name (keys %NON_FREE_LICENSES) { + + last BLOCK + if $self->check_for_single_bad_license($item, $lowercase, + $clean,$tag_name, $NON_FREE_LICENSES{$tag_name}); + } + } + + # check javascript in html file + if ($item->basename =~ /\.(?:x?html?\d?|xht)$/i) { + + my $blockscript = $lowercase; + my $indexscript; + + while (($indexscript = index($blockscript, '<script')) + > $ITEM_NOT_FOUND){ + + $blockscript = substr($blockscript,$indexscript); + + # sourced script ok + if ($blockscript =~ m{\A<script\s+[^>]*?src="[^"]+?"[^>]*?>}sm) + { + + $blockscript = substr($blockscript,$+[0]); + next; + } + + # extract script + if ($blockscript =~ m{<script[^>]*?>(.*?)</script>}sm) { + + $blockscript = substr($blockscript,$+[0]); + + my $lcscript = $1; + + # check if js script is minified + my $firstline = $EMPTY; + for my $line (split /\n/, $lcscript) { + + if ($line =~ /^\s*$/) { + next; + + } else { + $firstline = $line; + last; + } + } + + if ($firstline + =~ m/.{0,20}((?:\bcopyright\b|[\(]c[\)]\s*\w|\N{COPYRIGHT SIGN}).{0,50})/ + ){ + + my $extract = $1; + $extract =~ s/^\s+|\s+$//g; + + $self->pointed_hint( + 'embedded-script-includes-copyright-statement', + $item->pointer, + 'extract of copyright statement:', + $extract + ); + } + + # clean up jslint craps line + my $cleaned = $lcscript; + $cleaned =~ s{^\s*/[*][^\n]*[*]/\s*$}{}gm; + $cleaned =~ s{^\s*//[^\n]*$}{}gm; + $cleaned =~ s/^\s+//gm; + + # strip indentation + $cleaned =~ s/^\s+//mg; + $cleaned = _strip_c_comments($cleaned); + # strip empty line + $cleaned =~ s/^\s*\n//mg; + # remove last \n + $cleaned =~ s/\n\Z//m; + +# detect browserified javascript (comment are removed here and code is stripped) + my $contiguous = $cleaned; + $contiguous =~ s/\n/ /msg; + + # get browserified regexp + my $BROWSERIFY_REGEX + = $self->data->load('cruft/browserify-regex', + qr/\s*\~\~\s*/); + + for my $condition ($BROWSERIFY_REGEX->all) { + + my $pattern = $BROWSERIFY_REGEX->value($condition); + if ($contiguous =~ m{$pattern}msx) { + + my $extra + = (defined $1) ? 'code fragment:'.$1 : $EMPTY; + $self->pointed_hint( + 'source-contains-browserified-javascript', + $item->pointer, $extra); + + last; + } + } + + next; + } + + last; + } + } + + # check if file is javascript but not minified + my $isjsfile = ($item->name =~ m/\.js$/) ? 1 : 0; + if ($isjsfile) { + my $minjsregexp + = qr/(?i)[-._](?:compiled|compressed|lite|min|pack(?:ed)?|prod|umd|yc)\.js$/; + $isjsfile = ($item->name =~ m{$minjsregexp}) ? 0 : 1; + } + + if ($isjsfile) { + # exception sphinx documentation + if ($item->basename eq 'searchindex.js') { + if ($lowercase =~ m/\A\s*search\.setindex\s* \s* \(\s*\{/xms) { + + $self->pointed_hint( + 'source-contains-prebuilt-sphinx-documentation', + $item->parent_dir->pointer); + last BLOCK; + } + } + + if ($item->basename eq 'search_index.js') { + if ($lowercase =~ m/\A\s*var\s*search_index\s*=/xms) { + + $self->pointed_hint( + 'source-contains-prebuilt-pandoc-documentation', + $item->parent_dir->pointer); + last BLOCK; + } + } + # false positive in dx package at least + elsif ($item->basename eq 'srchidx.js') { + + last BLOCK + if $lowercase + =~ m/\A\s*profiles \s* = \s* new \s* Array\s*\(/xms; + } + # https://github.com/rafaelp/css_browser_selector is actually the + # original source. (#874381) + elsif ($lowercase =~ m/css_browser_selector\(/) { + + last BLOCK; + } + # Avoid false-positives in Jush's syntax highlighting definition files. + elsif ($lowercase =~ m/jush\.tr\./) { + + last BLOCK; + } + + # now search hidden minified + + # clean up jslint craps line + my $cleaned = $lowercase; + $cleaned =~ s{^\s*/[*][^\n]*[*]/\s*$}{}gm; + $cleaned =~ s{^\s*//[^\n]*$}{}gm; + $cleaned =~ s/^\s+//gm; + + # strip indentation + $cleaned =~ s/^\s+//mg; + $cleaned = _strip_c_comments($cleaned); + # strip empty line + $cleaned =~ s/^\s*\n//mg; + # remove last \n + $cleaned =~ s/\n\Z//m; + +# detect browserified javascript (comment are removed here and code is stripped) + my $contiguous = $cleaned; + $contiguous =~ s/\n/ /msg; + + # get browserified regexp + my $BROWSERIFY_REGEX + = $self->data->load('cruft/browserify-regex',qr/\s*\~\~\s*/); + + for my $condition ($BROWSERIFY_REGEX->all) { + + my $pattern = $BROWSERIFY_REGEX->value($condition); + if ($contiguous =~ m{$pattern}msx) { + + my $extra = (defined $1) ? 'code fragment:'.$1 : $EMPTY; + $self->pointed_hint( + 'source-contains-browserified-javascript', + $item->pointer, $extra); + + last; + } + } + } + + # search link rel header + if ($lowercase =~ / \Q rel="copyright" \E /msx) { + + my $href = $lowercase; + $href =~ m{<link \s+ + rel="copyright" \s+ + href="([^"]+)" \s*/? \s*>}xmsi; + + my $url = $1 // $EMPTY; + + $self->pointed_hint('license-problem-cc-by-nc-sa', $item->pointer) + if $url =~ m{^https?://creativecommons.org/licenses/by-nc-sa/}; + } + last BLOCK; + } + return close($fd); +} + +# strip C comment +# warning block is at more 8192 char in order to be too slow +# and in order to avoid regex recursion +sub _strip_c_comments { + my ($lowercase) = @_; + + # from perl faq strip comments + $lowercase =~ s{ + # Strip /* */ comments + /\* [^*]*+ \*++ (?: [^/*][^*]*+\*++ ) */ + # Strip // comments (C++ style) + | // (?: [^\\] | [^\n][\n]? )*? (?=\n) + | ( + # Keep "/* */" (etc) as is + "(?: \\. | [^"\\]++)*" + # Keep '/**/' (etc) as is + | '(?: \\. | [^'\\]++)*' + # Keep anything else + | .[^/"'\\]*+ + ) + }{defined $1 ? $1 : ""}xgse; + + return $lowercase; +} + +# return True in case of license problem +sub check_gfdl_license_problem { + my ($self, $item, $tag_name, %matchedhash) = @_; + + my $rawgfdlsections = $matchedhash{rawgfdlsections} || $EMPTY; + my $rawcontextbefore = $matchedhash{rawcontextbefore} || $EMPTY; + + # strip punctuation + my $gfdlsections = _strip_punct($rawgfdlsections); + my $contextbefore = _strip_punct($rawcontextbefore); + + # remove line number at beginning of line + # see krusader/1:2.4.0~beta3-2/doc/en_US/advanced-functions.docbook/ + $gfdlsections =~ s{[ ]\d+[ ]}{ }gxsmo; + $gfdlsections =~ s{^\d+[ ]}{ }xsmo; + $gfdlsections =~ s{[ ]\d+$}{ }xsmo; + $gfdlsections =~ s{[ ]+}{ }xsmo; + + # remove classical and without meaning part of + # matched string + my $oldgfdlsections; + do { + $oldgfdlsections = $gfdlsections; + $gfdlsections =~ s{ \A \(?[ ]? g?fdl [ ]?\)?[ ]? [,\.;]?[ ]?}{}xsmo; + $gfdlsections =~ s{ \A (?:either[ ])? + version [ ] \d+(?:\.\d+)? [ ]?}{}xsmo; + $gfdlsections =~ s{ \A of [ ] the [ ] license [ ]?[,\.;][ ]?}{}xsmo; + $gfdlsections=~ s{ \A or (?:[ ]\(?[ ]? at [ ] your [ ] option [ ]?\)?)? + [ ] any [ ] later [ ] version[ ]?}{}xsmo; + $gfdlsections =~ s{ \A (as[ ])? published [ ] by [ ] + the [ ] free [ ] software [ ] foundation[ ]?}{}xsmo; + $gfdlsections =~ s{\(?[ ]? fsf [ ]?\)?[ ]?}{}xsmo; + $gfdlsections =~ s{\A [ ]? [,\.;]? [ ]?}{}xsmo; + $gfdlsections =~ s{[ ]? [,\.]? [ ]?\Z}{}xsmo; + } while ($oldgfdlsections ne $gfdlsections); + + $contextbefore =~ s{ + [ ]? (:?[,\.;]? [ ]?)? + permission [ ] is [ ] granted [ ] to [ ] copy [ ]?[,\.;]?[ ]? + distribute [ ]?[,\.;]?[ ]? and[ ]?/?[ ]?or [ ] modify [ ] + this [ ] document [ ] under [ ] the [ ] terms [ ] of [ ] the\Z}{}xsmo; + + # Treat ambiguous empty text + if ($gfdlsections eq $EMPTY) { + + # lie in order to check more part + $self->pointed_hint('license-problem-gfdl-invariants-empty', + $item->pointer); + + return 0; + } + + # official wording + if( + $gfdlsections =~ m{\A + with [ ] no [ ] invariant [ ] sections[ ]?, + [ ]? no [ ] front(?:[ ]?-[ ]?|[ ])cover [ ] texts[ ]?,? + [ ]? and [ ] no [ ] back(?:[ ]?-?[ ]?|[ ])cover [ ] texts + \Z}xs + ) { + return 0; + } + + # example are ok + if ( + $contextbefore =~ m{following [ ] is [ ] an [ ] example + (:?[ ] of [ ] the [ ] license [ ] notice [ ] to [ ] use + (?:[ ] after [ ] the [ ] copyright [ ] (?:line(?:\(s\)|s)?)? + (?:[ ] using [ ] all [ ] the [ ] features? [ ] of [ ] the [ ] gfdl)? + )? + )? [ ]? [,:]? \Z}xs + ){ + return 0; + } + + # GFDL license, assume it is bad unless it + # explicitly states it has no "bad sections". + for my $gfdl_fragment (keys %{$self->GFDL_FRAGMENTS}) { + + my $gfdl_data = $self->GFDL_FRAGMENTS->{$gfdl_fragment}; + my $gfdlsectionsregex = $gfdl_data->{'gfdlsectionsregex'}; + if ($gfdlsections =~ m{$gfdlsectionsregex}) { + + my $acceptonlyinfile = $gfdl_data->{'acceptonlyinfile'}; + if ($item->name =~ m{$acceptonlyinfile}) { + + my $applytag = $gfdl_data->{'tag'}; + + # lie will allow checking more blocks + $self->pointed_hint($applytag, $item->pointer, + 'invariant part is:', + $gfdlsections) + if defined $applytag; + + return 0; + + } else { + $self->pointed_hint( + 'license-problem-gfdl-invariants', + $item->pointer,'invariant part is:', + $gfdlsections + ); + return 1; + } + } + } + + # catch all + $self->pointed_hint( + 'license-problem-gfdl-invariants', + $item->pointer,'invariant part is:', + $gfdlsections + ); + + return 1; +} + +sub rfc_whitelist_filename { + my ($self, $item, $tag_name, %matchedhash) = @_; + + return 0 + if $item->name eq 'debian/copyright'; + + my $lcname = lc($item->basename); + + # prebuilt-file or forbidden file type + # specified separator protects against spaces in pattern + my $RFC_WHITELIST= $self->data->load('cruft/rfc-whitelist',qr/\s*\~\~\s*/); + + my @patterns = $RFC_WHITELIST->all; + + return 0 + if any { $lcname =~ m/ $_ /xms } @patterns; + + $self->pointed_hint($tag_name, $item->pointer); + + return 1; +} + +sub php_source_whitelist { + my ($self, $item, $tag_name, %matchedhash) = @_; + + my $copyright_path + = $self->processable->patched->resolve_path('debian/copyright'); + + return 0 + if defined $copyright_path + && $copyright_path->bytes + =~ m{^Source: https?://(pecl|pear).php.net/package/.*$}m; + + return 0 + if $self->processable->source_name =~ /^php\d*(?:\.\d+)?$/xms; + + $self->pointed_hint($tag_name, $item->pointer); + + return 1; +} + +sub clean_text { + my ($text) = @_; + + # be paranoiac replace gnu with texinfo by gnu + $text =~ s{ + (?:@[[:alpha:]]*?\{)?\s*gnu\s*\} # Texinfo cmd + }{ gnu }gxms; + + # pod2man formatting + $text =~ s{ \\ \* \( [LR] \" }{\"}gxsm; + $text =~ s{ \\ -}{-}gxsm; + + # replace some shortcut (clisp) + $text =~ s{\(&fdl;\)}{ }gxsm; + $text =~ s{&fsf;}{free software foundation}gxsm; + + # non breaking space + $text =~ s{ }{ }gxsm; + + # replace some common comment-marker/markup with space + $text =~ s{^\.\\\"}{ }gxms; # man comments + + # po comment may include html tag + $text =~ s/\"\s?\v\#~\s?\"//gxms; + + # strip .rtf paragraph marks (#892967) + $text =~ s/\\par\b//gxms; + + $text =~ s/\\url[{][^}]*?[}]/ /gxms; # (la)?tex url + $text =~ s/\\emph[{]/ /gxms; # (la)?tex emph + $text =~ s<\\href[{][^}]*?[}] + [{]([^}]*?)[}]>< $1 >gxms;# (la)?tex href + $text =~ s<\\hyperlink + [{][^}]*?[}] + [{]([^}]*?)[}]>< $1 >gxms; # (la)?tex hyperlink + $text =~ s{-\\/}{-}gxms; # tex strange hyphen + $text =~ s/\\char/ /gxms; # tex char command + + # Texinfo comment with end section + $text =~ s{\@c(?:omment)?\h+ + end \h+ ifman\s+}{ }gxms; + $text =~ s{\@c(?:omment)?\s+ + noman\s+}{ }gxms; # Texinfo comment no manual + + $text =~ s/\@c(?:omment)?\s+/ /gxms; # Texinfo comment + + # Texinfo bold,italic, roman, fixed width + $text =~ s/\@[birt][{]/ /gxms; + $text =~ s/\@sansserif[{]/ /gxms; # Texinfo sans serif + $text =~ s/\@slanted[{]/ /gxms; # Texinfo slanted + $text =~ s/\@var[{]/ /gxms; # Texinfo emphasis + + $text =~ s/\@(?:small)?example\s+/ /gxms; # Texinfo example + $text =~ s{\@end \h+ + (?:small)example\s+}{ }gxms; # Texinfo end example tag + $text =~ s/\@group\s+/ /gxms; # Texinfo group + $text =~ s/\@end\h+group\s+/ /gxms; # Texinfo end group + + $text =~ s/<!--/ /gxms; # XML comments + $text =~ s/-->/ /gxms; # end XML comment + + $text =~ s{</?a[^>]*?>}{ }gxms; # a link + $text =~ s{<br\s*/?>}{ }gxms; # (X)?HTML line + # breaks + $text =~ s{</?citetitle[^>]*?>}{ }gxms; # DocBook citation title + $text =~ s{</?div[^>]*?>}{ }gxms; # html style + $text =~ s{</?font[^>]*?>}{ }gxms; # bold + $text =~ s{</?b[^>]*?>}{ }gxms; # italic + $text =~ s{</?i[^>]*?>}{ }gxms; # italic + $text =~ s{</?link[^>]*?>}{ }gxms; # xml link + $text =~ s{</?p[^>]*?>}{ }gxms; # html paragraph + $text =~ s{</?quote[^>]*?>}{ }gxms; # xml quote + $text =~ s{</?span[^>]*?>}{ }gxms; # span tag + $text =~ s{</?ulink[^>]*?>}{ }gxms; # ulink DocBook + $text =~ s{</?var[^>]*?>}{ }gxms; # var used by texinfo2html + + $text =~ s{\&[lr]dquo;}{ }gxms; # html rquote + + $text =~ s{\(\*note.*?::\)}{ }gxms; # info file note + + # String array (e.g. "line1",\n"line2") + $text =~ s/\"\s*,/ /gxms; + # String array (e.g. "line1"\n ,"line2"), + $text =~ s/,\s*\"/ /gxms; + $text =~ s/\\n/ /gxms; # Verbatim \n in string array + + $text =~ s/\\&/ /gxms; # pod2man formatting + $text =~ s/\\s(?:0|-1)/ /gxms; # pod2man formatting + + $text =~ s/(?:``|'')/ /gxms; # quote like + + # diff/patch lines (should be after html tag) + $text =~ s/^[-\+!<>]/ /gxms; + $text =~ s{\@\@ \s* + [-+] \d+,\d+ \s+ + [-+] \d+,\d+ \s* + \@\@}{ }gxms; # patch line + + # Texinfo end tag (could be more clever but brute force is fast) + $text =~ s/}/ /gxms; + # Tex section titles + $text =~ s/^\s*\\(sub)*section\*?\{\s*\S+/ /gxms; + # single char at end + # String, C-style comment/javadoc indent, + # quotes for strings, pipe and backslash, tilde in some txt + $text =~ s/[%\*\"\|\\\#~]/ /gxms; + # delete double spacing now and normalize spacing + # to space character + $text =~ s{\s++}{ }gsm; + + # trim both ends + $text =~ s/^\s+|\s+$//g; + + return $text; +} + +# do not use space around punctuation +sub _strip_punct() { + my ($text) = @_; + # replace final punctuation + $text =~ s{(?: + \s*[,\.;]\s*\Z | # final punctuation + \A\s*[,\.;]\s* # punctuation at the beginning + )}{ }gxms; + + # delete double spacing now and normalize spacing + # to space character + $text =~ s{\s++}{ }gsm; + + # trim both ends + $text =~ s/^\s+|\s+$//g; + + return $text; +} + +sub check_for_single_bad_license { + my ($self, $item, $lowercase, $clean, $tag_name, $license_data) = @_; + + # do fast keyword search + # could make more sense as 'return 1 unless all' but does not work + return 0 + if none { $lowercase =~ / \Q$_\E /msx } @{$license_data->{keywords}}; + + return 0 + if none { $clean =~ / \Q$_\E /msx }@{$license_data->{sentences}}; + + my $regex = $license_data->{regex}; + return 0 + if defined $regex && $clean !~ $regex; + + my $callsub = $license_data->{callsub}; + if (!defined $callsub) { + + $self->pointed_hint($tag_name, $item->pointer); + return 1; + } + + return $self->$callsub($item, $tag_name, %+); +} + +1; + +# Local Variables: +# indent-tabs-mode: nil +# cperl-indent-level: 4 +# End: +# vim: syntax=perl sw=4 sts=4 sr et |