# cruft -- lintian check script -*- perl -*- # # based on debhelper check, # Copyright (C) 1999 Joey Hess # Copyright (C) 2000 Sean 'Shaleh' Perry # Copyright (C) 2002 Josip Rodin # Copyright (C) 2007 Russ Allbery # Copyright (C) 2013-2018 Bastien ROUCARIES # Copyright (C) 2017-2020 Chris Lamb # Copyright (C) 2020-2021 Felix Lechner # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, you can find it on the World Wide # Web at https://www.gnu.org/copyleft/gpl.html, or write to the Free # Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, # MA 02110-1301, USA. package Lintian::Check::Cruft; use v5.20; use warnings; use utf8; use Const::Fast; use List::SomeUtils qw(any none); const my $EMPTY => q{}; const my $ASTERISK => q{*}; const my $DOT => q{.}; const my $ITEM_NOT_FOUND => -1; use Moo; use namespace::clean; with 'Lintian::Check'; # Half of the size used in the "sliding window" for detecting bad # licenses like GFDL with invariant sections. # NB: Keep in sync cruft-gfdl-fp-sliding-win/pre_build. # not less than 8192 for source missing use constant BLOCKSIZE => 16_384; use Lintian::SlidingWindow; my %NVIDIA_LICENSE = ( keywords => [qw{license intellectual retain property}], sentences =>[ 'retain all intellectual property and proprietary rights in and to this software and related documentation' ] ); my %NON_FREE_LICENSES = ( # first field is tag # second field is a list of keywords in lower case # third field are lower case sentences to match the license. Notes that space are normalized before and formatting removed # fourth field is a regex to use to match the license, use lower case and [ ] for space. # 5th field is a function to call if the field 2th to 5th match. # (see dispatch table %LICENSE_CHECK_DISPATCH_TABLE # json license 'license-problem-json-evil' => { keywords => [qw{software evil good}], sentences => ['software shall be used for good'], regex => qr{software [ ] shall [ ] be [ ] used [ ] for [ ] good [ ]? ,? [ ]? not [ ] evil}msx }, # non free RFC old version 'license-problem-non-free-RFC' => { keywords => [qw{document purpose translate language}], sentences => ['this document itself may not be modified in any way'], regex => qr{this [ ] document [ ] itself [ ] may [ ] not [ ] be [ ] modified [ ] in [ ] any [ ] way [ ]?, [ ]? such [ ] as [ ] by [ ] removing [ ] the [ ] copyright [ ] notice [ ] or [ ] references [ ] to [ ] .{0,256} [ ]? except [ ] as [ ] needed [ ] for [ ] the [ ] purpose [ ] of [ ] developing [ ] .{0,128} [ ]? in [ ] which [ ] case [ ] the [ ] procedures [ ] for [ ] copyrights [ ] defined [ ] in [ ] the [ ] .{0,128} [ ]? process [ ] must [ ] be [ ] followed[ ]?,[ ]? or [ ] as [ ] required [ ] to [ ] translate [ ] it [ ] into [ ] languages [ ]}msx, callsub => 'rfc_whitelist_filename' }, 'license-problem-non-free-RFC-BCP78' => { keywords => [qw{license document bcp restriction}], sentences => ['bcp 78'], regex => qr{this [ ] document [ ] is [ ] subject [ ] to [ ] (?:the [ ] rights [ ]?, [ ] licenses [ ] and [ ]restrictions [ ] contained [ ] in [ ])? bcp [ ] 78}msx, callsub => 'rfc_whitelist_filename' }, # check GFDL block - The ".{0,1024}"-part in the regex # will contain the "no invariants etc." part if # it is a good use of the license. We include it # here to ensure that we do not emit a false positive # if the "redeeming" part is in the next block # keyword document is here in order to benefit for other license keyword and a shortcut for documentation 'license-problem-gfdl-invariants' => { keywords => [qw{license document gnu copy documentation}], sentences => ['gnu free documentation license'], regex => qr{(?'rawcontextbefore'(?:(?:(?!a [ ] copy [ ] of [ ] the [ ] license [ ] is).){1024}| \A(?:(?!a [ ] copy [ ] of [ ] the [ ] license [ ] is).){0,1024}| (?:[ ] copy [ ] of [ ] the [ ] license [ ] is.{0,1024}?))) gnu [ ] free [ ] documentation [ ] license (?'rawgfdlsections'(?:(?!gnu [ ] free [ ] documentation [ ] license).){0,1024}?) (?:a [ ] copy [ ] of [ ] the [ ] license [ ] is| this [ ] document [ ] is [ ] distributed)}msx, callsub => 'check_gfdl_license_problem' }, # php license 'license-problem-php-license' => { keywords => [qw{www.php.net group\@php.net phpfoo conjunction php}], sentences => ['this product includes php'], regex => qr{php [ ] license [ ]?[,;][ ]? version [ ] 3(?:\.\d+)?}msx, callsub => 'php_source_whitelist' }, 'license-problem-bad-php-license' => { keywords => [qw{www.php.net add-on conjunction}], sentences => ['this product includes php'], regex => qr{php [ ] license [ ]?[,;][ ]? version [ ] 2(?:\.\d+)?}msx, callsub => 'php_source_whitelist' }, # cc by nc sa note that " is replaced by [ ] 'license-problem-cc-by-nc-sa' => { keywords => [qw{license by-nc-sa creativecommons.org}], sentences => [ '://creativecommons.org/licenses/by-nc-sa', 'under attribution-noncommercial' ], regex => qr{(?:license [ ] rdf:[^=:]+=[ ]* (?:ht|f)tps?://(?:[^/.]\.)??creativecommons\.org/licenses/by-nc-sa/\d+(?:\.\d+)?(?:/[[:alpha:]]+)?/? [ ]* >|available [ ] under [ ] attribution-noncommercial)}msx }, # not really a license but warn it: visual c++ generated file 'source-contains-autogenerated-visual-c++-file' => { keywords => [qw{microsoft visual generated}], sentences => ['microsoft visual c++ generated'], regex => qr{microsoft [ ] visual [ ] c[+][+] [ ] generated (?![ ] by [ ] freeze\.py)}msx }, # not really a license but warn about it: gperf generated file 'source-contains-autogenerated-gperf-data' => { keywords => [qw{code produced gperf version}], sentences => ['code produced by gperf version'], regex => qr{code [ ] produced [ ] by [ ] gperf [ ] version [ ] \d+\.\d+}msx }, # warn about copy of ieee-data 'source-contains-data-from-ieee-data-oui-db' => { keywords => [qw{struck scitex racore}], sentences => ['dr. b. struck'], regex => qr{dr. [ ] b. [ ] struck}msx }, # warn about unicode license for utf for convert utf 'license-problem-convert-utf-code' => { keywords => [qw{fall-through bytestowrite utf-8}], sentences => ['the fall-through switches in utf-8 reading'], regex => qr{the [ ] fall-through [ ] switches [ ] in [ ] utf-8 [ ] reading [ ] code [ ] save}msx } ); # get usual data about admissible/not admissible GFDL invariant part of license has GFDL_FRAGMENTS => ( is => 'rw', lazy => 1, default => sub { my ($self) = @_; my %gfdl_fragments; my $data = $self->data->load('cruft/gfdl-license-fragments-checks', qr/\s*\~\~\s*/); for my $gfdlsectionsregex ($data->all) { my $secondpart = $data->value($gfdlsectionsregex); # allow empty parameters $secondpart //= $EMPTY; my ($acceptonlyinfile,$applytag) = split(/\s*\~\~\s*/, $secondpart, 2); $acceptonlyinfile //= $EMPTY; $applytag //= $EMPTY; # trim both ends $acceptonlyinfile =~ s/^\s+|\s+$//g; $applytag =~ s/^\s+|\s+$//g; # accept all files if empty $acceptonlyinfile ||= $DOT . $ASTERISK; my %ret = ( 'gfdlsectionsregex' => qr/$gfdlsectionsregex/xis, 'acceptonlyinfile' => qr/$acceptonlyinfile/xs, ); $ret{'tag'} = $applytag if length $applytag; $gfdl_fragments{$gfdlsectionsregex} = \%ret; } return \%gfdl_fragments; } ); sub visit_patched_files { my ($self, $item) = @_; return unless $item->is_file; # license string in debian/changelog are probably just change # Ignore these strings in d/README.{Debian,source}. If they # appear there it is probably just "file XXX got removed # because of license Y". $self->full_text_check($item) unless $item->name eq 'debian/changelog' && $item->name eq 'debian/README.Debian' && $item->name eq 'debian/README.source'; return; } # do basic license check against well known offender # note that it does not replace licensecheck(1) # and is only used for autoreject by ftp-master sub full_text_check { my ($self, $item) = @_; return undef unless $item ->is_regular_file; open(my $fd, '<:raw', $item->unpacked_path) or die encode_utf8('Cannot open ' . $item->unpacked_path); my $sfd = Lintian::SlidingWindow->new; $sfd->handle($fd); $sfd->blocksize(BLOCKSIZE); $sfd->blocksub(sub { $_ = lc; }); unless (-T $fd) { close($fd); return undef; } # we try to read this file in block and use a sliding window # for efficiency. We store two blocks in @queue and the whole # string to match in $block. Please emit license tags only once # per file BLOCK: while (my $lowercase = $sfd->readwindow()) { my $blocknumber = $sfd->blocknumber(); my $clean = clean_text($lowercase); # Check for non-distributable files - this # applies even to non-free, as we still need # permission to distribute those. # nvdia opencv infamous license last BLOCK if $self->check_for_single_bad_license($item, $lowercase, $clean, 'license-problem-nvidia-intellectual', \%NVIDIA_LICENSE); unless ($self->processable->is_non_free) { for my $tag_name (keys %NON_FREE_LICENSES) { last BLOCK if $self->check_for_single_bad_license($item, $lowercase, $clean,$tag_name, $NON_FREE_LICENSES{$tag_name}); } } # check javascript in html file if ($item->basename =~ /\.(?:x?html?\d?|xht)$/i) { my $blockscript = $lowercase; my $indexscript; while (($indexscript = index($blockscript, ' $ITEM_NOT_FOUND){ $blockscript = substr($blockscript,$indexscript); # sourced script ok if ($blockscript =~ m{\A]*?src="[^"]+?"[^>]*?>}sm) { $blockscript = substr($blockscript,$+[0]); next; } # extract script if ($blockscript =~ m{]*?>(.*?)}sm) { $blockscript = substr($blockscript,$+[0]); my $lcscript = $1; # check if js script is minified my $firstline = $EMPTY; for my $line (split /\n/, $lcscript) { if ($line =~ /^\s*$/) { next; } else { $firstline = $line; last; } } if ($firstline =~ m/.{0,20}((?:\bcopyright\b|[\(]c[\)]\s*\w|\N{COPYRIGHT SIGN}).{0,50})/ ){ my $extract = $1; $extract =~ s/^\s+|\s+$//g; $self->pointed_hint( 'embedded-script-includes-copyright-statement', $item->pointer, 'extract of copyright statement:', $extract ); } # clean up jslint craps line my $cleaned = $lcscript; $cleaned =~ s{^\s*/[*][^\n]*[*]/\s*$}{}gm; $cleaned =~ s{^\s*//[^\n]*$}{}gm; $cleaned =~ s/^\s+//gm; # strip indentation $cleaned =~ s/^\s+//mg; $cleaned = _strip_c_comments($cleaned); # strip empty line $cleaned =~ s/^\s*\n//mg; # remove last \n $cleaned =~ s/\n\Z//m; # detect browserified javascript (comment are removed here and code is stripped) my $contiguous = $cleaned; $contiguous =~ s/\n/ /msg; # get browserified regexp my $BROWSERIFY_REGEX = $self->data->load('cruft/browserify-regex', qr/\s*\~\~\s*/); for my $condition ($BROWSERIFY_REGEX->all) { my $pattern = $BROWSERIFY_REGEX->value($condition); if ($contiguous =~ m{$pattern}msx) { my $extra = (defined $1) ? 'code fragment:'.$1 : $EMPTY; $self->pointed_hint( 'source-contains-browserified-javascript', $item->pointer, $extra); last; } } next; } last; } } # check if file is javascript but not minified my $isjsfile = ($item->name =~ m/\.js$/) ? 1 : 0; if ($isjsfile) { my $minjsregexp = qr/(?i)[-._](?:compiled|compressed|lite|min|pack(?:ed)?|prod|umd|yc)\.js$/; $isjsfile = ($item->name =~ m{$minjsregexp}) ? 0 : 1; } if ($isjsfile) { # exception sphinx documentation if ($item->basename eq 'searchindex.js') { if ($lowercase =~ m/\A\s*search\.setindex\s* \s* \(\s*\{/xms) { $self->pointed_hint( 'source-contains-prebuilt-sphinx-documentation', $item->parent_dir->pointer); last BLOCK; } } if ($item->basename eq 'search_index.js') { if ($lowercase =~ m/\A\s*var\s*search_index\s*=/xms) { $self->pointed_hint( 'source-contains-prebuilt-pandoc-documentation', $item->parent_dir->pointer); last BLOCK; } } # false positive in dx package at least elsif ($item->basename eq 'srchidx.js') { last BLOCK if $lowercase =~ m/\A\s*profiles \s* = \s* new \s* Array\s*\(/xms; } # https://github.com/rafaelp/css_browser_selector is actually the # original source. (#874381) elsif ($lowercase =~ m/css_browser_selector\(/) { last BLOCK; } # Avoid false-positives in Jush's syntax highlighting definition files. elsif ($lowercase =~ m/jush\.tr\./) { last BLOCK; } # now search hidden minified # clean up jslint craps line my $cleaned = $lowercase; $cleaned =~ s{^\s*/[*][^\n]*[*]/\s*$}{}gm; $cleaned =~ s{^\s*//[^\n]*$}{}gm; $cleaned =~ s/^\s+//gm; # strip indentation $cleaned =~ s/^\s+//mg; $cleaned = _strip_c_comments($cleaned); # strip empty line $cleaned =~ s/^\s*\n//mg; # remove last \n $cleaned =~ s/\n\Z//m; # detect browserified javascript (comment are removed here and code is stripped) my $contiguous = $cleaned; $contiguous =~ s/\n/ /msg; # get browserified regexp my $BROWSERIFY_REGEX = $self->data->load('cruft/browserify-regex',qr/\s*\~\~\s*/); for my $condition ($BROWSERIFY_REGEX->all) { my $pattern = $BROWSERIFY_REGEX->value($condition); if ($contiguous =~ m{$pattern}msx) { my $extra = (defined $1) ? 'code fragment:'.$1 : $EMPTY; $self->pointed_hint( 'source-contains-browserified-javascript', $item->pointer, $extra); last; } } } # search link rel header if ($lowercase =~ / \Q rel="copyright" \E /msx) { my $href = $lowercase; $href =~ m{}xmsi; my $url = $1 // $EMPTY; $self->pointed_hint('license-problem-cc-by-nc-sa', $item->pointer) if $url =~ m{^https?://creativecommons.org/licenses/by-nc-sa/}; } last BLOCK; } return close($fd); } # strip C comment # warning block is at more 8192 char in order to be too slow # and in order to avoid regex recursion sub _strip_c_comments { my ($lowercase) = @_; # from perl faq strip comments $lowercase =~ s{ # Strip /* */ comments /\* [^*]*+ \*++ (?: [^/*][^*]*+\*++ ) */ # Strip // comments (C++ style) | // (?: [^\\] | [^\n][\n]? )*? (?=\n) | ( # Keep "/* */" (etc) as is "(?: \\. | [^"\\]++)*" # Keep '/**/' (etc) as is | '(?: \\. | [^'\\]++)*' # Keep anything else | .[^/"'\\]*+ ) }{defined $1 ? $1 : ""}xgse; return $lowercase; } # return True in case of license problem sub check_gfdl_license_problem { my ($self, $item, $tag_name, %matchedhash) = @_; my $rawgfdlsections = $matchedhash{rawgfdlsections} || $EMPTY; my $rawcontextbefore = $matchedhash{rawcontextbefore} || $EMPTY; # strip punctuation my $gfdlsections = _strip_punct($rawgfdlsections); my $contextbefore = _strip_punct($rawcontextbefore); # remove line number at beginning of line # see krusader/1:2.4.0~beta3-2/doc/en_US/advanced-functions.docbook/ $gfdlsections =~ s{[ ]\d+[ ]}{ }gxsmo; $gfdlsections =~ s{^\d+[ ]}{ }xsmo; $gfdlsections =~ s{[ ]\d+$}{ }xsmo; $gfdlsections =~ s{[ ]+}{ }xsmo; # remove classical and without meaning part of # matched string my $oldgfdlsections; do { $oldgfdlsections = $gfdlsections; $gfdlsections =~ s{ \A \(?[ ]? g?fdl [ ]?\)?[ ]? [,\.;]?[ ]?}{}xsmo; $gfdlsections =~ s{ \A (?:either[ ])? version [ ] \d+(?:\.\d+)? [ ]?}{}xsmo; $gfdlsections =~ s{ \A of [ ] the [ ] license [ ]?[,\.;][ ]?}{}xsmo; $gfdlsections=~ s{ \A or (?:[ ]\(?[ ]? at [ ] your [ ] option [ ]?\)?)? [ ] any [ ] later [ ] version[ ]?}{}xsmo; $gfdlsections =~ s{ \A (as[ ])? published [ ] by [ ] the [ ] free [ ] software [ ] foundation[ ]?}{}xsmo; $gfdlsections =~ s{\(?[ ]? fsf [ ]?\)?[ ]?}{}xsmo; $gfdlsections =~ s{\A [ ]? [,\.;]? [ ]?}{}xsmo; $gfdlsections =~ s{[ ]? [,\.]? [ ]?\Z}{}xsmo; } while ($oldgfdlsections ne $gfdlsections); $contextbefore =~ s{ [ ]? (:?[,\.;]? [ ]?)? permission [ ] is [ ] granted [ ] to [ ] copy [ ]?[,\.;]?[ ]? distribute [ ]?[,\.;]?[ ]? and[ ]?/?[ ]?or [ ] modify [ ] this [ ] document [ ] under [ ] the [ ] terms [ ] of [ ] the\Z}{}xsmo; # Treat ambiguous empty text if ($gfdlsections eq $EMPTY) { # lie in order to check more part $self->pointed_hint('license-problem-gfdl-invariants-empty', $item->pointer); return 0; } # official wording if( $gfdlsections =~ m{\A with [ ] no [ ] invariant [ ] sections[ ]?, [ ]? no [ ] front(?:[ ]?-[ ]?|[ ])cover [ ] texts[ ]?,? [ ]? and [ ] no [ ] back(?:[ ]?-?[ ]?|[ ])cover [ ] texts \Z}xs ) { return 0; } # example are ok if ( $contextbefore =~ m{following [ ] is [ ] an [ ] example (:?[ ] of [ ] the [ ] license [ ] notice [ ] to [ ] use (?:[ ] after [ ] the [ ] copyright [ ] (?:line(?:\(s\)|s)?)? (?:[ ] using [ ] all [ ] the [ ] features? [ ] of [ ] the [ ] gfdl)? )? )? [ ]? [,:]? \Z}xs ){ return 0; } # GFDL license, assume it is bad unless it # explicitly states it has no "bad sections". for my $gfdl_fragment (keys %{$self->GFDL_FRAGMENTS}) { my $gfdl_data = $self->GFDL_FRAGMENTS->{$gfdl_fragment}; my $gfdlsectionsregex = $gfdl_data->{'gfdlsectionsregex'}; if ($gfdlsections =~ m{$gfdlsectionsregex}) { my $acceptonlyinfile = $gfdl_data->{'acceptonlyinfile'}; if ($item->name =~ m{$acceptonlyinfile}) { my $applytag = $gfdl_data->{'tag'}; # lie will allow checking more blocks $self->pointed_hint($applytag, $item->pointer, 'invariant part is:', $gfdlsections) if defined $applytag; return 0; } else { $self->pointed_hint( 'license-problem-gfdl-invariants', $item->pointer,'invariant part is:', $gfdlsections ); return 1; } } } # catch all $self->pointed_hint( 'license-problem-gfdl-invariants', $item->pointer,'invariant part is:', $gfdlsections ); return 1; } sub rfc_whitelist_filename { my ($self, $item, $tag_name, %matchedhash) = @_; return 0 if $item->name eq 'debian/copyright'; my $lcname = lc($item->basename); # prebuilt-file or forbidden file type # specified separator protects against spaces in pattern my $RFC_WHITELIST= $self->data->load('cruft/rfc-whitelist',qr/\s*\~\~\s*/); my @patterns = $RFC_WHITELIST->all; return 0 if any { $lcname =~ m/ $_ /xms } @patterns; $self->pointed_hint($tag_name, $item->pointer); return 1; } sub php_source_whitelist { my ($self, $item, $tag_name, %matchedhash) = @_; my $copyright_path = $self->processable->patched->resolve_path('debian/copyright'); return 0 if defined $copyright_path && $copyright_path->bytes =~ m{^Source: https?://(pecl|pear).php.net/package/.*$}m; return 0 if $self->processable->source_name =~ /^php\d*(?:\.\d+)?$/xms; $self->pointed_hint($tag_name, $item->pointer); return 1; } sub clean_text { my ($text) = @_; # be paranoiac replace gnu with texinfo by gnu $text =~ s{ (?:@[[:alpha:]]*?\{)?\s*gnu\s*\} # Texinfo cmd }{ gnu }gxms; # pod2man formatting $text =~ s{ \\ \* \( [LR] \" }{\"}gxsm; $text =~ s{ \\ -}{-}gxsm; # replace some shortcut (clisp) $text =~ s{\(&fdl;\)}{ }gxsm; $text =~ s{&fsf;}{free software foundation}gxsm; # non breaking space $text =~ s{ }{ }gxsm; # replace some common comment-marker/markup with space $text =~ s{^\.\\\"}{ }gxms; # man comments # po comment may include html tag $text =~ s/\"\s?\v\#~\s?\"//gxms; # strip .rtf paragraph marks (#892967) $text =~ s/\\par\b//gxms; $text =~ s/\\url[{][^}]*?[}]/ /gxms; # (la)?tex url $text =~ s/\\emph[{]/ /gxms; # (la)?tex emph $text =~ s<\\href[{][^}]*?[}] [{]([^}]*?)[}]>< $1 >gxms;# (la)?tex href $text =~ s<\\hyperlink [{][^}]*?[}] [{]([^}]*?)[}]>< $1 >gxms; # (la)?tex hyperlink $text =~ s{-\\/}{-}gxms; # tex strange hyphen $text =~ s/\\char/ /gxms; # tex char command # Texinfo comment with end section $text =~ s{\@c(?:omment)?\h+ end \h+ ifman\s+}{ }gxms; $text =~ s{\@c(?:omment)?\s+ noman\s+}{ }gxms; # Texinfo comment no manual $text =~ s/\@c(?:omment)?\s+/ /gxms; # Texinfo comment # Texinfo bold,italic, roman, fixed width $text =~ s/\@[birt][{]/ /gxms; $text =~ s/\@sansserif[{]/ /gxms; # Texinfo sans serif $text =~ s/\@slanted[{]/ /gxms; # Texinfo slanted $text =~ s/\@var[{]/ /gxms; # Texinfo emphasis $text =~ s/\@(?:small)?example\s+/ /gxms; # Texinfo example $text =~ s{\@end \h+ (?:small)example\s+}{ }gxms; # Texinfo end example tag $text =~ s/\@group\s+/ /gxms; # Texinfo group $text =~ s/\@end\h+group\s+/ /gxms; # Texinfo end group $text =~ s// /gxms; # end XML comment $text =~ s{]*?>}{ }gxms; # a link $text =~ s{}{ }gxms; # (X)?HTML line # breaks $text =~ s{]*?>}{ }gxms; # DocBook citation title $text =~ s{]*?>}{ }gxms; # html style $text =~ s{]*?>}{ }gxms; # bold $text =~ s{]*?>}{ }gxms; # italic $text =~ s{]*?>}{ }gxms; # italic $text =~ s{]*?>}{ }gxms; # xml link $text =~ s{]*?>}{ }gxms; # html paragraph $text =~ s{]*?>}{ }gxms; # xml quote $text =~ s{]*?>}{ }gxms; # span tag $text =~ s{]*?>}{ }gxms; # ulink DocBook $text =~ s{]*?>}{ }gxms; # var used by texinfo2html $text =~ s{\&[lr]dquo;}{ }gxms; # html rquote $text =~ s{\(\*note.*?::\)}{ }gxms; # info file note # String array (e.g. "line1",\n"line2") $text =~ s/\"\s*,/ /gxms; # String array (e.g. "line1"\n ,"line2"), $text =~ s/,\s*\"/ /gxms; $text =~ s/\\n/ /gxms; # Verbatim \n in string array $text =~ s/\\&/ /gxms; # pod2man formatting $text =~ s/\\s(?:0|-1)/ /gxms; # pod2man formatting $text =~ s/(?:``|'')/ /gxms; # quote like # diff/patch lines (should be after html tag) $text =~ s/^[-\+!<>]/ /gxms; $text =~ s{\@\@ \s* [-+] \d+,\d+ \s+ [-+] \d+,\d+ \s* \@\@}{ }gxms; # patch line # Texinfo end tag (could be more clever but brute force is fast) $text =~ s/}/ /gxms; # Tex section titles $text =~ s/^\s*\\(sub)*section\*?\{\s*\S+/ /gxms; # single char at end # String, C-style comment/javadoc indent, # quotes for strings, pipe and backslash, tilde in some txt $text =~ s/[%\*\"\|\\\#~]/ /gxms; # delete double spacing now and normalize spacing # to space character $text =~ s{\s++}{ }gsm; # trim both ends $text =~ s/^\s+|\s+$//g; return $text; } # do not use space around punctuation sub _strip_punct() { my ($text) = @_; # replace final punctuation $text =~ s{(?: \s*[,\.;]\s*\Z | # final punctuation \A\s*[,\.;]\s* # punctuation at the beginning )}{ }gxms; # delete double spacing now and normalize spacing # to space character $text =~ s{\s++}{ }gsm; # trim both ends $text =~ s/^\s+|\s+$//g; return $text; } sub check_for_single_bad_license { my ($self, $item, $lowercase, $clean, $tag_name, $license_data) = @_; # do fast keyword search # could make more sense as 'return 1 unless all' but does not work return 0 if none { $lowercase =~ / \Q$_\E /msx } @{$license_data->{keywords}}; return 0 if none { $clean =~ / \Q$_\E /msx }@{$license_data->{sentences}}; my $regex = $license_data->{regex}; return 0 if defined $regex && $clean !~ $regex; my $callsub = $license_data->{callsub}; if (!defined $callsub) { $self->pointed_hint($tag_name, $item->pointer); return 1; } return $self->$callsub($item, $tag_name, %+); } 1; # Local Variables: # indent-tabs-mode: nil # cperl-indent-level: 4 # End: # vim: syntax=perl sw=4 sts=4 sr et