diff options
Diffstat (limited to '')
-rw-r--r-- | utils/classifier_test.pl | 539 |
1 files changed, 539 insertions, 0 deletions
diff --git a/utils/classifier_test.pl b/utils/classifier_test.pl new file mode 100644 index 0000000..238417f --- /dev/null +++ b/utils/classifier_test.pl @@ -0,0 +1,539 @@ +#!/usr/bin/env perl + +use warnings; +use strict; +use Pod::Usage; +use Getopt::Long; +use Time::HiRes qw(gettimeofday tv_interval); +use JSON::XS; +use String::ShellQuote; +use FileHandle; +use IPC::Open2; +use Data::Dumper; + +my $spam_dir; +my $ham_dir; +my $parallel = 1; +my $classifier = "bayes"; +my $spam_symbol = "BAYES_SPAM"; +my $ham_symbol = "BAYES_HAM"; +my $timeout = 10; +my $rspamc = $ENV{'RSPAMC'} || "rspamc"; +my $bogofilter = $ENV{'BOGOFILTER'} || "bogofilter"; +my $dspam = $ENV{'DSPAM'} || "dspam"; +my $train_fraction = 0.5; +my $use_bogofilter = 0; +my $use_dspam = 0; +my $check_only = 0; +my $rspamc_prob_trigger = 95; +my $man; +my $help; + +GetOptions( + "spam|s=s" => \$spam_dir, + "ham|h=s" => \$ham_dir, + "spam-symbol=s" => \$spam_symbol, + "ham-symbol=s" => \$ham_symbol, + "classifier|c=s" => \$classifier, + "timeout|t=f" => \$timeout, + "parallel|p=i" => \$parallel, + "train-fraction|t=f" => \$train_fraction, + "bogofilter|b" => \$use_bogofilter, + "dspam|d" => \$use_dspam, + "check-only" => \$check_only, + "help|?" => \$help, + "man" => \$man +) or pod2usage(2); + +pod2usage(1) if $help; +pod2usage( -exitval => 0, -verbose => 2 ) if $man; + +sub read_dir_files { + my ( $dir, $target ) = @_; + opendir( my $dh, $dir ) or die "cannot open dir $dir: $!"; + while ( my $file = readdir $dh ) { + if ( -f "$dir/$file" ) { + push @{$target}, "$dir/$file"; + } + } +} + +sub shuffle_array { + my ($ar) = @_; + + for ( my $i = 0 ; $i < scalar @{$ar} ; $i++ ) { + if ( $i > 1 ) { + my $sel = int( rand( $i - 1 ) ); + ( @{$ar}[$i], @{$ar}[$sel] ) = ( @{$ar}[$sel], @{$ar}[$i] ); + } + } +} + +sub learn_rspamc { + my ( $files, $spam ) = @_; + my $processed = 0; + + my $cmd = $spam ? "learn_spam" : "learn_ham"; + my $args_quoted = shell_quote @{$files}; + open( my $p, "$rspamc -t $timeout -c $classifier --compact -j -n $parallel $cmd $args_quoted |" ) + or die "cannot spawn $rspamc: $!"; + + while (<$p>) { + my $res = eval('decode_json($_)'); + if ( $res && $res->{'success'} ) { + $processed++; + } + } + + return $processed; +} + +sub learn_bogofilter { + my ( $files, $spam ) = @_; + my $processed = 0; + + foreach my $f ( @{$files} ) { + my $args_quoted = shell_quote $f; + my $fl = $spam ? "-s" : "-n"; + `$bogofilter -I $args_quoted $fl`; + if ( $? == 0 ) { + $processed++; + } + } + + return $processed; +} + +sub learn_dspam { + my ( $files, $spam ) = @_; + my $processed = 0; + + foreach my $f ( @{$files} ) { + my $args_quoted = shell_quote $f; + my $fl = $spam ? "--class=spam" : "--class=innocent"; + open( my $p, "|$dspam --user nobody --source=corpus --stdout --mode=toe $fl" ) + or die "cannot run $dspam: $!"; + + open( my $inp, "< $f" ); + while (<$inp>) { + print $p $_; + } + } + + return $processed; +} + +sub learn_samples { + my ( $ar_ham, $ar_spam ) = @_; + my $len; + my $processed = 0; + my $total = 0; + my $learn_func; + + my @files_spam; + my @files_ham; + + if ($use_dspam) { + $learn_func = \&learn_dspam; + } + elsif ($use_bogofilter) { + $learn_func = \&learn_bogofilter; + } + else { + $learn_func = \&learn_rspamc; + } + + $len = int( scalar @{$ar_ham} * $train_fraction ); + my @cur_vec; + + # Shuffle spam and ham samples + for ( my $i = 0 ; $i < $len ; $i++ ) { + if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) { + push @cur_vec, @{$ar_ham}[$i]; + push @files_ham, [@cur_vec]; + @cur_vec = (); + $total++; + } + else { + push @cur_vec, @{$ar_ham}[$i]; + } + } + + $len = int( scalar @{$ar_spam} * $train_fraction ); + @cur_vec = (); + for ( my $i = 0 ; $i < $len ; $i++ ) { + if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) { + push @cur_vec, @{$ar_spam}[$i]; + push @files_spam, [@cur_vec]; + @cur_vec = (); + $total++; + } + else { + push @cur_vec, @{$ar_spam}[$i]; + } + } + + for ( my $i = 0 ; $i < $total ; $i++ ) { + my $args; + my $spam; + + if ( $i % 2 == 0 ) { + $args = pop @files_spam; + + if ( !$args ) { + $args = pop @files_ham; + $spam = 0; + } + else { + $spam = 1; + } + } + else { + $args = pop @files_ham; + if ( !$args ) { + $args = pop @files_spam; + $spam = 1; + } + else { + $spam = 0; + } + } + + my $r = $learn_func->( $args, $spam ); + if ($r) { + $processed += $r; + } + } + + return $processed; +} + +sub check_rspamc { + my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; + + my $args_quoted = shell_quote @{$files}; + my $processed = 0; + + open( + my $p, +"$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted |" + ) or die "cannot spawn $rspamc: $!"; + + while (<$p>) { + my $res = eval('decode_json($_)'); + if ( $res && $res->{'default'} ) { + $processed++; + + if ($spam) { + if ( $res->{'default'}->{$ham_symbol} ) { + my $m = $res->{'default'}->{$ham_symbol}->{'options'}->[0]; + if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) { + my $percentage = int($1); + if ( $percentage >= $rspamc_prob_trigger ) { + $$fp_cnt++; + } + } + else { + $$fp_cnt++; + } + } + elsif ( !$res->{'default'}->{$spam_symbol} ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } + else { + if ( $res->{'default'}->{$spam_symbol} ) { + my $m = $res->{'default'}->{$spam_symbol}->{'options'}->[0]; + if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) { + + my $percentage = int($1); + if ( $percentage >= $rspamc_prob_trigger ) { + $$fp_cnt++; + } + } + else { + $$fp_cnt++; + } + } + elsif ( !$res->{'default'}->{$ham_symbol} ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } + } + } + + return $processed; +} + +sub check_bogofilter { + my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; + my $processed = 0; + + foreach my $f ( @{$files} ) { + my $args_quoted = shell_quote $f; + + open( my $p, "$bogofilter -t -I $args_quoted |" ) + or die "cannot spawn $bogofilter: $!"; + + while (<$p>) { + if ( $_ =~ /^([SHU])\s+.*$/ ) { + $processed++; + + if ($spam) { + if ( $1 eq 'H' ) { + $$fp_cnt++; + } + elsif ( $1 eq 'U' ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } + else { + if ( $1 eq 'S' ) { + $$fp_cnt++; + } + elsif ( $1 eq 'U' ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } + } + } + } + + return $processed; +} + +sub check_dspam { + my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; + my $processed = 0; + + foreach my $f ( @{$files} ) { + my $args_quoted = shell_quote $f; + + my $pid = open2( *Reader, *Writer, "$dspam --user nobody --classify --stdout --mode=notrain" ); + open( my $inp, "< $f" ); + while (<$inp>) { + print Writer $_; + } + close Writer; + + while (<Reader>) { + if ( $_ =~ qr(^X-DSPAM-Result: nobody; result="([^"]+)"; class="[^"]+"; probability=(\d+(?:\.\d+)?).*$) ) { + $processed++; + my $percentage = int( $2 * 100.0 ); + + if ($spam) { + if ( $1 eq 'Innocent' ) { + if ( $percentage <= ( 100 - $rspamc_prob_trigger ) ) { + $$fp_cnt++; + } + } + elsif ( $1 ne 'Spam' ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } + else { + if ( $1 eq 'Spam' ) { + if ( $percentage >= $rspamc_prob_trigger ) { + $$fp_cnt++; + } + } + elsif ( $1 ne 'Innocent' ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } + } + } + close Reader; + waitpid( $pid, 0 ); + } + + return $processed; +} + +sub cross_validate { + my ($hr) = @_; + my $args = ""; + my $processed = 0; + my $fp_spam = 0; + my $fn_spam = 0; + my $fp_ham = 0; + my $fn_ham = 0; + my $total_spam = 0; + my $total_ham = 0; + my $detected_spam = 0; + my $detected_ham = 0; + my $i = 0; + my $len = scalar keys %{$hr}; + my @files_spam; + my @files_ham; + my @cur_spam; + my @cur_ham; + my $check_func; + + if ($use_dspam) { + $check_func = \&check_dspam; + } + elsif ($use_bogofilter) { + $check_func = \&check_bogofilter; + } + else { + $check_func = \&check_rspamc; + } + + while ( my ( $fn, $spam ) = each( %{$hr} ) ) { + if ($spam) { + if ( scalar @cur_spam >= $parallel || $i == $len - 1 ) { + push @cur_spam, $fn; + push @files_spam, [@cur_spam]; + @cur_spam = (); + } + else { + push @cur_spam, $fn; + } + } + else { + if ( scalar @cur_ham >= $parallel || $i == $len - 1 ) { + push @cur_ham, $fn; + push @files_ham, [@cur_ham]; + @cur_ham = (); + } + else { + push @cur_ham, $fn; + } + } + } + + shuffle_array( \@files_spam ); + + foreach my $fn (@files_spam) { + my $r = $check_func->( $fn, 1, \$fp_ham, \$fn_spam, \$detected_spam ); + $total_spam += $r; + $processed += $r; + } + + shuffle_array( \@files_ham ); + + foreach my $fn (@files_ham) { + my $r = $check_func->( $fn, 0, \$fp_spam, \$fn_ham, \$detected_ham ); + $total_ham += $r; + $processed += $r; + } + + printf "Scanned %d messages +%d spam messages (%d detected) +%d ham messages (%d detected)\n", $processed, $total_spam, $detected_spam, $total_ham, $detected_ham; + + printf "\nHam FP rate: %.2f%% (%d messages) +Ham FN rate: %.2f%% (%d messages)\n", $fp_ham / $total_ham * 100.0, $fp_ham, $fn_ham / $total_ham * 100.0, $fn_ham; + + printf "\nSpam FP rate: %.2f%% (%d messages) +Spam FN rate: %.2f%% (%d messages)\n", + $fp_spam / $total_spam * 100.0, $fp_spam, + $fn_spam / $total_spam * 100.0, $fn_spam; +} + +if ( !$spam_dir || !$ham_dir ) { + die "spam or/and ham directories are not specified"; +} + +my @spam_samples; +my @ham_samples; + +read_dir_files( $spam_dir, \@spam_samples ); +read_dir_files( $ham_dir, \@ham_samples ); +shuffle_array( \@spam_samples ); +shuffle_array( \@ham_samples ); + +if ( !$check_only ) { + my $learned = 0; + my $t0 = [gettimeofday]; + $learned = learn_samples( \@ham_samples, \@spam_samples ); + my $t1 = [gettimeofday]; + + printf "Learned classifier, %d items processed, %.2f seconds elapsed\n", $learned, tv_interval( $t0, $t1 ); +} + +my %validation_set; +my $len = int( scalar @spam_samples * $train_fraction ); +for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) { + $validation_set{ $spam_samples[$i] } = 1; +} + +$len = int( scalar @ham_samples * $train_fraction ); +for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) { + $validation_set{ $ham_samples[$i] } = 0; +} + +cross_validate( \%validation_set ); + +__END__ + +=head1 NAME + +classifier_test.pl - test various parameters for a classifier + +=head1 SYNOPSIS + +classifier_test.pl [options] + + Options: + --spam Directory with spam files + --ham Directory with ham files + --spam-symbol Symbol for spam (default: BAYES_SPAM) + --ham-symbol Symbol for ham (default: BAYES_HAM) + --classifier Classifier to test (default: bayes) + --timeout Timeout for rspamc (default: 10) + --parallel Parallel execution (default: 1) + --help Brief help message + --man Full documentation + +=head1 OPTIONS + +=over 8 + +=item B<--spam> + +Directory with spam files. + +=item B<--ham> + +Directory with ham files. + +=item B<--classifier> + +Specifies classifier name to test. + +=item B<--help> + +Print a brief help message and exits. + +=item B<--man> + +Prints the manual page and exits. + +=back + +=head1 DESCRIPTION + +B<classifier_test.pl> is intended to test Rspamd classifier for false positives, false negatives and other parameters. +It uses half of the corpus for training and half for cross-validation. + +=cut |