#!/usr/bin/env perl use warnings; use strict; use Pod::Usage; use Getopt::Long; use Time::HiRes qw(gettimeofday tv_interval); use JSON::XS; use String::ShellQuote; use FileHandle; use IPC::Open2; use Data::Dumper; my $spam_dir; my $ham_dir; my $parallel = 1; my $classifier = "bayes"; my $spam_symbol = "BAYES_SPAM"; my $ham_symbol = "BAYES_HAM"; my $timeout = 10; my $rspamc = $ENV{'RSPAMC'} || "rspamc"; my $bogofilter = $ENV{'BOGOFILTER'} || "bogofilter"; my $dspam = $ENV{'DSPAM'} || "dspam"; my $train_fraction = 0.5; my $use_bogofilter = 0; my $use_dspam = 0; my $check_only = 0; my $rspamc_prob_trigger = 95; my $man; my $help; GetOptions( "spam|s=s" => \$spam_dir, "ham|h=s" => \$ham_dir, "spam-symbol=s" => \$spam_symbol, "ham-symbol=s" => \$ham_symbol, "classifier|c=s" => \$classifier, "timeout|t=f" => \$timeout, "parallel|p=i" => \$parallel, "train-fraction|t=f" => \$train_fraction, "bogofilter|b" => \$use_bogofilter, "dspam|d" => \$use_dspam, "check-only" => \$check_only, "help|?" => \$help, "man" => \$man ) or pod2usage(2); pod2usage(1) if $help; pod2usage( -exitval => 0, -verbose => 2 ) if $man; sub read_dir_files { my ( $dir, $target ) = @_; opendir( my $dh, $dir ) or die "cannot open dir $dir: $!"; while ( my $file = readdir $dh ) { if ( -f "$dir/$file" ) { push @{$target}, "$dir/$file"; } } } sub shuffle_array { my ($ar) = @_; for ( my $i = 0 ; $i < scalar @{$ar} ; $i++ ) { if ( $i > 1 ) { my $sel = int( rand( $i - 1 ) ); ( @{$ar}[$i], @{$ar}[$sel] ) = ( @{$ar}[$sel], @{$ar}[$i] ); } } } sub learn_rspamc { my ( $files, $spam ) = @_; my $processed = 0; my $cmd = $spam ? "learn_spam" : "learn_ham"; my $args_quoted = shell_quote @{$files}; open( my $p, "$rspamc -t $timeout -c $classifier --compact -j -n $parallel $cmd $args_quoted |" ) or die "cannot spawn $rspamc: $!"; while (<$p>) { my $res = eval('decode_json($_)'); if ( $res && $res->{'success'} ) { $processed++; } } return $processed; } sub learn_bogofilter { my ( $files, $spam ) = @_; my $processed = 0; foreach my $f ( @{$files} ) { my $args_quoted = shell_quote $f; my $fl = $spam ? "-s" : "-n"; `$bogofilter -I $args_quoted $fl`; if ( $? == 0 ) { $processed++; } } return $processed; } sub learn_dspam { my ( $files, $spam ) = @_; my $processed = 0; foreach my $f ( @{$files} ) { my $args_quoted = shell_quote $f; my $fl = $spam ? "--class=spam" : "--class=innocent"; open( my $p, "|$dspam --user nobody --source=corpus --stdout --mode=toe $fl" ) or die "cannot run $dspam: $!"; open( my $inp, "< $f" ); while (<$inp>) { print $p $_; } } return $processed; } sub learn_samples { my ( $ar_ham, $ar_spam ) = @_; my $len; my $processed = 0; my $total = 0; my $learn_func; my @files_spam; my @files_ham; if ($use_dspam) { $learn_func = \&learn_dspam; } elsif ($use_bogofilter) { $learn_func = \&learn_bogofilter; } else { $learn_func = \&learn_rspamc; } $len = int( scalar @{$ar_ham} * $train_fraction ); my @cur_vec; # Shuffle spam and ham samples for ( my $i = 0 ; $i < $len ; $i++ ) { if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) { push @cur_vec, @{$ar_ham}[$i]; push @files_ham, [@cur_vec]; @cur_vec = (); $total++; } else { push @cur_vec, @{$ar_ham}[$i]; } } $len = int( scalar @{$ar_spam} * $train_fraction ); @cur_vec = (); for ( my $i = 0 ; $i < $len ; $i++ ) { if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) { push @cur_vec, @{$ar_spam}[$i]; push @files_spam, [@cur_vec]; @cur_vec = (); $total++; } else { push @cur_vec, @{$ar_spam}[$i]; } } for ( my $i = 0 ; $i < $total ; $i++ ) { my $args; my $spam; if ( $i % 2 == 0 ) { $args = pop @files_spam; if ( !$args ) { $args = pop @files_ham; $spam = 0; } else { $spam = 1; } } else { $args = pop @files_ham; if ( !$args ) { $args = pop @files_spam; $spam = 1; } else { $spam = 0; } } my $r = $learn_func->( $args, $spam ); if ($r) { $processed += $r; } } return $processed; } sub check_rspamc { my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; my $args_quoted = shell_quote @{$files}; my $processed = 0; open( my $p, "$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted |" ) or die "cannot spawn $rspamc: $!"; while (<$p>) { my $res = eval('decode_json($_)'); if ( $res && $res->{'default'} ) { $processed++; if ($spam) { if ( $res->{'default'}->{$ham_symbol} ) { my $m = $res->{'default'}->{$ham_symbol}->{'options'}->[0]; if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) { my $percentage = int($1); if ( $percentage >= $rspamc_prob_trigger ) { $$fp_cnt++; } } else { $$fp_cnt++; } } elsif ( !$res->{'default'}->{$spam_symbol} ) { $$fn_cnt++; } else { $$detected_cnt++; } } else { if ( $res->{'default'}->{$spam_symbol} ) { my $m = $res->{'default'}->{$spam_symbol}->{'options'}->[0]; if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) { my $percentage = int($1); if ( $percentage >= $rspamc_prob_trigger ) { $$fp_cnt++; } } else { $$fp_cnt++; } } elsif ( !$res->{'default'}->{$ham_symbol} ) { $$fn_cnt++; } else { $$detected_cnt++; } } } } return $processed; } sub check_bogofilter { my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; my $processed = 0; foreach my $f ( @{$files} ) { my $args_quoted = shell_quote $f; open( my $p, "$bogofilter -t -I $args_quoted |" ) or die "cannot spawn $bogofilter: $!"; while (<$p>) { if ( $_ =~ /^([SHU])\s+.*$/ ) { $processed++; if ($spam) { if ( $1 eq 'H' ) { $$fp_cnt++; } elsif ( $1 eq 'U' ) { $$fn_cnt++; } else { $$detected_cnt++; } } else { if ( $1 eq 'S' ) { $$fp_cnt++; } elsif ( $1 eq 'U' ) { $$fn_cnt++; } else { $$detected_cnt++; } } } } } return $processed; } sub check_dspam { my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; my $processed = 0; foreach my $f ( @{$files} ) { my $args_quoted = shell_quote $f; my $pid = open2( *Reader, *Writer, "$dspam --user nobody --classify --stdout --mode=notrain" ); open( my $inp, "< $f" ); while (<$inp>) { print Writer $_; } close Writer; while () { if ( $_ =~ qr(^X-DSPAM-Result: nobody; result="([^"]+)"; class="[^"]+"; probability=(\d+(?:\.\d+)?).*$) ) { $processed++; my $percentage = int( $2 * 100.0 ); if ($spam) { if ( $1 eq 'Innocent' ) { if ( $percentage <= ( 100 - $rspamc_prob_trigger ) ) { $$fp_cnt++; } } elsif ( $1 ne 'Spam' ) { $$fn_cnt++; } else { $$detected_cnt++; } } else { if ( $1 eq 'Spam' ) { if ( $percentage >= $rspamc_prob_trigger ) { $$fp_cnt++; } } elsif ( $1 ne 'Innocent' ) { $$fn_cnt++; } else { $$detected_cnt++; } } } } close Reader; waitpid( $pid, 0 ); } return $processed; } sub cross_validate { my ($hr) = @_; my $args = ""; my $processed = 0; my $fp_spam = 0; my $fn_spam = 0; my $fp_ham = 0; my $fn_ham = 0; my $total_spam = 0; my $total_ham = 0; my $detected_spam = 0; my $detected_ham = 0; my $i = 0; my $len = scalar keys %{$hr}; my @files_spam; my @files_ham; my @cur_spam; my @cur_ham; my $check_func; if ($use_dspam) { $check_func = \&check_dspam; } elsif ($use_bogofilter) { $check_func = \&check_bogofilter; } else { $check_func = \&check_rspamc; } while ( my ( $fn, $spam ) = each( %{$hr} ) ) { if ($spam) { if ( scalar @cur_spam >= $parallel || $i == $len - 1 ) { push @cur_spam, $fn; push @files_spam, [@cur_spam]; @cur_spam = (); } else { push @cur_spam, $fn; } } else { if ( scalar @cur_ham >= $parallel || $i == $len - 1 ) { push @cur_ham, $fn; push @files_ham, [@cur_ham]; @cur_ham = (); } else { push @cur_ham, $fn; } } } shuffle_array( \@files_spam ); foreach my $fn (@files_spam) { my $r = $check_func->( $fn, 1, \$fp_ham, \$fn_spam, \$detected_spam ); $total_spam += $r; $processed += $r; } shuffle_array( \@files_ham ); foreach my $fn (@files_ham) { my $r = $check_func->( $fn, 0, \$fp_spam, \$fn_ham, \$detected_ham ); $total_ham += $r; $processed += $r; } printf "Scanned %d messages %d spam messages (%d detected) %d ham messages (%d detected)\n", $processed, $total_spam, $detected_spam, $total_ham, $detected_ham; printf "\nHam FP rate: %.2f%% (%d messages) Ham FN rate: %.2f%% (%d messages)\n", $fp_ham / $total_ham * 100.0, $fp_ham, $fn_ham / $total_ham * 100.0, $fn_ham; printf "\nSpam FP rate: %.2f%% (%d messages) Spam FN rate: %.2f%% (%d messages)\n", $fp_spam / $total_spam * 100.0, $fp_spam, $fn_spam / $total_spam * 100.0, $fn_spam; } if ( !$spam_dir || !$ham_dir ) { die "spam or/and ham directories are not specified"; } my @spam_samples; my @ham_samples; read_dir_files( $spam_dir, \@spam_samples ); read_dir_files( $ham_dir, \@ham_samples ); shuffle_array( \@spam_samples ); shuffle_array( \@ham_samples ); if ( !$check_only ) { my $learned = 0; my $t0 = [gettimeofday]; $learned = learn_samples( \@ham_samples, \@spam_samples ); my $t1 = [gettimeofday]; printf "Learned classifier, %d items processed, %.2f seconds elapsed\n", $learned, tv_interval( $t0, $t1 ); } my %validation_set; my $len = int( scalar @spam_samples * $train_fraction ); for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) { $validation_set{ $spam_samples[$i] } = 1; } $len = int( scalar @ham_samples * $train_fraction ); for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) { $validation_set{ $ham_samples[$i] } = 0; } cross_validate( \%validation_set ); __END__ =head1 NAME classifier_test.pl - test various parameters for a classifier =head1 SYNOPSIS classifier_test.pl [options] Options: --spam Directory with spam files --ham Directory with ham files --spam-symbol Symbol for spam (default: BAYES_SPAM) --ham-symbol Symbol for ham (default: BAYES_HAM) --classifier Classifier to test (default: bayes) --timeout Timeout for rspamc (default: 10) --parallel Parallel execution (default: 1) --help Brief help message --man Full documentation =head1 OPTIONS =over 8 =item B<--spam> Directory with spam files. =item B<--ham> Directory with ham files. =item B<--classifier> Specifies classifier name to test. =item B<--help> Print a brief help message and exits. =item B<--man> Prints the manual page and exits. =back =head1 DESCRIPTION B is intended to test Rspamd classifier for false positives, false negatives and other parameters. It uses half of the corpus for training and half for cross-validation. =cut