diff options
Diffstat (limited to '')
-rwxr-xr-x | utils/rspamd_stats.pl | 1018 |
1 files changed, 1018 insertions, 0 deletions
diff --git a/utils/rspamd_stats.pl b/utils/rspamd_stats.pl new file mode 100755 index 0000000..9c5f2ac --- /dev/null +++ b/utils/rspamd_stats.pl @@ -0,0 +1,1018 @@ +#!/usr/bin/env perl + +use 5.010; +use Data::Dumper; +use Getopt::Long; +use Pod::Usage; +use Time::Local; +use IO::Handle; +use warnings; +use strict; + +my @symbols_search; +my @symbols_exclude; +my @symbols_bidirectional; +my @symbols_groups; +my @symbols_ignored; +my %symbols_mult; +my %groups; +my $reject_score = 15.0; +my $junk_score = 6.0; +my $diff_alpha = 0.1; +my $correlations = 0; +my $nrelated = 10; +my $log_file = ""; +my $search_pattern = ""; +my $startTime = ""; +my $endTime; +my $num_logs; +my $exclude_logs = 0; +my $man = 0; +my $json = 0; +my $help = 0; + +# Associate file extensions with decompressors +my %decompressor = ( + 'bz2' => 'bzip2 -cd', + 'gz' => 'gzip -cd', + 'xz' => 'xz -cd', + 'zst' => 'zstd -cd', +); + +GetOptions( + "reject-score|r=f" => \$reject_score, + "junk-score|j=f" => \$junk_score, + "symbol|s=s@" => \@symbols_search, + "symbol-bidir|S=s@" => \@symbols_bidirectional, + "exclude|X=s@" => \@symbols_exclude, + "ignore=s@" => \@symbols_ignored, + "group|g=s@" => \@symbols_groups, + "log|l=s" => \$log_file, + "mult=s" => \%symbols_mult, + "alpha-score|alpha|a=f" => \$diff_alpha, + "correlations|c" => \$correlations, + "nrelated=i" => \$nrelated, + "search-pattern=s" => \$search_pattern, + "start=s" => \$startTime, + "end=s" => \$endTime, + "num-logs|n=i" => \$num_logs, + "exclude-logs|x=i" => \$exclude_logs, + "json|j" => \$json, + "help|?" => \$help, + "man" => \$man +) or pod2usage(2); + +pod2usage(1) if $help; +pod2usage( -exitval => 0, -verbose => 2 ) if $man; + +# Global vars +my $total = 0; +my $total_spam = 0; +my $total_junk = 0; +my $junk_symbols = 0; +my $spam_symbols = 0; +my $ham_symbols = 0; +my $ham_spam_change = 0; +my $ham_junk_change = 0; +my %sym_res; +my $rspamd_log; +my $enabled = 0; +my $log_file_num = 1; +my $spinner_update_time = 0; + +my %action; +my %timeStamp; +my %scanTime = ( + max => 0, + total => 0, +); +my %bidir_match; + +foreach ( $startTime, $endTime ) { $_ = &normalized_time($_) } + +# Convert bidirectional symbols +foreach my $s (@symbols_bidirectional) { + $bidir_match{$s} = { + spam => "${s}_SPAM", + ham => "${s}_HAM", + }; + push @symbols_search, $s unless grep /^$s$/, @symbols_search; +} + +# Deal with groups +my $group_id = 0; +foreach my $g (@symbols_groups) { + my @symbols = split /,/, $g; + my $group_name = "group$group_id"; + + foreach my $s (@symbols) { + $groups{$s} = $group_name; + push @symbols_search, $s unless grep /^$s$/, @symbols_search; + } +} + +@symbols_search = '.*' + unless @symbols_search; + +if ( $log_file eq '-' || $log_file eq '' ) { + $rspamd_log = \*STDIN; + &ProcessLog(); +} +elsif ( -d "$log_file" ) { + my $log_dir = "$log_file"; + + my @logs = &GetLogfilesList($log_dir); + + # Process logs + foreach (@logs) { + my $ext = (/[^.]+\.?([^.]*?)$/)[0]; + my $dc = $decompressor{$ext} || 'cat'; + + open( $rspamd_log, "-|", "$dc $log_dir/$_" ) + or die "cannot execute $dc $log_dir/$_ : $!"; + + printf { interactive(*STDERR) } "\033[J Parsing log files: [%d/%d] %s\033[G", $log_file_num++, scalar @logs, + $_; + $spinner_update_time = 0; # Force spinner update + &spinner; + + &ProcessLog; + + close($rspamd_log) + or warn "cannot close $dc $log_dir/$_: $!"; + } + print { interactive(*STDERR) } "\033[J\033[G"; # Progress indicator clean-up +} +else { + my $ext = ( $log_file =~ /[^.]+\.?([^.]*?)$/ )[0]; + my $dc = $decompressor{$ext} || 'cat'; + open( $rspamd_log, "-|", "$dc $log_file" ) + or die "cannot execute $dc $log_file : $!"; + $spinner_update_time = 0; # Force spinner update + &spinner; + &ProcessLog(); +} + +my $total_ham = $total - ( $total_spam + $total_junk ); + +if ($json) { + print "{"; + &Summary(); + print '"symbols":{'; + &SymbolsStat(); + print "}}\n"; +} +else { + &SymbolsStat(); + &Summary(); +} + +exit; + +sub IsIgnored { + my ($sym) = @_; + + foreach my $ex (@symbols_ignored) { + if ( $sym =~ /^$ex$/ ) { + return 1; + } + } + + return 0; +} + +sub GenRelated { + my ( $htb, $target_sym ) = @_; + + my @result; + my $i = 0; + foreach my $sym ( sort { $htb->{$b} <=> $htb->{$a} } keys %{$htb} ) { + if ( $sym ne $target_sym ) { + my @elt = ( $sym, $htb->{$sym} ); + push @result, \@elt; + $i++; + } + + last if $i > $nrelated; + } + + return \@result; +} + +sub StringifyRelated { + my ( $ar, $total ) = @_; + return + join( "\n", ( map { sprintf "\t%s(%s: %.1f%%)", $_->[0], $_->[1], $_->[1] / ( $total * 1.0 ) * 100.0 } @{$ar} ) ); +} + +sub SymbolsStat { + if ( $total > 0 ) { + my $has_comma = 0; + while ( my ( $s, $r ) = each(%sym_res) ) { + if ( $r->{hits} > 0 ) { + my $th = $r->{hits}; + my $sh = $r->{spam_hits}; + my $jh = $r->{junk_hits}; + my $hh = $r->{hits} - $sh - $jh; + my ( $htp, $stp, $jtp ); + $htp = $hh * 100.0 / $total_ham if $total_ham != 0; + $stp = $sh * 100.0 / $total_spam if $total_spam != 0; + $jtp = $jh * 100.0 / $total_junk if $total_junk != 0; + + if ($json) { + if ($has_comma) { + print ","; + } + else { + $has_comma = 1; + } + print "\"$s\":{"; + JsonObjectElt( "avg_weight", $r->{'weight'}, "%.4f" ); + print ","; + JsonObjectElt( "hits", $th, "%d" ); + print ","; + JsonObjectElt( "hits_percentage", $th / $total, "%.4f" ); + print ","; + JsonObjectElt( "spam_hits", $sh, "%d" ); + print ","; + JsonObjectElt( "spam_to_total", $sh / $th, "%.4f" ); + print ","; + JsonObjectElt( "spam_percentage", $stp / 100.0 || 0, "%.4f" ); + print ","; + JsonObjectElt( "ham_hits", $hh, "%d" ); + print ","; + JsonObjectElt( "ham_to_total", $hh / $th, "%.4f" ); + print ","; + JsonObjectElt( "ham_percentage", $htp / 100.0 || 0, "%.4f" ); + print ","; + JsonObjectElt( "junk_hits", $jh, "%d" ); + print ","; + JsonObjectElt( "junk_to_total", $jh / $th, "%.4f" ); + print ","; + JsonObjectElt( "junk_percentage", $jtp / 100.0 || 0, "%.4f" ); + } + else { + printf "%s avg. weight %.3f, hits %d(%.3f%%): + Ham %7.3f%%, %6d/%-6d (%7.3f%%) + Spam %7.3f%%, %6d/%-6d (%7.3f%%) + Junk %7.3f%%, %6d/%-6d (%7.3f%%) +", $s, $r->{weight} / $r->{hits}, $th, ( $th / $total * 100 ), + ( $hh / $th * 100 ), $hh, $total_ham, ( $htp or 0 ), + ( $sh / $th * 100 ), $sh, $total_spam, ( $stp or 0 ), + ( $jh / $th * 100 ), $jh, $total_junk, ( $jtp or 0 ); + } + my ( $schp, $jchp ); + $schp = $r->{spam_change} / $total_spam * 100.0 if $total_spam; + $jchp = $r->{junk_change} / $total_junk * 100.0 if $total_junk; + + if ( $r->{weight} != 0 ) { + if ( !$json ) { + if ( $r->{weight} > 0 ) { + printf " +Spam changes (ham/junk -> spam): %6d/%-6d (%7.3f%%) +Spam changes / total spam hits: %6d/%-6d (%7.3f%%) +Junk changes (ham -> junk): %6d/%-6d (%7.3f%%) +Junk changes / total junk hits: %6d/%-6d (%7.3f%%) +", + $r->{spam_change}, $th, ( $r->{spam_change} / $th * 100 ), + $r->{spam_change}, $total_spam, ( $schp or 0 ), + $r->{junk_change}, $th, ( $r->{junk_change} / $th * 100 ), + $r->{junk_change}, $total_junk, ( $jchp or 0 ); + } + else { + printf " +Spam changes (spam -> junk/ham): %6d/%-6d (%7.3f%%) +Spam changes / total spam hits : %6d/%-6d (%7.3f%%) +Junk changes (junk -> ham) : %6d/%-6d (%7.3f%%) +Junk changes / total junk hits : %6d/%-6d (%7.3f%%) +", + $r->{spam_change}, $th, ( $r->{spam_change} / $th * 100 ), + $r->{spam_change}, $total_spam, ( $schp or 0 ), + $r->{junk_change}, $th, ( $r->{junk_change} / $th * 100 ), + $r->{junk_change}, $total_junk, ( $jchp or 0 ); + } + } + else { + print ","; + JsonObjectElt( "spam_change", $r->{spam_change}, "%.4f" ); + print ","; + JsonObjectElt( "junk_change", $r->{junk_change}, "%.4f" ); + } + } + + if ($correlations) { + + my $spam_related = GenRelated( $r->{symbols_met_spam}, $s ); + my $junk_related = GenRelated( $r->{symbols_met_junk}, $s ); + my $ham_related = GenRelated( $r->{symbols_met_ham}, $s ); + + if ( !$json ) { + print "Correlations report:\n"; + + while ( my ( $cs, $hits ) = each %{ $r->{corr} } ) { + my $corr_prob = $r->{'hits'} / $total; + my $merged_hits = 0; + if ( $r->{symbols_met_spam}->{$cs} ) { + $merged_hits += $r->{symbols_met_spam}->{$cs}; + } + if ( $r->{symbols_met_junk}->{$cs} ) { + $merged_hits += $r->{symbols_met_junk}->{$cs}; + } + if ( $r->{symbols_met_ham}->{$cs} ) { + $merged_hits += $r->{symbols_met_ham}->{$cs}; + } + + if ( $merged_hits > 0 ) { + printf "Probability of %s when %s fires: %.3f\n", $cs, $s, + ( ( $merged_hits / $total ) / $corr_prob ); + } + } + + print "Related symbols report:\n"; + printf "Top related in spam:\n %s\n", StringifyRelated( $spam_related, $r->{spam_hits} ); + printf "Top related in junk:\n %s\n", StringifyRelated( $junk_related, $r->{junk_hits} ); + printf "Top related in ham:\n %s\n", + StringifyRelated( $ham_related, $r->{hits} - $r->{spam_hits} - $r->{junk_hits} ); + } + else { + print ","; + print "\"correllations\":{"; + + my $has_comma_ = 0; + while ( my ( $cs, $hits ) = each %{ $r->{corr} } ) { + if ($has_comma_) { + print ","; + } + else { + $has_comma_ = 1; + } + my $corr_prob = $hits / $total; + my $sym_prob = $r->{hits} / $total; + JsonObjectElt( $cs, ( $corr_prob / $sym_prob ), "%.4f" ); + } + + print "}"; + } + } + + print "}" if $json; + } + else { + print "Symbol $s has not been met\n" if !$json; + } + + print '-' x 80 . "\n" if !$json; + } + } +} + +sub Summary() { + if ( !$json ) { + print " +=== Summary ", '=' x 68, " +Messages scanned: $total"; + printf " [ %s / %s ] +", $timeStamp{'start'}, $timeStamp{'end'} + if defined $timeStamp{'start'}; + say ''; + printf "%11s: %6.2f%%, %d\n", $_, 100 * $action{$_} / $total, $action{$_} for sort keys %action; + say ''; + printf "scan time min/avg/max = %.2f/%.2f/%.2f s +", $scanTime{'min'} / 1000, ($total) ? $scanTime{'total'} / $total / 1000 : undef, $scanTime{'max'} / 1000 + if exists $scanTime{'min'}; + say '=' x 80; + } + else { + JsonObjectElt( "total", $total, "%d" ); + print ","; + + if ( defined $timeStamp{'start'} ) { + JsonObjectElt( "start", $timeStamp{'start'} ); + print ","; + } + + if ( defined $timeStamp{'end'} ) { + JsonObjectElt( "end", $timeStamp{'end'} ); + print ","; + } + + print "\"actions\":{"; + + my $has_comma = 0; + foreach my $a ( sort keys %action ) { + if ($has_comma) { + print ","; + } + else { + $has_comma = 1; + } + JsonObjectElt( $a, $action{$a}, "%d" ); + } + print "},"; + } +} + +sub ProcessRelated { + my ( $symbols, $target, $source ) = @_; + + foreach my $s ( @{$symbols} ) { + $s =~ /^([^\(]+)(\(([^\)]+)\))?/; + my $sym_name = $1; + my $sym_score = 0; + + if ( $groups{$sym_name} ) { + $sym_name = $groups{$sym_name}; + } + + next if ( $source eq $sym_name ); + + next if IsIgnored($sym_name); + + if ($2) { + $sym_score = $3 * ($symbols_mult{$sym_name} or 1.0); + + if ( abs($sym_score) < $diff_alpha ) { + next; + } + + my $bm = $bidir_match{$sym_name}; + if ($bm) { + if ( $sym_score >= 0 ) { + $sym_name = $bm->{'spam'}; + } + else { + $sym_name = $bm->{'ham'}; + } + } + } + + if ( exists( $target->{$sym_name} ) ) { + $target->{$sym_name}++; + } + else { + $target->{$sym_name} = 1; + } + } +} + +sub ProcessLog { + my ( $ts_format, @line ) = &log_time_format($rspamd_log); + + while () { + last if eof $rspamd_log; + $_ = (@line) ? shift @line : <$rspamd_log>; + + if ( !$enabled && ( $search_pattern eq "" || /$search_pattern/ ) ) { + $enabled = 1; + } + + next if !$enabled; + + if (/^.*rspamd_task_write_log.*$/) { + &spinner; + my $ts; + if ( $ts_format eq 'syslog' ) { + $ts = syslog2iso( join ' ', ( split /\s+/ )[ 0 .. 2 ] ); + } + elsif ( $ts_format eq 'syslog5424' ) { + /^([0-9-]+)T([0-9:]+)/; + $ts = "$1 $2"; + } + else { + $ts = join ' ', ( split /\s+/ )[ 0 .. 1 ]; + } + + next if ( $ts lt $startTime ); + next if ( defined $endTime && $ts gt $endTime ); + + if ( $_ !~ + /\(([^()]+)\): \[(NaN|-?\d+(?:\.\d+)?)\/(-?\d+(?:\.\d+)?)\]\s+\[([^\]]+)\].+? time: (\d+\.\d+)ms/ ) + { + #print "BAD: $_\n"; + next; + } + + my @symbols = split /(?:\{[^}]*\})?(?:$|,)/, $4; + my $scan_time = $5; + my $act = $1; + my $score = $2 * 1.0; + my $skip = 0; + + foreach my $ex (@symbols_exclude) { + my @found = grep { /^$ex/ } @symbols; + + if ( scalar(@found) > 0 ) { + $skip = 1; + last; + } + } + + next if ( $skip != 0 ); + + if ( defined( $timeStamp{'end'} ) ) { + $timeStamp{'end'} = $ts if ( $ts gt $timeStamp{'end'} ); + } + else { + $timeStamp{'end'} = $ts; + } + + if ( defined( $timeStamp{'start'} ) ) { + $timeStamp{'start'} = $ts if ( $ts lt $timeStamp{'start'} ); + } + else { + $timeStamp{'start'} = $ts; + } + + $scanTime{'min'} = $scan_time if ( !exists $scanTime{'min'} || $scanTime{'min'} > $scan_time ); + $scanTime{'max'} = $scan_time if ( $scanTime{'max'} < $scan_time ); + $scanTime{'total'} += $scan_time; + + $action{$act}++; + $total++; + + if ( $score >= $reject_score ) { + $total_spam++; + } + elsif ( $score >= $junk_score ) { + $total_junk++; + } + + my @sym_names; + + foreach my $s (@symbols_search) { + my @selected = grep /$s/, @symbols; + + if ( scalar(@selected) > 0 ) { + + foreach my $sym (@selected) { + $sym =~ /^([^\(]+)(\(([^\)]+)\))?/; + my $sym_name = $1; + my $sym_score = 0; + my $orig_name = $sym_name; + + if ($2) { + $sym_score = $3 * ($symbols_mult{$sym_name} or 1.0); + + if ( abs($sym_score) < $diff_alpha ) { + next; + } + + my $bm = $bidir_match{$sym_name}; + if ($bm) { + if ( $sym_score >= 0 ) { + $sym_name = $bm->{'spam'}; + } + else { + $sym_name = $bm->{'ham'}; + } + } + } + + next if $orig_name !~ /^$s/; + + if ( $groups{$s} ) { + + # Replace with group + $sym_name = $groups{$s}; + } + + push @sym_names, $sym_name; + + if ( !$sym_res{$sym_name} ) { + $sym_res{$sym_name} = { + hits => 0, + spam_hits => 0, + junk_hits => 0, + spam_change => 0, + junk_change => 0, + weight => 0, + corr => {}, + symbols_met_spam => {}, + symbols_met_ham => {}, + symbols_met_junk => {}, + }; + } + + my $r = $sym_res{$sym_name}; + + $r->{hits}++; + $r->{weight} += $sym_score; + my $is_spam = 0; + my $is_junk = 0; + + if ( $score >= $reject_score ) { + $is_spam = 1; + $r->{spam_hits}++; + if ($correlations) { + ProcessRelated( \@symbols, $r->{symbols_met_spam}, $sym_name ); + } + } + elsif ( $score >= $junk_score ) { + $is_junk = 1; + $r->{junk_hits}++; + if ($correlations) { + ProcessRelated( \@symbols, $r->{symbols_met_junk}, $sym_name ); + } + } + else { + if ($correlations) { + ProcessRelated( \@symbols, $r->{symbols_met_ham}, $sym_name ); + } + } + + if ( $sym_score != 0 ) { + my $score_without = $score - $sym_score; + + if ( $sym_score > 0 ) { + if ( $is_spam && $score_without < $reject_score ) { + $r->{spam_change}++; + } + if ( $is_junk && $score_without < $junk_score ) { + $r->{junk_change}++; + } + } + else { + if ( !$is_spam && $score_without >= $reject_score ) { + $r->{spam_change}++; + } + if ( !$is_junk && $score_without >= $junk_score ) { + $r->{junk_change}++; + } + } + } + } # End foreach symbols selected + } + } + + if ($correlations) { + foreach my $sym (@sym_names) { + next if IsIgnored($sym); + my $r = $sym_res{$sym}; + + foreach my $corr_sym (@sym_names) { + if ( $corr_sym ne $sym ) { + if ( $r->{'corr'}->{$corr_sym} ) { + $r->{'corr'}->{$corr_sym}++; + } + else { + $r->{'corr'}->{$corr_sym} = 1; + } + } + } + } # End of correlations check + } + } + } +} + +sub JsonObjectElt() { + my ( $k, $v ) = @_; + my $f = defined $_[2] ? $_[2] : '%s'; + + if ( $f eq "%s" ) { + $f = "\"%s\""; + } + + printf "\"%s\":$f", $k, $v; +} + +sub GetLogfilesList { + my ($dir) = @_; + opendir( my $fh, $dir ) or die $!; + + my $pattern = join( '|', keys %decompressor ); + my $re = qr/\.[0-9]+(?:\.(?:$pattern))?/; + + # Add unnumbered logs first + my @logs = + grep { -f "$dir/$_" && !/$re/ } readdir($fh); + + # Add numbered logs + rewinddir($fh); + push( @logs, ( sort numeric ( grep { -f "$dir/$_" && /$re/ } readdir($fh) ) ) ); + + closedir($fh); + + # Select required logs and revers their order + @logs = + reverse splice( @logs, $exclude_logs, $num_logs ||= @logs - $exclude_logs ); + + # Loop through array printing out filenames + print { interactive(*STDERR) } "\nLog files to process:\n"; + foreach my $file (@logs) { + print { interactive(*STDERR) } " $file\n"; + } + print { interactive(*STDERR) } "\n"; + + return @logs; +} + +sub log_time_format { + my $fh = shift; + my ( $format, $line ); + while (<$fh>) { + $line = $_; + + # 2017-08-08 00:00:01 #66984( + # 2017-08-08 00:00:01.001 #66984( + if (/^\d{4}-\d\d-\d\d \d\d:\d\d:\d\d(\.\d{3,5})? #\d+\(/) { + $format = 'rspamd'; + last; + } + + # Aug 8 00:02:50 #66986( + elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d #\d+\(/) { + $format = 'syslog'; + last; + } + + # Aug 8 00:02:50 hostname rspamd[66986] + elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d \S+ rspamd\[\d+\]/) { + $format = 'syslog'; + last; + } + + # 2018-04-16T06:25:46.012590+02:00 rspamd rspamd[12968] + elsif (/\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,6})?(Z|[-+]\d{2}:\d{2}) \S+ rspamd\[\d+\]/) { + $format = 'syslog5424'; + last; + } + + # Skip newsyslog messages + # Aug 8 00:00:00 hostname newsyslog[63284]: logfile turned over + elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d\ \S+ newsyslog\[\d+\]: logfile turned over$/) { + next; + } + + # Skip journalctl messages + # -- Logs begin at Mon 2018-01-15 11:16:24 MSK, end at Fri 2018-04-27 09:10:30 MSK. -- + elsif ( +/^-- Logs begin at \w{3} \d{4}-\d\d-\d\d \d\d:\d\d:\d\d [A-Z]{3}, end at \w{3} \d{4}-\d\d-\d\d \d\d:\d\d:\d\d [A-Z]{3}\. --$/ + ) + { + next; + } + else { + print "Unknown log format\n"; + exit 1; + } + } + return ( $format, $line ); +} + +sub normalized_time { + return + if !defined( $_ = shift ); + + /^\d\d(?::\d\d){0,2}$/ + ? sprintf '%04d-%02d-%02d %s', 1900 + (localtime)[5], 1 + (localtime)[4], (localtime)[3], $_ + : $_; +} + +sub numeric { + $a =~ /\.(\d+)\./; + my $a_num = $1; + $b =~ /\.(\d+)\./; + my $b_num = $1; + + $a_num <=> $b_num; +} + +sub spinner { + my @spinner = qw{/ - \ |}; + return + if ( ( time - $spinner_update_time ) < 1 ); + $spinner_update_time = time; + printf { interactive(*STDERR) } "%s\r", $spinner[ $spinner_update_time % @spinner ]; + select()->flush(); +} + +# Convert syslog timestamp to "ISO 8601 like" format +# using current year as syslog does not record the year (nor the timezone) +# or the last year if the guessed time is in the future. +sub syslog2iso { + my %month_map; + @month_map{qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)} = 0 .. 11; + + my ( $month, @t ) = $_[0] =~ m/^(\w{3}) \s\s? (\d\d?) \s (\d\d):(\d\d):(\d\d)/x; + my $epoch = + timelocal( ( reverse @t ), $month_map{$month}, 1900 + (localtime)[5] ); + sprintf '%04d-%02d-%02d %02d:%02d:%02d', 1900 + (localtime)[5] - ( $epoch > time ), $month_map{$month} + 1, @t; +} + +### Imported from IO::Interactive 1.022 Perl module +sub is_interactive { + ## no critic (ProhibitInteractiveTest) + + my ($out_handle) = ( @_, select ); # Default to default output handle + + # Not interactive if output is not to terminal... + return 0 if not -t $out_handle; + + # If *ARGV is opened, we're interactive if... + if ( tied(*ARGV) or defined( fileno(ARGV) ) ) { # this is what 'Scalar::Util::openhandle *ARGV' boils down to + + # ...it's currently opened to the magic '-' file + return -t *STDIN if defined $ARGV && $ARGV eq '-'; + + # ...it's at end-of-file and the next file is the magic '-' file + return @ARGV > 0 && $ARGV[0] eq '-' && -t *STDIN if eof *ARGV; + + # ...it's directly attached to the terminal + return -t *ARGV; + } + + # If *ARGV isn't opened, it will be interactive if *STDIN is attached + # to a terminal. + else { + return -t *STDIN; + } +} + +### Imported from IO::Interactive 1.022 Perl module +local ( *DEV_NULL, *DEV_NULL2 ); +my $dev_null; + +BEGIN { + pipe *DEV_NULL, *DEV_NULL2 + or die "Internal error: can't create null filehandle"; + $dev_null = \*DEV_NULL; +} + +### Imported from IO::Interactive 1.022 Perl module +sub interactive { + my ($out_handle) = ( @_, \*STDOUT ); # Default to STDOUT + return &is_interactive ? $out_handle : $dev_null; +} + +__END__ + +=head1 NAME + +rspamd_stats - analyze Rspamd rules by parsing log files + +=head1 SYNOPSIS + +rspamd_stats [options] [--symbol=SYM1 [--symbol=SYM2...]] [--log file] + + Options: + --log=file log file or directory to read (stdin by default) + --reject-score=score set reject threshold (15 by default) + --junk-score=score set junk score (6.0 by default) + --symbol=sym check specified symbol (perl regexps, '.*' by default) + --alpha-score=score set ignore score for symbols (0.1 by default) + --correlations enable correlations report + --nrelated=integer show that amount of related symbols (10 by default) + --search-pattern do not process input unless the desired pattern is found + --start starting time (oldest) for log parsing + --end ending time (newest) for log parsing + --num-logs=integer number of recent logfiles to analyze (all files in the directory by default) + --exclude-logs=integer number of latest logs to exclude (0 by default) + --json print json output instead of human readable + --help brief help message + --mult=sym=number multiply symbol score + --man full documentation + +=head1 OPTIONS + +=over 8 + +=item B<--log> + +Specifies log file or directory to read data from. If a directory is specified B<rspamd_stats> analyses files in the +directory including known compressed file types. Number of log files can be limited using B<--num-logs> and +B<--exclude-logs> options. This assumes that files in the log directory have B<newsyslog(8)>- or B<logrotate(8)>-like +name format with numeric indexes. Files without indexes (generally it is merely one file) are considered the most +recent and files with lower indexes are considered newer. + +=item B<--reject-score> + +Specifies the reject (spam) threshold. + +=item B<--junk-score> + +Specifies the junk (add header or rewrite subject) threshold. + +=item B<--alpha-score> + +Specifies the minimum score for a symbol to be considered by this script. + +=item B<--symbol> + +Add symbol or pattern (pcre format) to analyze. + +=item B<--num-logs> + +If set, limits number of analyzed logfiles in the directory to the specified value. + +=item B<--exclude-logs> + +Number of latest logs to exclude (0 by default). + +=item B<--correlations> + +Additionally print correlation rate for each symbol displayed. This routine calculates merely paired correlations +between symbols. + +=item B<--search-pattern> + +Do not process input unless finding the specified regular expression. Useful to skip logs to a certain position. + +=item B<--exclude> + +Exclude log lines if certain symbols are fired (e.g. GTUBE). You may specify this option multiple time to skip multiple +symbols. + +=item B<--start> + +Select log entries after this time. Format: C<YYYY-MM-DD HH:MM:SS> (can be truncated to any desired accuracy). If used +with B<--end> select entries between B<--start> and B<--end>. The omitted date defaults to the current date if you +supply the time. + +=item B<--end> + +Select log entries before this time. Format: C<YYYY-MM-DD HH:MM:SS> (can be truncated to any desired accuracy). If used +with B<--start> select entries between B<--start> and B<--end>. The omitted date defaults to the current date if you +supply the time. + +=item B<--mult=symbol=number> + +Multiplies score for the named symbol by the provided multiplier. + +=item B<--help> + +Print a brief help message and exits. + +=item B<--man> + +Prints the manual page and exits. + +=back + +=head1 DESCRIPTION + +B<rspamd_stats> will read the given log file (or standard input) and provide statistics for the specified symbols: + + Symbol: BAYES_SPAM (weight 3.763) (381985 hits, 26.827%) + Ham hits: 184557 (48.315%), total ham: 1095487 (ham with BAYES_SPAM: 16.847%) + Spam hits: 15134 (3.962%), total spam: 16688 (spam with BAYES_SPAM: 90.688%) + Junk hits: 182294 (47.723%), total junk: 311699 (junk with BAYES_SPAM: 58.484%) + Spam changes (ham/junk -> spam): 7026 (1.839%), total percentage (changes / spam hits): 42.102% + Junk changes (ham -> junk): 95192 (24.920%), total percentage (changes / junk hits): 30.540% + +Where there are the following attributes: + +=over 4 + +=item * + +B<Weight>: average score for a symbols + +=item * + +B<Total hits>: total number of hits and percentage of symbol hits divided by total number of messages + +=item * + +B<HAM hits>: provides the following information about B<HAM> messages with the specified symbol (from left to right): + +=over 4 + +=item 1. + +B<total symbol hits>: number of messages that has this symbol and are B<HAM> + +=item 2. + +B<ham percentage>: number of symbol hits divided by overall B<HAM> messages count + +=item 3. + +B<total ham hits>: overall number of B<HAM> messages + +=item 4. + +B<ham with symbol percentage>: percentage of number of hits with specified symbol in B<HAM> messages divided by total +number of B<HAM> messages. + +=back + +=item * + +B<SPAM hits>: provides the following information about B<SPAM> messages - same as previous but for B<SPAM> class. + +=item * + +B<Junk hits>: provides the following information about B<Junk> messages - same as previous but for B<JUNK> class. + +=item * + +B<Spam changes>: displays data about how much messages switched their class because of the specific symbol weight. + +=item * + +B<Junk changes>: displays data about how much messages switched their class because of the specific symbol weight. + +=back + +=cut |