summaryrefslogtreecommitdiffstats
path: root/utils/rspamd_stats.pl
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-10 21:30:40 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-10 21:30:40 +0000
commit133a45c109da5310add55824db21af5239951f93 (patch)
treeba6ac4c0a950a0dda56451944315d66409923918 /utils/rspamd_stats.pl
parentInitial commit. (diff)
downloadrspamd-133a45c109da5310add55824db21af5239951f93.tar.xz
rspamd-133a45c109da5310add55824db21af5239951f93.zip
Adding upstream version 3.8.1.upstream/3.8.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'utils/rspamd_stats.pl')
-rwxr-xr-xutils/rspamd_stats.pl1018
1 files changed, 1018 insertions, 0 deletions
diff --git a/utils/rspamd_stats.pl b/utils/rspamd_stats.pl
new file mode 100755
index 0000000..9c5f2ac
--- /dev/null
+++ b/utils/rspamd_stats.pl
@@ -0,0 +1,1018 @@
+#!/usr/bin/env perl
+
+use 5.010;
+use Data::Dumper;
+use Getopt::Long;
+use Pod::Usage;
+use Time::Local;
+use IO::Handle;
+use warnings;
+use strict;
+
+my @symbols_search;
+my @symbols_exclude;
+my @symbols_bidirectional;
+my @symbols_groups;
+my @symbols_ignored;
+my %symbols_mult;
+my %groups;
+my $reject_score = 15.0;
+my $junk_score = 6.0;
+my $diff_alpha = 0.1;
+my $correlations = 0;
+my $nrelated = 10;
+my $log_file = "";
+my $search_pattern = "";
+my $startTime = "";
+my $endTime;
+my $num_logs;
+my $exclude_logs = 0;
+my $man = 0;
+my $json = 0;
+my $help = 0;
+
+# Associate file extensions with decompressors
+my %decompressor = (
+ 'bz2' => 'bzip2 -cd',
+ 'gz' => 'gzip -cd',
+ 'xz' => 'xz -cd',
+ 'zst' => 'zstd -cd',
+);
+
+GetOptions(
+ "reject-score|r=f" => \$reject_score,
+ "junk-score|j=f" => \$junk_score,
+ "symbol|s=s@" => \@symbols_search,
+ "symbol-bidir|S=s@" => \@symbols_bidirectional,
+ "exclude|X=s@" => \@symbols_exclude,
+ "ignore=s@" => \@symbols_ignored,
+ "group|g=s@" => \@symbols_groups,
+ "log|l=s" => \$log_file,
+ "mult=s" => \%symbols_mult,
+ "alpha-score|alpha|a=f" => \$diff_alpha,
+ "correlations|c" => \$correlations,
+ "nrelated=i" => \$nrelated,
+ "search-pattern=s" => \$search_pattern,
+ "start=s" => \$startTime,
+ "end=s" => \$endTime,
+ "num-logs|n=i" => \$num_logs,
+ "exclude-logs|x=i" => \$exclude_logs,
+ "json|j" => \$json,
+ "help|?" => \$help,
+ "man" => \$man
+) or pod2usage(2);
+
+pod2usage(1) if $help;
+pod2usage( -exitval => 0, -verbose => 2 ) if $man;
+
+# Global vars
+my $total = 0;
+my $total_spam = 0;
+my $total_junk = 0;
+my $junk_symbols = 0;
+my $spam_symbols = 0;
+my $ham_symbols = 0;
+my $ham_spam_change = 0;
+my $ham_junk_change = 0;
+my %sym_res;
+my $rspamd_log;
+my $enabled = 0;
+my $log_file_num = 1;
+my $spinner_update_time = 0;
+
+my %action;
+my %timeStamp;
+my %scanTime = (
+ max => 0,
+ total => 0,
+);
+my %bidir_match;
+
+foreach ( $startTime, $endTime ) { $_ = &normalized_time($_) }
+
+# Convert bidirectional symbols
+foreach my $s (@symbols_bidirectional) {
+ $bidir_match{$s} = {
+ spam => "${s}_SPAM",
+ ham => "${s}_HAM",
+ };
+ push @symbols_search, $s unless grep /^$s$/, @symbols_search;
+}
+
+# Deal with groups
+my $group_id = 0;
+foreach my $g (@symbols_groups) {
+ my @symbols = split /,/, $g;
+ my $group_name = "group$group_id";
+
+ foreach my $s (@symbols) {
+ $groups{$s} = $group_name;
+ push @symbols_search, $s unless grep /^$s$/, @symbols_search;
+ }
+}
+
+@symbols_search = '.*'
+ unless @symbols_search;
+
+if ( $log_file eq '-' || $log_file eq '' ) {
+ $rspamd_log = \*STDIN;
+ &ProcessLog();
+}
+elsif ( -d "$log_file" ) {
+ my $log_dir = "$log_file";
+
+ my @logs = &GetLogfilesList($log_dir);
+
+ # Process logs
+ foreach (@logs) {
+ my $ext = (/[^.]+\.?([^.]*?)$/)[0];
+ my $dc = $decompressor{$ext} || 'cat';
+
+ open( $rspamd_log, "-|", "$dc $log_dir/$_" )
+ or die "cannot execute $dc $log_dir/$_ : $!";
+
+ printf { interactive(*STDERR) } "\033[J Parsing log files: [%d/%d] %s\033[G", $log_file_num++, scalar @logs,
+ $_;
+ $spinner_update_time = 0; # Force spinner update
+ &spinner;
+
+ &ProcessLog;
+
+ close($rspamd_log)
+ or warn "cannot close $dc $log_dir/$_: $!";
+ }
+ print { interactive(*STDERR) } "\033[J\033[G"; # Progress indicator clean-up
+}
+else {
+ my $ext = ( $log_file =~ /[^.]+\.?([^.]*?)$/ )[0];
+ my $dc = $decompressor{$ext} || 'cat';
+ open( $rspamd_log, "-|", "$dc $log_file" )
+ or die "cannot execute $dc $log_file : $!";
+ $spinner_update_time = 0; # Force spinner update
+ &spinner;
+ &ProcessLog();
+}
+
+my $total_ham = $total - ( $total_spam + $total_junk );
+
+if ($json) {
+ print "{";
+ &Summary();
+ print '"symbols":{';
+ &SymbolsStat();
+ print "}}\n";
+}
+else {
+ &SymbolsStat();
+ &Summary();
+}
+
+exit;
+
+sub IsIgnored {
+ my ($sym) = @_;
+
+ foreach my $ex (@symbols_ignored) {
+ if ( $sym =~ /^$ex$/ ) {
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+sub GenRelated {
+ my ( $htb, $target_sym ) = @_;
+
+ my @result;
+ my $i = 0;
+ foreach my $sym ( sort { $htb->{$b} <=> $htb->{$a} } keys %{$htb} ) {
+ if ( $sym ne $target_sym ) {
+ my @elt = ( $sym, $htb->{$sym} );
+ push @result, \@elt;
+ $i++;
+ }
+
+ last if $i > $nrelated;
+ }
+
+ return \@result;
+}
+
+sub StringifyRelated {
+ my ( $ar, $total ) = @_;
+ return
+ join( "\n", ( map { sprintf "\t%s(%s: %.1f%%)", $_->[0], $_->[1], $_->[1] / ( $total * 1.0 ) * 100.0 } @{$ar} ) );
+}
+
+sub SymbolsStat {
+ if ( $total > 0 ) {
+ my $has_comma = 0;
+ while ( my ( $s, $r ) = each(%sym_res) ) {
+ if ( $r->{hits} > 0 ) {
+ my $th = $r->{hits};
+ my $sh = $r->{spam_hits};
+ my $jh = $r->{junk_hits};
+ my $hh = $r->{hits} - $sh - $jh;
+ my ( $htp, $stp, $jtp );
+ $htp = $hh * 100.0 / $total_ham if $total_ham != 0;
+ $stp = $sh * 100.0 / $total_spam if $total_spam != 0;
+ $jtp = $jh * 100.0 / $total_junk if $total_junk != 0;
+
+ if ($json) {
+ if ($has_comma) {
+ print ",";
+ }
+ else {
+ $has_comma = 1;
+ }
+ print "\"$s\":{";
+ JsonObjectElt( "avg_weight", $r->{'weight'}, "%.4f" );
+ print ",";
+ JsonObjectElt( "hits", $th, "%d" );
+ print ",";
+ JsonObjectElt( "hits_percentage", $th / $total, "%.4f" );
+ print ",";
+ JsonObjectElt( "spam_hits", $sh, "%d" );
+ print ",";
+ JsonObjectElt( "spam_to_total", $sh / $th, "%.4f" );
+ print ",";
+ JsonObjectElt( "spam_percentage", $stp / 100.0 || 0, "%.4f" );
+ print ",";
+ JsonObjectElt( "ham_hits", $hh, "%d" );
+ print ",";
+ JsonObjectElt( "ham_to_total", $hh / $th, "%.4f" );
+ print ",";
+ JsonObjectElt( "ham_percentage", $htp / 100.0 || 0, "%.4f" );
+ print ",";
+ JsonObjectElt( "junk_hits", $jh, "%d" );
+ print ",";
+ JsonObjectElt( "junk_to_total", $jh / $th, "%.4f" );
+ print ",";
+ JsonObjectElt( "junk_percentage", $jtp / 100.0 || 0, "%.4f" );
+ }
+ else {
+ printf "%s avg. weight %.3f, hits %d(%.3f%%):
+ Ham %7.3f%%, %6d/%-6d (%7.3f%%)
+ Spam %7.3f%%, %6d/%-6d (%7.3f%%)
+ Junk %7.3f%%, %6d/%-6d (%7.3f%%)
+", $s, $r->{weight} / $r->{hits}, $th, ( $th / $total * 100 ),
+ ( $hh / $th * 100 ), $hh, $total_ham, ( $htp or 0 ),
+ ( $sh / $th * 100 ), $sh, $total_spam, ( $stp or 0 ),
+ ( $jh / $th * 100 ), $jh, $total_junk, ( $jtp or 0 );
+ }
+ my ( $schp, $jchp );
+ $schp = $r->{spam_change} / $total_spam * 100.0 if $total_spam;
+ $jchp = $r->{junk_change} / $total_junk * 100.0 if $total_junk;
+
+ if ( $r->{weight} != 0 ) {
+ if ( !$json ) {
+ if ( $r->{weight} > 0 ) {
+ printf "
+Spam changes (ham/junk -> spam): %6d/%-6d (%7.3f%%)
+Spam changes / total spam hits: %6d/%-6d (%7.3f%%)
+Junk changes (ham -> junk): %6d/%-6d (%7.3f%%)
+Junk changes / total junk hits: %6d/%-6d (%7.3f%%)
+",
+ $r->{spam_change}, $th, ( $r->{spam_change} / $th * 100 ),
+ $r->{spam_change}, $total_spam, ( $schp or 0 ),
+ $r->{junk_change}, $th, ( $r->{junk_change} / $th * 100 ),
+ $r->{junk_change}, $total_junk, ( $jchp or 0 );
+ }
+ else {
+ printf "
+Spam changes (spam -> junk/ham): %6d/%-6d (%7.3f%%)
+Spam changes / total spam hits : %6d/%-6d (%7.3f%%)
+Junk changes (junk -> ham) : %6d/%-6d (%7.3f%%)
+Junk changes / total junk hits : %6d/%-6d (%7.3f%%)
+",
+ $r->{spam_change}, $th, ( $r->{spam_change} / $th * 100 ),
+ $r->{spam_change}, $total_spam, ( $schp or 0 ),
+ $r->{junk_change}, $th, ( $r->{junk_change} / $th * 100 ),
+ $r->{junk_change}, $total_junk, ( $jchp or 0 );
+ }
+ }
+ else {
+ print ",";
+ JsonObjectElt( "spam_change", $r->{spam_change}, "%.4f" );
+ print ",";
+ JsonObjectElt( "junk_change", $r->{junk_change}, "%.4f" );
+ }
+ }
+
+ if ($correlations) {
+
+ my $spam_related = GenRelated( $r->{symbols_met_spam}, $s );
+ my $junk_related = GenRelated( $r->{symbols_met_junk}, $s );
+ my $ham_related = GenRelated( $r->{symbols_met_ham}, $s );
+
+ if ( !$json ) {
+ print "Correlations report:\n";
+
+ while ( my ( $cs, $hits ) = each %{ $r->{corr} } ) {
+ my $corr_prob = $r->{'hits'} / $total;
+ my $merged_hits = 0;
+ if ( $r->{symbols_met_spam}->{$cs} ) {
+ $merged_hits += $r->{symbols_met_spam}->{$cs};
+ }
+ if ( $r->{symbols_met_junk}->{$cs} ) {
+ $merged_hits += $r->{symbols_met_junk}->{$cs};
+ }
+ if ( $r->{symbols_met_ham}->{$cs} ) {
+ $merged_hits += $r->{symbols_met_ham}->{$cs};
+ }
+
+ if ( $merged_hits > 0 ) {
+ printf "Probability of %s when %s fires: %.3f\n", $cs, $s,
+ ( ( $merged_hits / $total ) / $corr_prob );
+ }
+ }
+
+ print "Related symbols report:\n";
+ printf "Top related in spam:\n %s\n", StringifyRelated( $spam_related, $r->{spam_hits} );
+ printf "Top related in junk:\n %s\n", StringifyRelated( $junk_related, $r->{junk_hits} );
+ printf "Top related in ham:\n %s\n",
+ StringifyRelated( $ham_related, $r->{hits} - $r->{spam_hits} - $r->{junk_hits} );
+ }
+ else {
+ print ",";
+ print "\"correllations\":{";
+
+ my $has_comma_ = 0;
+ while ( my ( $cs, $hits ) = each %{ $r->{corr} } ) {
+ if ($has_comma_) {
+ print ",";
+ }
+ else {
+ $has_comma_ = 1;
+ }
+ my $corr_prob = $hits / $total;
+ my $sym_prob = $r->{hits} / $total;
+ JsonObjectElt( $cs, ( $corr_prob / $sym_prob ), "%.4f" );
+ }
+
+ print "}";
+ }
+ }
+
+ print "}" if $json;
+ }
+ else {
+ print "Symbol $s has not been met\n" if !$json;
+ }
+
+ print '-' x 80 . "\n" if !$json;
+ }
+ }
+}
+
+sub Summary() {
+ if ( !$json ) {
+ print "
+=== Summary ", '=' x 68, "
+Messages scanned: $total";
+ printf " [ %s / %s ]
+", $timeStamp{'start'}, $timeStamp{'end'}
+ if defined $timeStamp{'start'};
+ say '';
+ printf "%11s: %6.2f%%, %d\n", $_, 100 * $action{$_} / $total, $action{$_} for sort keys %action;
+ say '';
+ printf "scan time min/avg/max = %.2f/%.2f/%.2f s
+", $scanTime{'min'} / 1000, ($total) ? $scanTime{'total'} / $total / 1000 : undef, $scanTime{'max'} / 1000
+ if exists $scanTime{'min'};
+ say '=' x 80;
+ }
+ else {
+ JsonObjectElt( "total", $total, "%d" );
+ print ",";
+
+ if ( defined $timeStamp{'start'} ) {
+ JsonObjectElt( "start", $timeStamp{'start'} );
+ print ",";
+ }
+
+ if ( defined $timeStamp{'end'} ) {
+ JsonObjectElt( "end", $timeStamp{'end'} );
+ print ",";
+ }
+
+ print "\"actions\":{";
+
+ my $has_comma = 0;
+ foreach my $a ( sort keys %action ) {
+ if ($has_comma) {
+ print ",";
+ }
+ else {
+ $has_comma = 1;
+ }
+ JsonObjectElt( $a, $action{$a}, "%d" );
+ }
+ print "},";
+ }
+}
+
+sub ProcessRelated {
+ my ( $symbols, $target, $source ) = @_;
+
+ foreach my $s ( @{$symbols} ) {
+ $s =~ /^([^\(]+)(\(([^\)]+)\))?/;
+ my $sym_name = $1;
+ my $sym_score = 0;
+
+ if ( $groups{$sym_name} ) {
+ $sym_name = $groups{$sym_name};
+ }
+
+ next if ( $source eq $sym_name );
+
+ next if IsIgnored($sym_name);
+
+ if ($2) {
+ $sym_score = $3 * ($symbols_mult{$sym_name} or 1.0);
+
+ if ( abs($sym_score) < $diff_alpha ) {
+ next;
+ }
+
+ my $bm = $bidir_match{$sym_name};
+ if ($bm) {
+ if ( $sym_score >= 0 ) {
+ $sym_name = $bm->{'spam'};
+ }
+ else {
+ $sym_name = $bm->{'ham'};
+ }
+ }
+ }
+
+ if ( exists( $target->{$sym_name} ) ) {
+ $target->{$sym_name}++;
+ }
+ else {
+ $target->{$sym_name} = 1;
+ }
+ }
+}
+
+sub ProcessLog {
+ my ( $ts_format, @line ) = &log_time_format($rspamd_log);
+
+ while () {
+ last if eof $rspamd_log;
+ $_ = (@line) ? shift @line : <$rspamd_log>;
+
+ if ( !$enabled && ( $search_pattern eq "" || /$search_pattern/ ) ) {
+ $enabled = 1;
+ }
+
+ next if !$enabled;
+
+ if (/^.*rspamd_task_write_log.*$/) {
+ &spinner;
+ my $ts;
+ if ( $ts_format eq 'syslog' ) {
+ $ts = syslog2iso( join ' ', ( split /\s+/ )[ 0 .. 2 ] );
+ }
+ elsif ( $ts_format eq 'syslog5424' ) {
+ /^([0-9-]+)T([0-9:]+)/;
+ $ts = "$1 $2";
+ }
+ else {
+ $ts = join ' ', ( split /\s+/ )[ 0 .. 1 ];
+ }
+
+ next if ( $ts lt $startTime );
+ next if ( defined $endTime && $ts gt $endTime );
+
+ if ( $_ !~
+ /\(([^()]+)\): \[(NaN|-?\d+(?:\.\d+)?)\/(-?\d+(?:\.\d+)?)\]\s+\[([^\]]+)\].+? time: (\d+\.\d+)ms/ )
+ {
+ #print "BAD: $_\n";
+ next;
+ }
+
+ my @symbols = split /(?:\{[^}]*\})?(?:$|,)/, $4;
+ my $scan_time = $5;
+ my $act = $1;
+ my $score = $2 * 1.0;
+ my $skip = 0;
+
+ foreach my $ex (@symbols_exclude) {
+ my @found = grep { /^$ex/ } @symbols;
+
+ if ( scalar(@found) > 0 ) {
+ $skip = 1;
+ last;
+ }
+ }
+
+ next if ( $skip != 0 );
+
+ if ( defined( $timeStamp{'end'} ) ) {
+ $timeStamp{'end'} = $ts if ( $ts gt $timeStamp{'end'} );
+ }
+ else {
+ $timeStamp{'end'} = $ts;
+ }
+
+ if ( defined( $timeStamp{'start'} ) ) {
+ $timeStamp{'start'} = $ts if ( $ts lt $timeStamp{'start'} );
+ }
+ else {
+ $timeStamp{'start'} = $ts;
+ }
+
+ $scanTime{'min'} = $scan_time if ( !exists $scanTime{'min'} || $scanTime{'min'} > $scan_time );
+ $scanTime{'max'} = $scan_time if ( $scanTime{'max'} < $scan_time );
+ $scanTime{'total'} += $scan_time;
+
+ $action{$act}++;
+ $total++;
+
+ if ( $score >= $reject_score ) {
+ $total_spam++;
+ }
+ elsif ( $score >= $junk_score ) {
+ $total_junk++;
+ }
+
+ my @sym_names;
+
+ foreach my $s (@symbols_search) {
+ my @selected = grep /$s/, @symbols;
+
+ if ( scalar(@selected) > 0 ) {
+
+ foreach my $sym (@selected) {
+ $sym =~ /^([^\(]+)(\(([^\)]+)\))?/;
+ my $sym_name = $1;
+ my $sym_score = 0;
+ my $orig_name = $sym_name;
+
+ if ($2) {
+ $sym_score = $3 * ($symbols_mult{$sym_name} or 1.0);
+
+ if ( abs($sym_score) < $diff_alpha ) {
+ next;
+ }
+
+ my $bm = $bidir_match{$sym_name};
+ if ($bm) {
+ if ( $sym_score >= 0 ) {
+ $sym_name = $bm->{'spam'};
+ }
+ else {
+ $sym_name = $bm->{'ham'};
+ }
+ }
+ }
+
+ next if $orig_name !~ /^$s/;
+
+ if ( $groups{$s} ) {
+
+ # Replace with group
+ $sym_name = $groups{$s};
+ }
+
+ push @sym_names, $sym_name;
+
+ if ( !$sym_res{$sym_name} ) {
+ $sym_res{$sym_name} = {
+ hits => 0,
+ spam_hits => 0,
+ junk_hits => 0,
+ spam_change => 0,
+ junk_change => 0,
+ weight => 0,
+ corr => {},
+ symbols_met_spam => {},
+ symbols_met_ham => {},
+ symbols_met_junk => {},
+ };
+ }
+
+ my $r = $sym_res{$sym_name};
+
+ $r->{hits}++;
+ $r->{weight} += $sym_score;
+ my $is_spam = 0;
+ my $is_junk = 0;
+
+ if ( $score >= $reject_score ) {
+ $is_spam = 1;
+ $r->{spam_hits}++;
+ if ($correlations) {
+ ProcessRelated( \@symbols, $r->{symbols_met_spam}, $sym_name );
+ }
+ }
+ elsif ( $score >= $junk_score ) {
+ $is_junk = 1;
+ $r->{junk_hits}++;
+ if ($correlations) {
+ ProcessRelated( \@symbols, $r->{symbols_met_junk}, $sym_name );
+ }
+ }
+ else {
+ if ($correlations) {
+ ProcessRelated( \@symbols, $r->{symbols_met_ham}, $sym_name );
+ }
+ }
+
+ if ( $sym_score != 0 ) {
+ my $score_without = $score - $sym_score;
+
+ if ( $sym_score > 0 ) {
+ if ( $is_spam && $score_without < $reject_score ) {
+ $r->{spam_change}++;
+ }
+ if ( $is_junk && $score_without < $junk_score ) {
+ $r->{junk_change}++;
+ }
+ }
+ else {
+ if ( !$is_spam && $score_without >= $reject_score ) {
+ $r->{spam_change}++;
+ }
+ if ( !$is_junk && $score_without >= $junk_score ) {
+ $r->{junk_change}++;
+ }
+ }
+ }
+ } # End foreach symbols selected
+ }
+ }
+
+ if ($correlations) {
+ foreach my $sym (@sym_names) {
+ next if IsIgnored($sym);
+ my $r = $sym_res{$sym};
+
+ foreach my $corr_sym (@sym_names) {
+ if ( $corr_sym ne $sym ) {
+ if ( $r->{'corr'}->{$corr_sym} ) {
+ $r->{'corr'}->{$corr_sym}++;
+ }
+ else {
+ $r->{'corr'}->{$corr_sym} = 1;
+ }
+ }
+ }
+ } # End of correlations check
+ }
+ }
+ }
+}
+
+sub JsonObjectElt() {
+ my ( $k, $v ) = @_;
+ my $f = defined $_[2] ? $_[2] : '%s';
+
+ if ( $f eq "%s" ) {
+ $f = "\"%s\"";
+ }
+
+ printf "\"%s\":$f", $k, $v;
+}
+
+sub GetLogfilesList {
+ my ($dir) = @_;
+ opendir( my $fh, $dir ) or die $!;
+
+ my $pattern = join( '|', keys %decompressor );
+ my $re = qr/\.[0-9]+(?:\.(?:$pattern))?/;
+
+ # Add unnumbered logs first
+ my @logs =
+ grep { -f "$dir/$_" && !/$re/ } readdir($fh);
+
+ # Add numbered logs
+ rewinddir($fh);
+ push( @logs, ( sort numeric ( grep { -f "$dir/$_" && /$re/ } readdir($fh) ) ) );
+
+ closedir($fh);
+
+ # Select required logs and revers their order
+ @logs =
+ reverse splice( @logs, $exclude_logs, $num_logs ||= @logs - $exclude_logs );
+
+ # Loop through array printing out filenames
+ print { interactive(*STDERR) } "\nLog files to process:\n";
+ foreach my $file (@logs) {
+ print { interactive(*STDERR) } " $file\n";
+ }
+ print { interactive(*STDERR) } "\n";
+
+ return @logs;
+}
+
+sub log_time_format {
+ my $fh = shift;
+ my ( $format, $line );
+ while (<$fh>) {
+ $line = $_;
+
+ # 2017-08-08 00:00:01 #66984(
+ # 2017-08-08 00:00:01.001 #66984(
+ if (/^\d{4}-\d\d-\d\d \d\d:\d\d:\d\d(\.\d{3,5})? #\d+\(/) {
+ $format = 'rspamd';
+ last;
+ }
+
+ # Aug 8 00:02:50 #66986(
+ elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d #\d+\(/) {
+ $format = 'syslog';
+ last;
+ }
+
+ # Aug 8 00:02:50 hostname rspamd[66986]
+ elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d \S+ rspamd\[\d+\]/) {
+ $format = 'syslog';
+ last;
+ }
+
+ # 2018-04-16T06:25:46.012590+02:00 rspamd rspamd[12968]
+ elsif (/\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,6})?(Z|[-+]\d{2}:\d{2}) \S+ rspamd\[\d+\]/) {
+ $format = 'syslog5424';
+ last;
+ }
+
+ # Skip newsyslog messages
+ # Aug 8 00:00:00 hostname newsyslog[63284]: logfile turned over
+ elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d\ \S+ newsyslog\[\d+\]: logfile turned over$/) {
+ next;
+ }
+
+ # Skip journalctl messages
+ # -- Logs begin at Mon 2018-01-15 11:16:24 MSK, end at Fri 2018-04-27 09:10:30 MSK. --
+ elsif (
+/^-- Logs begin at \w{3} \d{4}-\d\d-\d\d \d\d:\d\d:\d\d [A-Z]{3}, end at \w{3} \d{4}-\d\d-\d\d \d\d:\d\d:\d\d [A-Z]{3}\. --$/
+ )
+ {
+ next;
+ }
+ else {
+ print "Unknown log format\n";
+ exit 1;
+ }
+ }
+ return ( $format, $line );
+}
+
+sub normalized_time {
+ return
+ if !defined( $_ = shift );
+
+ /^\d\d(?::\d\d){0,2}$/
+ ? sprintf '%04d-%02d-%02d %s', 1900 + (localtime)[5], 1 + (localtime)[4], (localtime)[3], $_
+ : $_;
+}
+
+sub numeric {
+ $a =~ /\.(\d+)\./;
+ my $a_num = $1;
+ $b =~ /\.(\d+)\./;
+ my $b_num = $1;
+
+ $a_num <=> $b_num;
+}
+
+sub spinner {
+ my @spinner = qw{/ - \ |};
+ return
+ if ( ( time - $spinner_update_time ) < 1 );
+ $spinner_update_time = time;
+ printf { interactive(*STDERR) } "%s\r", $spinner[ $spinner_update_time % @spinner ];
+ select()->flush();
+}
+
+# Convert syslog timestamp to "ISO 8601 like" format
+# using current year as syslog does not record the year (nor the timezone)
+# or the last year if the guessed time is in the future.
+sub syslog2iso {
+ my %month_map;
+ @month_map{qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)} = 0 .. 11;
+
+ my ( $month, @t ) = $_[0] =~ m/^(\w{3}) \s\s? (\d\d?) \s (\d\d):(\d\d):(\d\d)/x;
+ my $epoch =
+ timelocal( ( reverse @t ), $month_map{$month}, 1900 + (localtime)[5] );
+ sprintf '%04d-%02d-%02d %02d:%02d:%02d', 1900 + (localtime)[5] - ( $epoch > time ), $month_map{$month} + 1, @t;
+}
+
+### Imported from IO::Interactive 1.022 Perl module
+sub is_interactive {
+ ## no critic (ProhibitInteractiveTest)
+
+ my ($out_handle) = ( @_, select ); # Default to default output handle
+
+ # Not interactive if output is not to terminal...
+ return 0 if not -t $out_handle;
+
+ # If *ARGV is opened, we're interactive if...
+ if ( tied(*ARGV) or defined( fileno(ARGV) ) ) { # this is what 'Scalar::Util::openhandle *ARGV' boils down to
+
+ # ...it's currently opened to the magic '-' file
+ return -t *STDIN if defined $ARGV && $ARGV eq '-';
+
+ # ...it's at end-of-file and the next file is the magic '-' file
+ return @ARGV > 0 && $ARGV[0] eq '-' && -t *STDIN if eof *ARGV;
+
+ # ...it's directly attached to the terminal
+ return -t *ARGV;
+ }
+
+ # If *ARGV isn't opened, it will be interactive if *STDIN is attached
+ # to a terminal.
+ else {
+ return -t *STDIN;
+ }
+}
+
+### Imported from IO::Interactive 1.022 Perl module
+local ( *DEV_NULL, *DEV_NULL2 );
+my $dev_null;
+
+BEGIN {
+ pipe *DEV_NULL, *DEV_NULL2
+ or die "Internal error: can't create null filehandle";
+ $dev_null = \*DEV_NULL;
+}
+
+### Imported from IO::Interactive 1.022 Perl module
+sub interactive {
+ my ($out_handle) = ( @_, \*STDOUT ); # Default to STDOUT
+ return &is_interactive ? $out_handle : $dev_null;
+}
+
+__END__
+
+=head1 NAME
+
+rspamd_stats - analyze Rspamd rules by parsing log files
+
+=head1 SYNOPSIS
+
+rspamd_stats [options] [--symbol=SYM1 [--symbol=SYM2...]] [--log file]
+
+ Options:
+ --log=file log file or directory to read (stdin by default)
+ --reject-score=score set reject threshold (15 by default)
+ --junk-score=score set junk score (6.0 by default)
+ --symbol=sym check specified symbol (perl regexps, '.*' by default)
+ --alpha-score=score set ignore score for symbols (0.1 by default)
+ --correlations enable correlations report
+ --nrelated=integer show that amount of related symbols (10 by default)
+ --search-pattern do not process input unless the desired pattern is found
+ --start starting time (oldest) for log parsing
+ --end ending time (newest) for log parsing
+ --num-logs=integer number of recent logfiles to analyze (all files in the directory by default)
+ --exclude-logs=integer number of latest logs to exclude (0 by default)
+ --json print json output instead of human readable
+ --help brief help message
+ --mult=sym=number multiply symbol score
+ --man full documentation
+
+=head1 OPTIONS
+
+=over 8
+
+=item B<--log>
+
+Specifies log file or directory to read data from. If a directory is specified B<rspamd_stats> analyses files in the
+directory including known compressed file types. Number of log files can be limited using B<--num-logs> and
+B<--exclude-logs> options. This assumes that files in the log directory have B<newsyslog(8)>- or B<logrotate(8)>-like
+name format with numeric indexes. Files without indexes (generally it is merely one file) are considered the most
+recent and files with lower indexes are considered newer.
+
+=item B<--reject-score>
+
+Specifies the reject (spam) threshold.
+
+=item B<--junk-score>
+
+Specifies the junk (add header or rewrite subject) threshold.
+
+=item B<--alpha-score>
+
+Specifies the minimum score for a symbol to be considered by this script.
+
+=item B<--symbol>
+
+Add symbol or pattern (pcre format) to analyze.
+
+=item B<--num-logs>
+
+If set, limits number of analyzed logfiles in the directory to the specified value.
+
+=item B<--exclude-logs>
+
+Number of latest logs to exclude (0 by default).
+
+=item B<--correlations>
+
+Additionally print correlation rate for each symbol displayed. This routine calculates merely paired correlations
+between symbols.
+
+=item B<--search-pattern>
+
+Do not process input unless finding the specified regular expression. Useful to skip logs to a certain position.
+
+=item B<--exclude>
+
+Exclude log lines if certain symbols are fired (e.g. GTUBE). You may specify this option multiple time to skip multiple
+symbols.
+
+=item B<--start>
+
+Select log entries after this time. Format: C<YYYY-MM-DD HH:MM:SS> (can be truncated to any desired accuracy). If used
+with B<--end> select entries between B<--start> and B<--end>. The omitted date defaults to the current date if you
+supply the time.
+
+=item B<--end>
+
+Select log entries before this time. Format: C<YYYY-MM-DD HH:MM:SS> (can be truncated to any desired accuracy). If used
+with B<--start> select entries between B<--start> and B<--end>. The omitted date defaults to the current date if you
+supply the time.
+
+=item B<--mult=symbol=number>
+
+Multiplies score for the named symbol by the provided multiplier.
+
+=item B<--help>
+
+Print a brief help message and exits.
+
+=item B<--man>
+
+Prints the manual page and exits.
+
+=back
+
+=head1 DESCRIPTION
+
+B<rspamd_stats> will read the given log file (or standard input) and provide statistics for the specified symbols:
+
+ Symbol: BAYES_SPAM (weight 3.763) (381985 hits, 26.827%)
+ Ham hits: 184557 (48.315%), total ham: 1095487 (ham with BAYES_SPAM: 16.847%)
+ Spam hits: 15134 (3.962%), total spam: 16688 (spam with BAYES_SPAM: 90.688%)
+ Junk hits: 182294 (47.723%), total junk: 311699 (junk with BAYES_SPAM: 58.484%)
+ Spam changes (ham/junk -> spam): 7026 (1.839%), total percentage (changes / spam hits): 42.102%
+ Junk changes (ham -> junk): 95192 (24.920%), total percentage (changes / junk hits): 30.540%
+
+Where there are the following attributes:
+
+=over 4
+
+=item *
+
+B<Weight>: average score for a symbols
+
+=item *
+
+B<Total hits>: total number of hits and percentage of symbol hits divided by total number of messages
+
+=item *
+
+B<HAM hits>: provides the following information about B<HAM> messages with the specified symbol (from left to right):
+
+=over 4
+
+=item 1.
+
+B<total symbol hits>: number of messages that has this symbol and are B<HAM>
+
+=item 2.
+
+B<ham percentage>: number of symbol hits divided by overall B<HAM> messages count
+
+=item 3.
+
+B<total ham hits>: overall number of B<HAM> messages
+
+=item 4.
+
+B<ham with symbol percentage>: percentage of number of hits with specified symbol in B<HAM> messages divided by total
+number of B<HAM> messages.
+
+=back
+
+=item *
+
+B<SPAM hits>: provides the following information about B<SPAM> messages - same as previous but for B<SPAM> class.
+
+=item *
+
+B<Junk hits>: provides the following information about B<Junk> messages - same as previous but for B<JUNK> class.
+
+=item *
+
+B<Spam changes>: displays data about how much messages switched their class because of the specific symbol weight.
+
+=item *
+
+B<Junk changes>: displays data about how much messages switched their class because of the specific symbol weight.
+
+=back
+
+=cut