diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 21:30:40 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 21:30:40 +0000 |
commit | 133a45c109da5310add55824db21af5239951f93 (patch) | |
tree | ba6ac4c0a950a0dda56451944315d66409923918 /utils | |
parent | Initial commit. (diff) | |
download | rspamd-upstream.tar.xz rspamd-upstream.zip |
Adding upstream version 3.8.1.upstream/3.8.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'utils')
-rw-r--r-- | utils/CMakeLists.txt | 22 | ||||
-rw-r--r-- | utils/asn.pl | 331 | ||||
-rw-r--r-- | utils/base64.c | 89 | ||||
-rw-r--r-- | utils/cgp_rspamd.pl | 357 | ||||
-rw-r--r-- | utils/classifier_test.pl | 539 | ||||
-rwxr-xr-x | utils/fann_train.pl | 247 | ||||
-rw-r--r-- | utils/rspamd_http_bench.c | 411 | ||||
-rw-r--r-- | utils/rspamd_http_server.c | 300 | ||||
-rwxr-xr-x | utils/rspamd_stats.pl | 1018 | ||||
-rw-r--r-- | utils/sa_trivial_convert.lua | 443 |
10 files changed, 3757 insertions, 0 deletions
diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt new file mode 100644 index 0000000..f2c6c54 --- /dev/null +++ b/utils/CMakeLists.txt @@ -0,0 +1,22 @@ +SET(UTILSERVERSRC rspamd_http_server.c) +SET(UTILBENCHSRC rspamd_http_bench.c) +SET(BASE64SRC base64.c) +SET(MIMESRC mime_tool.c) + +MACRO(ADD_UTIL NAME) + ADD_EXECUTABLE("${NAME}" "${ARGN}") + SET_TARGET_PROPERTIES("${NAME}" PROPERTIES LINKER_LANGUAGE CXX) + TARGET_LINK_LIBRARIES("${NAME}" rspamd-server) + IF (ENABLE_SNOWBALL MATCHES "ON") + TARGET_LINK_LIBRARIES("${NAME}" stemmer) + ENDIF() + TARGET_LINK_LIBRARIES("${NAME}" rspamd-hiredis) + TARGET_LINK_LIBRARIES("${NAME}" ${RSPAMD_REQUIRED_LIBRARIES}) +ENDMACRO() + +IF (ENABLE_UTILS MATCHES "ON") + ADD_UTIL(rspamd-http-server ${UTILSERVERSRC}) + ADD_UTIL(rspamd-http-bench ${UTILBENCHSRC}) + ADD_UTIL(rspamd-base64 ${BASE64SRC}) + ADD_UTIL(rspamd-mime-tool ${MIMESRC}) +ENDIF() diff --git a/utils/asn.pl b/utils/asn.pl new file mode 100644 index 0000000..4d54bad --- /dev/null +++ b/utils/asn.pl @@ -0,0 +1,331 @@ +#!/usr/bin/env perl +# + +use warnings; +use strict; +use autodie; + +use File::Basename; +use File::Fetch; +use Getopt::Long; +use Pod::Usage; + +use FindBin; +use lib "$FindBin::Bin/extlib/lib/perl5"; + +use URI; + +my %config = ( + asn_sources => [ + 'ftp://ftp.arin.net/pub/stats/arin/delegated-arin-extended-latest', + 'ftp://ftp.ripe.net/ripe/stats/delegated-ripencc-latest', + 'http://ftp.afrinic.net/pub/stats/afrinic/delegated-afrinic-latest', + 'ftp://ftp.apnic.net/pub/stats/apnic/delegated-apnic-latest', + 'ftp://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-latest' + ], + bgp_sources => ['http://data.ris.ripe.net/rrc00/latest-bview.gz'] +); + +my $download_asn = 0; +my $download_bgp = 0; +my $download_target = "./"; +my $help = 0; +my $man = 0; +my $v4 = 1; +my $v6 = 1; +my $parse = 1; +my $v4_zone = "asn.rspamd.com"; +my $v6_zone = "asn6.rspamd.com"; +my $v4_file = "asn.zone"; +my $v6_file = "asn6.zone"; +my $ns_servers = [ "asn-ns.rspamd.com", "asn-ns2.rspamd.com" ]; +my $unknown_placeholder = "--"; + +GetOptions( + "download-asn" => \$download_asn, + "download-bgp" => \$download_bgp, + "4!" => \$v4, + "6!" => \$v6, + "parse!" => \$parse, + "target=s" => \$download_target, + "zone-v4=s" => \$v4_zone, + "zone-v6=s" => \$v6_zone, + "file-v4=s" => \$v4_file, + "file-v6=s" => \$v6_file, + "ns-server=s@" => \$ns_servers, + "help|?" => \$help, + "man" => \$man, + "unknown-placeholder" => \$unknown_placeholder, +) or + pod2usage(2); + +pod2usage(1) if $help; +pod2usage(-exitval => 0, -verbose => 2) if $man; + +if ($download_asn) { + foreach my $u (@{ $config{'asn_sources'} }) { + download_file($u); + } +} + +if ($download_bgp) { + foreach my $u (@{ $config{'bgp_sources'} }) { + download_file($u); + } +} + +if (!$parse) { + exit 0; +} + +# Prefix to ASN map +my $networks = { 4 => {}, 6 => {} }; + +foreach my $u (@{ $config{'bgp_sources'} }) { + my $parsed = URI->new($u); + my $fname = $download_target . '/' . basename($parsed->path); + + use constant { + F_MARKER => 0, + F_TIMESTAMP => 1, + F_PEER_IP => 3, + F_PEER_AS => 4, + F_PREFIX => 5, + F_AS_PATH => 6, + F_ORIGIN => 7, + }; + + open(my $bgpd, '-|', "bgpdump -v -M $fname") or die "can't start bgpdump: $!"; + + while (<$bgpd>) { + chomp; + my @e = split /\|/; + if ($e[F_MARKER] ne 'TABLE_DUMP2') { + warn "bad line: $_\n"; + next; + } + + my $origin_as; + my $prefix = $e[F_PREFIX]; + my $ip_ver = 6; + + if ($prefix =~ /^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\/\d{1,2}$/) { + $ip_ver = 4; + } + + if ($e[F_AS_PATH]) { + + # not empty AS_PATH + my @as_path = split /\s/, $e[F_AS_PATH]; + $origin_as = pop @as_path; + + if (substr($origin_as, 0, 1) eq '{') { + + # route is aggregated + if ($origin_as =~ /^{(\d+)}$/) { + + # single AS aggregated, just remove { } around + $origin_as = $1; + } else { + + # use previous AS from AS_PATH + $origin_as = pop @as_path; + } + } + + # strip bogus AS + while (is_bougus_asn($origin_as)) { + $origin_as = pop @as_path; + last if scalar @as_path == 0; + } + } + + # empty AS_PATH or all AS_PATH elements was stripped as bogus - use + # PEER_AS as origin AS + $origin_as //= $e[F_PEER_AS]; + + $networks->{$ip_ver}{$prefix} = int($origin_as); + } +} + +# Remove default routes +delete $networks->{4}{'0.0.0.0/0'}; +delete $networks->{6}{'::/0'}; + +# Now roughly detect countries +my $as_info = {}; + +# RIR statistics exchange format +# https://www.apnic.net/publications/media-library/documents/resource-guidelines/rir-statistics-exchange-format +# https://www.arin.net/knowledge/statistics/nro_extended_stats_format.pdf +# first 7 fields for this two formats are same +use constant { + F_REGISTRY => 0, # {afrinic,apnic,arin,iana,lacnic,ripencc} + F_CC => 1, # ISO 3166 2-letter country code + F_TYPE => 2, # {asn,ipv4,ipv6} + F_START => 3, + F_VALUE => 4, + F_DATE => 5, + F_STATUS => 6, +}; + +foreach my $u (@{ $config{'asn_sources'} }) { + my $parsed = URI->new($u); + my $fname = $download_target . '/' . basename($parsed->path); + open(my $fh, "<", $fname) or die "Cannot open $fname: $!"; + + while (<$fh>) { + next if /^\#/; + chomp; + my @elts = split /\|/; + + if ($elts[F_TYPE] eq 'asn' && $elts[F_START] ne '*') { + my $as_start = int($elts[F_START]); + my $as_end = $as_start + int($elts[F_VALUE]) - 1; + + for my $as ($as_start .. $as_end) { + $as_info->{$as}{'country'} = $elts[F_CC]; + $as_info->{$as}{'rir'} = $elts[F_REGISTRY]; + } + } + } +} + +# Write zone files +my $ns_list = join ' ', @{$ns_servers}; +my $zone_header = << "EOH"; +\$SOA 43200 $ns_servers->[0] support.rspamd.com 0 600 300 86400 300 +\$NS 43200 $ns_list +EOH + +if ($v4) { + # create temp file in the same dir so we can be sure that mv is atomic + my $out_dir = dirname($v4_file); + my $out_file = basename($v4_file); + my $temp_file = "$out_dir/.$out_file.tmp"; + open my $v4_fh, '>', $temp_file; + print $v4_fh $zone_header; + + while (my ($net, $asn) = each %{ $networks->{4} }) { + my $country = $as_info->{$asn}{'country'} || $unknown_placeholder; + my $rir = $as_info->{$asn}{'rir'} || $unknown_placeholder; + + # "8.8.8.0/24 15169|8.8.8.0/24|US|arin|" for 8.8.8.8 + printf $v4_fh "%s %s|%s|%s|%s|\n", $net, $asn, $net, $country, $rir; + } + + close $v4_fh; + rename $temp_file, $v4_file; +} + +if ($v6) { + my $out_dir = dirname($v6_file); + my $out_file = basename($v6_file); + my $temp_file = "$out_dir/.$out_file.tmp"; + open my $v6_fh, '>', $temp_file; + print $v6_fh $zone_header; + + while (my ($net, $asn) = each %{ $networks->{6} }) { + my $country = $as_info->{$asn}{'country'} || $unknown_placeholder; + my $rir = $as_info->{$asn}{'rir'} || $unknown_placeholder; + + # "2606:4700:4700::/48 13335|2606:4700:4700::/48|US|arin|" for 2606:4700:4700::1111 + printf $v6_fh "%s %s|%s|%s|%s|\n", $net, $asn, $net, $country, $rir; + } + + close $v6_fh; + rename $temp_file, $v6_file; +} + +exit 0; + +######################################################################## + +sub download_file { + my ($url) = @_; + + local $File::Fetch::WARN = 0; + local $File::Fetch::TIMEOUT = 180; # connectivity to ftp.lacnic.net is bad + + my $ff = File::Fetch->new(uri => $url); + my $where = $ff->fetch(to => $download_target) or + die "$url: ", $ff->error; + + return $where; +} + +# Returns true if AS number is bogus +# e. g. a private AS. +# List of allocated and reserved AS: +# https://www.iana.org/assignments/as-numbers/as-numbers.txt +sub is_bougus_asn { + my $as = shift; + + # 64496-64511 Reserved for use in documentation and sample code + # 64512-65534 Designated for private use + # 65535 Reserved + # 65536-65551 Reserved for use in documentation and sample code + # 65552-131071 Reserved + return 1 if $as >= 64496 && $as <= 131071; + + # Reserved (RFC6996, RFC7300, RFC7607) + return 1 if $as == 0 || $as >= 4200000000; + + return 0; +} + +__END__ + +=head1 NAME + +asn.pl - download and parse ASN data for Rspamd + +=head1 SYNOPSIS + +asn.pl [options] + + Options: + --download-asn Download ASN data from RIRs + --download-bgp Download BGP full view dump from RIPE RIS + --target Where to download files (default: current dir) + --zone-v4 IPv4 zone (default: asn.rspamd.com) + --zone-v6 IPv6 zone (default: asn6.rspamd.com) + --file-v4 IPv4 zone file (default: ./asn.zone) + --file-v6 IPv6 zone (default: ./asn6.zone) + --unknown-placeholder Placeholder for unknown elements (default: --) + --help Brief help message + --man Full documentation + +=head1 OPTIONS + +=over 8 + +=item B<--download-asn> + +Download ASN data from RIR. + +=item B<--download-bgp> + +Download GeoIP data from Ripe + +=item B<--target> + +Specifies where to download files. + +=item B<--help> + +Print a brief help message and exits. + +=item B<--man> + +Prints the manual page and exits. + +=back + +=head1 DESCRIPTION + +B<asn.pl> is intended to download ASN data and GeoIP data and create a rbldnsd zone. + +=cut + +# vim: et:ts=4:sw=4 diff --git a/utils/base64.c b/utils/base64.c new file mode 100644 index 0000000..d1202db --- /dev/null +++ b/utils/base64.c @@ -0,0 +1,89 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "config.h" +#include "printf.h" +#include "util.h" +#include "cryptobox.h" +#include "unix-std.h" + +static gdouble total_time = 0; + + +static void +rspamd_process_file(const gchar *fname, gint decode) +{ + gint fd; + gpointer map; + struct stat st; + guint8 *dest; + gsize destlen; + + fd = open(fname, O_RDONLY); + + if (fd == -1) { + rspamd_fprintf(stderr, "cannot open %s: %s", fname, strerror(errno)); + exit(EXIT_FAILURE); + } + + if (fstat(fd, &st) == -1) { + rspamd_fprintf(stderr, "cannot stat %s: %s", fname, strerror(errno)); + exit(EXIT_FAILURE); + } + + map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0); + close(fd); + + if (map == MAP_FAILED) { + rspamd_fprintf(stderr, "cannot mmap %s: %s", fname, strerror(errno)); + exit(EXIT_FAILURE); + } + + if (decode) { + destlen = st.st_size / 4 * 3 + 10; + dest = g_malloc(destlen); + rspamd_cryptobox_base64_decode(map, st.st_size, dest, &destlen); + } + else { + dest = rspamd_encode_base64(map, st.st_size, 80, &destlen); + } + + rspamd_printf("%*s", (gint) destlen, dest); + g_free(dest); + + munmap(map, st.st_size); +} + +int main(int argc, char **argv) +{ + gint i, start = 1, decode = 0; + + if (argc > 2 && *argv[1] == '-') { + start = 2; + + if (argv[1][1] == 'd') { + decode = 1; + } + } + + for (i = start; i < argc; i++) { + if (argv[i]) { + rspamd_process_file(argv[i], decode); + } + } + + return 0; +} diff --git a/utils/cgp_rspamd.pl b/utils/cgp_rspamd.pl new file mode 100644 index 0000000..6898d26 --- /dev/null +++ b/utils/cgp_rspamd.pl @@ -0,0 +1,357 @@ +#!/usr/bin/env perl + +use warnings; +use strict; +use JSON::XS; +use AnyEvent; +use AnyEvent::HTTP; +use AnyEvent::IO; +use EV; +use Pod::Usage; +use Getopt::Long; +use File::stat; + +my $rspamd_host = "localhost:11333"; +my $man = 0; +my $help = 0; +my $local = 0; +my $header = "X-Spam: yes"; +my $max_size = 10 * 1024 * 1024; # 10 MB +my $request_timeout = 15; # 15 seconds by default +my $reject_message = "Spam message rejected"; + +GetOptions( + "host=s" => \$rspamd_host, + "header=s" => \$header, + "reject-message=s" => \$reject_message, + "max-size=i" => \$max_size, + "timeout=f" => \$request_timeout, + "help|?" => \$help, + "man" => \$man +) or pod2usage(2); + +pod2usage(1) if $help; +pod2usage( -exitval => 0, -verbose => 2 ) if $man; + +my $main_domain = cgp_main_domain(); +my $scanned = 0; + +# Turn off bufferization as required by CGP +$| = 1; + +sub cgp_main_domain { + if ( open( my $fh, 'Settings/Main.settings' ) ) { + while (<$fh>) { + if (/^\s+DomainName\s+=\s+([^;]+);/) { + return $1; + } + } + } +} + +sub cgp_string { + my ($in) = @_; + + $in =~ s/\"/\\"/g; + $in =~ s/\n/\\n/gms; + $in =~ s/\r/\\r/mgs; + $in =~ s/\t/ /g; + + return "\"$in\""; +} + +sub rspamd_scan { + my ( $tag, $file ) = @_; + + my $http_callback = sub { + my ( $body, $hdr ) = @_; + + if ( $hdr && $hdr->{Status} =~ /^2/ ) { + my $js = eval('decode_json($body)'); + $scanned++; + + if ( !$js ) { + print "* Rspamd: Bad response for $file: invalid JSON: parse error\n"; + print "$tag FAILURE\n"; + } + else { + my $def = $js; + my $headers = ""; + + if ( !$def ) { + print "* Rspamd: Bad response for $file: invalid JSON: default is missing\n"; + print "$tag FAILURE\n"; + } + else { + my $action = $def->{'action'}; + my $id = $js->{'message-id'}; + + my $symbols = ""; + while ( my ( $k, $s ) = each( %{ $def->{'symbols'} } ) ) { + $symbols .= sprintf "%s(%.2f);", $k, $s->{'score'}; + } + + printf + "* Rspamd: Scanned %s; id: <%s>; Score: %.2f / %.2f; Symbols: [%s]\n", + $file, $id, $def->{'score'}, $def->{'required_score'}, $symbols; + + if ( $js->{'dkim-signature'} ) { + $headers .= "DKIM-Signature: " . $js->{'dkim-signature'}; + } + + if ( $js->{'milter'} ) { + my $block = $js->{'milter'}; + + if ( $block->{'add_headers'} ) { + while ( my ( $h, $v ) = each( %{ $block->{'add_headers'} } ) ) { + if ( ref($v) eq 'HASH' ) { + if ( $headers eq "" ) { + $headers .= "$h: $v->{value}"; + } + else { + $headers .= "\\e$h: $v->{value}"; + } + } + else { + if ( $headers eq "" ) { + $headers .= "$h: $v"; + } + else { + $headers .= "\\e$h: $v"; + } + } + } + } + } + + if ( $action eq 'reject' ) { + print "$tag DISCARD\n"; + return; + } + elsif ( $action eq 'add header' || $action eq 'rewrite subject' ) { + if ( $headers eq "" ) { + $headers .= "$header"; + } + else { + $headers .= "\\e$header"; + } + } + elsif ( $action eq 'soft reject' ) { + print "$tag REJECTED Try again later\n"; + return; + } + + if ( $headers eq "" ) { + print "$tag OK\n"; + } + else { + print "$tag ADDHEADER " . cgp_string($headers) . " OK\n"; + } + } + } + } + else { + if ($hdr) { + print "* Rspamd: Bad response for $file: HTTP error: $hdr->{Status} $hdr->{Reason}\n"; + } + else { + print "* Rspamd: Bad response for $file: IO error: $!\n"; + } + print "$tag FAILURE\n"; + } + }; + + if ($local) { + + # Use file scan + # XXX: not implemented now due to CGP queue format + http_get( + "http://$rspamd_host/symbols?file=$file", + timeout => $request_timeout, + $http_callback + ); + } + else { + my $sb = stat($file); + + if ( !$sb || $sb->size > $max_size ) { + if ($sb) { + print "* File $file is too large: " . $sb->size . "\n$tag FAILURE\n"; + + } + else { + print "* Cannot stat $file: $!\n$tag FAILURE\n"; + } + return; + } + aio_load( + $file, + sub { + my ($data) = @_; + + if ( !$data ) { + print "* Cannot open $file: $!\n$tag FAILURE\n"; + return; + } + + # Parse CGP format + $data =~ s/^((?:[^\n]*\n)*?)\n(.*)$/$2/ms; + my @envelope = split /\n/, $1; + chomp(@envelope); + my $from; + my @rcpts; + my $ip; + my $user; + + foreach my $elt (@envelope) { + if ( $elt =~ /^P\s[^<]*(<[^>]*>).*$/ ) { + $from = $1; + } + elsif ( $elt =~ /^R\s[^<]*(<[^>]*>).*$/ ) { + push @rcpts, $1; + } + elsif ( $elt =~ /^S (?:<([^>]+)> )?(?:SMTP|HTTPU?|AIRSYNC|XIMSS) \[([0-9a-f.:]+)\]/ ) { + if ($1) { + $user = $1; + } + if ($2) { + $ip = $2; + } + } + elsif ( $elt =~ /^S (?:<([^>]+)> )?(?:DSN|GROUP|LIST|PBX|PIPE|RULE) \[0\.0\.0\.0\]/ ) { + if ($1) { + $user = $1; + } + $ip = '127.2.4.7'; + } + } + + my $headers = {}; + if ( $file =~ /\/([^\/.]+)\.msg$/ ) { + $headers->{'Queue-ID'} = $1; + } + if ($from) { + $headers->{From} = $from; + } + if ( scalar(@rcpts) > 0 ) { + + # XXX: Anyevent cannot parse headers with multiple values + $headers->{Rcpt} = join( ',', @rcpts ); + } + if ($ip) { + $headers->{IP} = $ip; + } + if ($user) { + $headers->{User} = $user; + } + if ($main_domain) { + $headers->{'MTA-Tag'} = $main_domain; + } + + http_post( + "http://$rspamd_host/checkv2", $data, + timeout => $request_timeout, + headers => $headers, + $http_callback + ); + } + ); + } +} + +# Show informational message +print "* Rspamd CGP filter has been started\n"; + +my $w = AnyEvent->io( + fh => \*STDIN, + poll => 'r', + cb => sub { + chomp( my $input = <STDIN> ); + + if ( $input =~ /^(\d+)\s+(\S+)(\s+(\S+)\s*)?$/ ) { + my $tag = $1; + my $cmd = $2; + + if ( $cmd eq "INTF" ) { + print "$input\n"; + } + elsif ( $cmd eq "FILE" && $4 ) { + my $file = $4; + print "* Scanning file $file\n"; + rspamd_scan $tag, $file; + } + elsif ( $cmd eq "QUIT" ) { + print "* Terminating after scanning of $scanned files\n"; + print "$tag OK\n"; + exit 0; + } + else { + print "* Unknown command $cmd\n"; + print "$tag FAILURE\n"; + } + } + } +); + +EV::run; + +__END__ + +=head1 NAME + +cgp_rspamd - implements Rspamd filter for CommunigatePro MTA + +=head1 SYNOPSIS + +cgp_rspamd [options] + + Options: + --host=hostport Rspamd host to connect (localhost:11333 by default) + --header Add specific header for a spam message ("X-Spam: yes" by default) + --reject-message Rejection message for spam mail ("Spam message rejected" by default) + --timeout Timeout to read response from Rspamd (15 seconds by default) + --max-size Maximum size of message to scan (10 megabytes by default) + --help brief help message + --man full documentation + +=head1 OPTIONS + +=over 8 + +=item B<--host> + +Specifies Rspamd host to use for scanning + +=item B<--header> + +Specifies the header that should be added when Rspamd action is B<add header> or B<rewrite subject>. + +=item B<--reject-message> + +Specifies the rejection message for spam. + +=item B<--timeout> + +Sets timeout in seconds for waiting Rspamd reply for a message. + +=item B<--max-size> + +Define the maximum messages size to be processed by Rspamd in bytes. + +=item B<--help> + +Print a brief help message and exits. + +=item B<--man> + +Prints the manual page and exits. + +=back + +=head1 DESCRIPTION + +B<cgp_rspamd> is intended to scan messages processed with B<CommunigatePro> MTA on some Rspamd scanner. It reads +standard input and parses CGP helpers protocol. On scan requests, this filter can query Rspamd to process a message. +B<cgp_rspamd> can tell CGP to add header or reject SPAM messages depending on Rspamd scan result. + +=cut diff --git a/utils/classifier_test.pl b/utils/classifier_test.pl new file mode 100644 index 0000000..238417f --- /dev/null +++ b/utils/classifier_test.pl @@ -0,0 +1,539 @@ +#!/usr/bin/env perl + +use warnings; +use strict; +use Pod::Usage; +use Getopt::Long; +use Time::HiRes qw(gettimeofday tv_interval); +use JSON::XS; +use String::ShellQuote; +use FileHandle; +use IPC::Open2; +use Data::Dumper; + +my $spam_dir; +my $ham_dir; +my $parallel = 1; +my $classifier = "bayes"; +my $spam_symbol = "BAYES_SPAM"; +my $ham_symbol = "BAYES_HAM"; +my $timeout = 10; +my $rspamc = $ENV{'RSPAMC'} || "rspamc"; +my $bogofilter = $ENV{'BOGOFILTER'} || "bogofilter"; +my $dspam = $ENV{'DSPAM'} || "dspam"; +my $train_fraction = 0.5; +my $use_bogofilter = 0; +my $use_dspam = 0; +my $check_only = 0; +my $rspamc_prob_trigger = 95; +my $man; +my $help; + +GetOptions( + "spam|s=s" => \$spam_dir, + "ham|h=s" => \$ham_dir, + "spam-symbol=s" => \$spam_symbol, + "ham-symbol=s" => \$ham_symbol, + "classifier|c=s" => \$classifier, + "timeout|t=f" => \$timeout, + "parallel|p=i" => \$parallel, + "train-fraction|t=f" => \$train_fraction, + "bogofilter|b" => \$use_bogofilter, + "dspam|d" => \$use_dspam, + "check-only" => \$check_only, + "help|?" => \$help, + "man" => \$man +) or pod2usage(2); + +pod2usage(1) if $help; +pod2usage( -exitval => 0, -verbose => 2 ) if $man; + +sub read_dir_files { + my ( $dir, $target ) = @_; + opendir( my $dh, $dir ) or die "cannot open dir $dir: $!"; + while ( my $file = readdir $dh ) { + if ( -f "$dir/$file" ) { + push @{$target}, "$dir/$file"; + } + } +} + +sub shuffle_array { + my ($ar) = @_; + + for ( my $i = 0 ; $i < scalar @{$ar} ; $i++ ) { + if ( $i > 1 ) { + my $sel = int( rand( $i - 1 ) ); + ( @{$ar}[$i], @{$ar}[$sel] ) = ( @{$ar}[$sel], @{$ar}[$i] ); + } + } +} + +sub learn_rspamc { + my ( $files, $spam ) = @_; + my $processed = 0; + + my $cmd = $spam ? "learn_spam" : "learn_ham"; + my $args_quoted = shell_quote @{$files}; + open( my $p, "$rspamc -t $timeout -c $classifier --compact -j -n $parallel $cmd $args_quoted |" ) + or die "cannot spawn $rspamc: $!"; + + while (<$p>) { + my $res = eval('decode_json($_)'); + if ( $res && $res->{'success'} ) { + $processed++; + } + } + + return $processed; +} + +sub learn_bogofilter { + my ( $files, $spam ) = @_; + my $processed = 0; + + foreach my $f ( @{$files} ) { + my $args_quoted = shell_quote $f; + my $fl = $spam ? "-s" : "-n"; + `$bogofilter -I $args_quoted $fl`; + if ( $? == 0 ) { + $processed++; + } + } + + return $processed; +} + +sub learn_dspam { + my ( $files, $spam ) = @_; + my $processed = 0; + + foreach my $f ( @{$files} ) { + my $args_quoted = shell_quote $f; + my $fl = $spam ? "--class=spam" : "--class=innocent"; + open( my $p, "|$dspam --user nobody --source=corpus --stdout --mode=toe $fl" ) + or die "cannot run $dspam: $!"; + + open( my $inp, "< $f" ); + while (<$inp>) { + print $p $_; + } + } + + return $processed; +} + +sub learn_samples { + my ( $ar_ham, $ar_spam ) = @_; + my $len; + my $processed = 0; + my $total = 0; + my $learn_func; + + my @files_spam; + my @files_ham; + + if ($use_dspam) { + $learn_func = \&learn_dspam; + } + elsif ($use_bogofilter) { + $learn_func = \&learn_bogofilter; + } + else { + $learn_func = \&learn_rspamc; + } + + $len = int( scalar @{$ar_ham} * $train_fraction ); + my @cur_vec; + + # Shuffle spam and ham samples + for ( my $i = 0 ; $i < $len ; $i++ ) { + if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) { + push @cur_vec, @{$ar_ham}[$i]; + push @files_ham, [@cur_vec]; + @cur_vec = (); + $total++; + } + else { + push @cur_vec, @{$ar_ham}[$i]; + } + } + + $len = int( scalar @{$ar_spam} * $train_fraction ); + @cur_vec = (); + for ( my $i = 0 ; $i < $len ; $i++ ) { + if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) { + push @cur_vec, @{$ar_spam}[$i]; + push @files_spam, [@cur_vec]; + @cur_vec = (); + $total++; + } + else { + push @cur_vec, @{$ar_spam}[$i]; + } + } + + for ( my $i = 0 ; $i < $total ; $i++ ) { + my $args; + my $spam; + + if ( $i % 2 == 0 ) { + $args = pop @files_spam; + + if ( !$args ) { + $args = pop @files_ham; + $spam = 0; + } + else { + $spam = 1; + } + } + else { + $args = pop @files_ham; + if ( !$args ) { + $args = pop @files_spam; + $spam = 1; + } + else { + $spam = 0; + } + } + + my $r = $learn_func->( $args, $spam ); + if ($r) { + $processed += $r; + } + } + + return $processed; +} + +sub check_rspamc { + my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; + + my $args_quoted = shell_quote @{$files}; + my $processed = 0; + + open( + my $p, +"$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted |" + ) or die "cannot spawn $rspamc: $!"; + + while (<$p>) { + my $res = eval('decode_json($_)'); + if ( $res && $res->{'default'} ) { + $processed++; + + if ($spam) { + if ( $res->{'default'}->{$ham_symbol} ) { + my $m = $res->{'default'}->{$ham_symbol}->{'options'}->[0]; + if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) { + my $percentage = int($1); + if ( $percentage >= $rspamc_prob_trigger ) { + $$fp_cnt++; + } + } + else { + $$fp_cnt++; + } + } + elsif ( !$res->{'default'}->{$spam_symbol} ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } + else { + if ( $res->{'default'}->{$spam_symbol} ) { + my $m = $res->{'default'}->{$spam_symbol}->{'options'}->[0]; + if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) { + + my $percentage = int($1); + if ( $percentage >= $rspamc_prob_trigger ) { + $$fp_cnt++; + } + } + else { + $$fp_cnt++; + } + } + elsif ( !$res->{'default'}->{$ham_symbol} ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } + } + } + + return $processed; +} + +sub check_bogofilter { + my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; + my $processed = 0; + + foreach my $f ( @{$files} ) { + my $args_quoted = shell_quote $f; + + open( my $p, "$bogofilter -t -I $args_quoted |" ) + or die "cannot spawn $bogofilter: $!"; + + while (<$p>) { + if ( $_ =~ /^([SHU])\s+.*$/ ) { + $processed++; + + if ($spam) { + if ( $1 eq 'H' ) { + $$fp_cnt++; + } + elsif ( $1 eq 'U' ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } + else { + if ( $1 eq 'S' ) { + $$fp_cnt++; + } + elsif ( $1 eq 'U' ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } + } + } + } + + return $processed; +} + +sub check_dspam { + my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; + my $processed = 0; + + foreach my $f ( @{$files} ) { + my $args_quoted = shell_quote $f; + + my $pid = open2( *Reader, *Writer, "$dspam --user nobody --classify --stdout --mode=notrain" ); + open( my $inp, "< $f" ); + while (<$inp>) { + print Writer $_; + } + close Writer; + + while (<Reader>) { + if ( $_ =~ qr(^X-DSPAM-Result: nobody; result="([^"]+)"; class="[^"]+"; probability=(\d+(?:\.\d+)?).*$) ) { + $processed++; + my $percentage = int( $2 * 100.0 ); + + if ($spam) { + if ( $1 eq 'Innocent' ) { + if ( $percentage <= ( 100 - $rspamc_prob_trigger ) ) { + $$fp_cnt++; + } + } + elsif ( $1 ne 'Spam' ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } + else { + if ( $1 eq 'Spam' ) { + if ( $percentage >= $rspamc_prob_trigger ) { + $$fp_cnt++; + } + } + elsif ( $1 ne 'Innocent' ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } + } + } + close Reader; + waitpid( $pid, 0 ); + } + + return $processed; +} + +sub cross_validate { + my ($hr) = @_; + my $args = ""; + my $processed = 0; + my $fp_spam = 0; + my $fn_spam = 0; + my $fp_ham = 0; + my $fn_ham = 0; + my $total_spam = 0; + my $total_ham = 0; + my $detected_spam = 0; + my $detected_ham = 0; + my $i = 0; + my $len = scalar keys %{$hr}; + my @files_spam; + my @files_ham; + my @cur_spam; + my @cur_ham; + my $check_func; + + if ($use_dspam) { + $check_func = \&check_dspam; + } + elsif ($use_bogofilter) { + $check_func = \&check_bogofilter; + } + else { + $check_func = \&check_rspamc; + } + + while ( my ( $fn, $spam ) = each( %{$hr} ) ) { + if ($spam) { + if ( scalar @cur_spam >= $parallel || $i == $len - 1 ) { + push @cur_spam, $fn; + push @files_spam, [@cur_spam]; + @cur_spam = (); + } + else { + push @cur_spam, $fn; + } + } + else { + if ( scalar @cur_ham >= $parallel || $i == $len - 1 ) { + push @cur_ham, $fn; + push @files_ham, [@cur_ham]; + @cur_ham = (); + } + else { + push @cur_ham, $fn; + } + } + } + + shuffle_array( \@files_spam ); + + foreach my $fn (@files_spam) { + my $r = $check_func->( $fn, 1, \$fp_ham, \$fn_spam, \$detected_spam ); + $total_spam += $r; + $processed += $r; + } + + shuffle_array( \@files_ham ); + + foreach my $fn (@files_ham) { + my $r = $check_func->( $fn, 0, \$fp_spam, \$fn_ham, \$detected_ham ); + $total_ham += $r; + $processed += $r; + } + + printf "Scanned %d messages +%d spam messages (%d detected) +%d ham messages (%d detected)\n", $processed, $total_spam, $detected_spam, $total_ham, $detected_ham; + + printf "\nHam FP rate: %.2f%% (%d messages) +Ham FN rate: %.2f%% (%d messages)\n", $fp_ham / $total_ham * 100.0, $fp_ham, $fn_ham / $total_ham * 100.0, $fn_ham; + + printf "\nSpam FP rate: %.2f%% (%d messages) +Spam FN rate: %.2f%% (%d messages)\n", + $fp_spam / $total_spam * 100.0, $fp_spam, + $fn_spam / $total_spam * 100.0, $fn_spam; +} + +if ( !$spam_dir || !$ham_dir ) { + die "spam or/and ham directories are not specified"; +} + +my @spam_samples; +my @ham_samples; + +read_dir_files( $spam_dir, \@spam_samples ); +read_dir_files( $ham_dir, \@ham_samples ); +shuffle_array( \@spam_samples ); +shuffle_array( \@ham_samples ); + +if ( !$check_only ) { + my $learned = 0; + my $t0 = [gettimeofday]; + $learned = learn_samples( \@ham_samples, \@spam_samples ); + my $t1 = [gettimeofday]; + + printf "Learned classifier, %d items processed, %.2f seconds elapsed\n", $learned, tv_interval( $t0, $t1 ); +} + +my %validation_set; +my $len = int( scalar @spam_samples * $train_fraction ); +for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) { + $validation_set{ $spam_samples[$i] } = 1; +} + +$len = int( scalar @ham_samples * $train_fraction ); +for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) { + $validation_set{ $ham_samples[$i] } = 0; +} + +cross_validate( \%validation_set ); + +__END__ + +=head1 NAME + +classifier_test.pl - test various parameters for a classifier + +=head1 SYNOPSIS + +classifier_test.pl [options] + + Options: + --spam Directory with spam files + --ham Directory with ham files + --spam-symbol Symbol for spam (default: BAYES_SPAM) + --ham-symbol Symbol for ham (default: BAYES_HAM) + --classifier Classifier to test (default: bayes) + --timeout Timeout for rspamc (default: 10) + --parallel Parallel execution (default: 1) + --help Brief help message + --man Full documentation + +=head1 OPTIONS + +=over 8 + +=item B<--spam> + +Directory with spam files. + +=item B<--ham> + +Directory with ham files. + +=item B<--classifier> + +Specifies classifier name to test. + +=item B<--help> + +Print a brief help message and exits. + +=item B<--man> + +Prints the manual page and exits. + +=back + +=head1 DESCRIPTION + +B<classifier_test.pl> is intended to test Rspamd classifier for false positives, false negatives and other parameters. +It uses half of the corpus for training and half for cross-validation. + +=cut diff --git a/utils/fann_train.pl b/utils/fann_train.pl new file mode 100755 index 0000000..2ce422e --- /dev/null +++ b/utils/fann_train.pl @@ -0,0 +1,247 @@ +#!/usr/bin/env perl + +# This script is a very simple prototype to learn fann from rspamd logs +# For now, it is intended for internal use only + +use strict; +use warnings FATAL => 'all'; +use AI::FANN qw(:all); +use Getopt::Std; + +my %sym_idx; # Symbols by index +my %sym_names; # Symbols by name +my $num = 1; # Number of symbols +my @spam; +my @ham; +my $max_samples = -1; +my $split = 1; +my $preprocessed = 0; # output is in format <score>:<0|1>:<SYM1,...SYMN> +my $score_spam = 12; +my $score_ham = -6; + +sub process { + my ( $input, $spam, $ham ) = @_; + my $samples = 0; + + while (<$input>) { + if ( !$preprocessed ) { + if (/^.*rspamd_task_write_log.*: \[(-?\d+\.?\d*)\/(\d+\.?\d*)\]\s*\[(.+)\].*$/) { + if ( $1 > $score_spam ) { + $_ = "$1:1: $3"; + } + elsif ( $1 < $score_ham ) { + $_ = "$1:0: $3\n"; + } + else { + # Out of boundary + next; + } + } + else { + # Not our log message + next; + } + } + + $_ =~ /^(-?\d+\.?\d*):([01]):\s*(\S.*)$/; + + my $is_spam = 0; + + if ( $2 == 1 ) { + $is_spam = 1; + } + + my @ar = split /,/, $3; + my %sample; + + foreach my $sym (@ar) { + chomp $sym; + if ( !$sym_idx{$sym} ) { + $sym_idx{$sym} = $num; + $sym_names{$num} = $sym; + $num++; + } + + $sample{ $sym_idx{$sym} } = 1; + } + + if ($is_spam) { + push @{$spam}, \%sample; + } + else { + push @{$ham}, \%sample; + } + + $samples++; + if ( $max_samples > 0 && $samples > $max_samples ) { + return; + } + } +} + +# Shuffle array +sub fisher_yates_shuffle { + my $array = shift; + my $i = @$array; + + while ( --$i ) { + my $j = int rand( $i + 1 ); + @$array[ $i, $j ] = @$array[ $j, $i ]; + } +} + +# Train network +sub train { + my ( $ann, $sample, $result ) = @_; + + my @row; + + for ( my $i = 1 ; $i < $num ; $i++ ) { + if ( $sample->{$i} ) { + push @row, 1; + } + else { + push @row, 0; + } + } + + #print "@row -> @{$result}\n"; + + $ann->train( \@row, \@{$result} ); +} + +sub test { + my ( $ann, $sample ) = @_; + + my @row; + + for ( my $i = 1 ; $i < $num ; $i++ ) { + if ( $sample->{$i} ) { + push @row, 1; + } + else { + push @row, 0; + } + } + + my $ret = $ann->run( \@row ); + + return $ret; +} + +my %opts; +getopts( 'o:i:s:n:t:hpS:H:', \%opts ); + +if ( $opts{'h'} ) { + print "$0 [-i input] [-o output] [-s scores] [-n max_samples] [-S spam_score] [-H ham_score] [-ph]\n"; + exit; +} + +my $input = *STDIN; + +if ( $opts{'i'} ) { + open( $input, '<', $opts{'i'} ) or die "cannot open $opts{i}"; +} + +if ( $opts{'n'} ) { + $max_samples = $opts{'n'}; +} + +if ( $opts{'t'} ) { + + # Test split + $split = $opts{'t'}; +} +if ( $opts{'p'} ) { + $preprocessed = 1; +} + +if ( $opts{'H'} ) { + $score_ham = $opts{'H'}; +} + +if ( $opts{'S'} ) { + $score_spam = $opts{'S'}; +} + +# ham_prob, spam_prob +my @spam_out = (1); +my @ham_out = (0); + +process( $input, \@spam, \@ham ); +fisher_yates_shuffle( \@spam ); +fisher_yates_shuffle( \@ham ); + +my $nspam = int( scalar(@spam) / $split ); +my $nham = int( scalar(@ham) / $split ); + +my $ann = AI::FANN->new_standard( $num - 1, ( $num + 2 ) / 2, 1 ); + +my @train_data; + +# Train ANN +for ( my $i = 0 ; $i < $nham ; $i++ ) { + push @train_data, [ $ham[$i], \@ham_out ]; +} + +for ( my $i = 0 ; $i < $nspam ; $i++ ) { + push @train_data, [ $spam[$i], \@spam_out ]; +} + +fisher_yates_shuffle( \@train_data ); + +foreach my $train_row (@train_data) { + train( $ann, @{$train_row}[0], @{$train_row}[1] ); +} + +print "Trained $nspam SPAM and $nham HAM samples\n"; + +# Now run fann +if ( $split > 1 ) { + my $sample = 0.0; + my $correct = 0.0; + for ( my $i = $nham ; $i < $nham * $split ; $i++ ) { + my $ret = test( $ann, $ham[$i] ); + + #print "@{$ret}\n"; + if ( @{$ret}[0] < 0.5 ) { + $correct++; + } + $sample++; + } + + print "Tested $sample HAM samples, correct matched: $correct, rate: " . ( $correct / $sample ) . "\n"; + + $sample = 0.0; + $correct = 0.0; + + for ( my $i = $nspam ; $i < $nspam * $split ; $i++ ) { + my $ret = test( $ann, $spam[$i] ); + + #print "@{$ret}\n"; + if ( @{$ret}[0] > 0.5 ) { + $correct++; + } + $sample++; + } + + print "Tested $sample SPAM samples, correct matched: $correct, rate: " . ( $correct / $sample ) . "\n"; +} + +if ( $opts{'o'} ) { + $ann->save( $opts{'o'} ) or die "cannot save ann into $opts{o}"; +} + +if ( $opts{'s'} ) { + open( my $scores, '>', $opts{'s'} ) or die "cannot open score file $opts{'s'}"; + print $scores "{"; + for ( my $i = 1 ; $i < $num ; $i++ ) { + my $n = $i - 1; + if ( $i != $num - 1 ) { + print $scores "\"$sym_names{$i}\":$n,"; + } + else { + print $scores "\"$sym_names{$i}\":$n}\n"; + } + } +} diff --git a/utils/rspamd_http_bench.c b/utils/rspamd_http_bench.c new file mode 100644 index 0000000..232fc8a --- /dev/null +++ b/utils/rspamd_http_bench.c @@ -0,0 +1,411 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "config.h" +#include "rspamd.h" +#include "util.h" +#include "libutil/http.h" +#include "libutil/http_private.h" +#include "ottery.h" +#include "cryptobox.h" +#include "unix-std.h" +#include <math.h> +#include <netinet/tcp.h> + +#ifdef HAVE_SYS_WAIT_H +#include <sys/wait.h> +#endif + +static guint port = 43000; +static gchar *host = "127.0.0.1"; +static gchar *server_key = NULL; +static guint cache_size = 10; +static guint nworkers = 1; +static gboolean openssl_mode = FALSE; +static guint file_size = 500; +static guint pconns = 100; +static gdouble test_time = 10.0; +static gchar *latencies_file = NULL; +static gboolean csv_output = FALSE; + +/* Dynamic vars */ +static rspamd_inet_addr_t *addr; +static guint32 workers_left = 0; +static guint32 *conns_done = NULL; +static const guint store_latencies = 1000; +static guint32 conns_pending = 0; + +static GOptionEntry entries[] = { + {"port", 'p', 0, G_OPTION_ARG_INT, &port, + "Port number (default: 43000)", NULL}, + {"cache", 'c', 0, G_OPTION_ARG_INT, &cache_size, + "Keys cache size (default: 10)", NULL}, + {"workers", 'n', 0, G_OPTION_ARG_INT, &nworkers, + "Number of workers to start (default: 1)", NULL}, + {"size", 's', 0, G_OPTION_ARG_INT, &file_size, + "Size of payload to transfer (default: 500)", NULL}, + {"conns", 'C', 0, G_OPTION_ARG_INT, &pconns, + "Number of parallel connections (default: 100)", NULL}, + {"time", 't', 0, G_OPTION_ARG_DOUBLE, &test_time, + "Time to run tests (default: 10.0 sec)", NULL}, + {"openssl", 'o', 0, G_OPTION_ARG_NONE, &openssl_mode, + "Use openssl crypto", NULL}, + {"host", 'h', 0, G_OPTION_ARG_STRING, &host, + "Connect to the specified host (default: localhost)", NULL}, + {"key", 'k', 0, G_OPTION_ARG_STRING, &server_key, + "Use the specified key (base32 encoded)", NULL}, + {"latency", 'l', 0, G_OPTION_ARG_FILENAME, &latencies_file, + "Write latencies to the specified file", NULL}, + {"csv", 0, 0, G_OPTION_ARG_NONE, &csv_output, + "Output CSV", NULL}, + {NULL, 0, 0, G_OPTION_ARG_NONE, NULL, NULL, NULL}}; + +struct lat_elt { + gdouble lat; + guchar checked; +}; + +static struct lat_elt *latencies; + +static gint +rspamd_client_body(struct rspamd_http_connection *conn, + struct rspamd_http_message *msg, + const gchar *chunk, gsize len) +{ + g_assert(chunk[0] == '\0'); + + return 0; +} + +struct client_cbdata { + struct lat_elt *lat; + guint32 *wconns; + gdouble ts; + struct ev_loop *ev_base; +}; + +static void +rspamd_client_err(struct rspamd_http_connection *conn, GError *err) +{ + msg_info("abnormally closing connection from: error: %s", + err->message); + + g_assert(0); + close(conn->fd); + rspamd_http_connection_unref(conn); +} + +static gint +rspamd_client_finish(struct rspamd_http_connection *conn, + struct rspamd_http_message *msg) +{ + struct client_cbdata *cb = conn->ud; + + cb->lat->lat = rspamd_get_ticks() - cb->ts; + cb->lat->checked = TRUE; + (*cb->wconns)++; + conns_pending--; + close(conn->fd); + rspamd_http_connection_unref(conn); + g_free(cb); + + if (conns_pending == 0) { + event_base_loopexit(cb->ev_base, NULL); + } + + return 0; +} + +static void +rspamd_http_client_func(struct ev_loop *ev_base, struct lat_elt *latency, + guint32 *wconns, + struct rspamd_cryptobox_pubkey *peer_key, + struct rspamd_cryptobox_keypair *client_key, + struct rspamd_keypair_cache *c) +{ + struct rspamd_http_message *msg; + struct rspamd_http_connection *conn; + gchar urlbuf[PATH_MAX]; + struct client_cbdata *cb; + gint fd, flags; + + fd = rspamd_inet_address_connect(addr, SOCK_STREAM, TRUE); + g_assert(fd != -1); + flags = 1; + (void) setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &flags, sizeof(flags)); + conn = rspamd_http_connection_new(rspamd_client_body, + rspamd_client_err, + rspamd_client_finish, + RSPAMD_HTTP_CLIENT_SIMPLE, + RSPAMD_HTTP_CLIENT, + c, + NULL); + rspamd_snprintf(urlbuf, sizeof(urlbuf), "http://%s/%d", host, file_size); + msg = rspamd_http_message_from_url(urlbuf); + + g_assert(conn != NULL && msg != NULL); + + if (peer_key != NULL) { + g_assert(client_key != NULL); + rspamd_http_connection_set_key(conn, client_key); + msg->peer_key = rspamd_pubkey_ref(peer_key); + } + + cb = g_malloc(sizeof(*cb)); + cb->ts = rspamd_get_ticks(); + cb->lat = latency; + cb->ev_base = ev_base; + cb->wconns = wconns; + latency->checked = FALSE; + rspamd_http_connection_write_message(conn, msg, NULL, NULL, cb, + fd, NULL, ev_base); +} + +static void +rspamd_worker_func(struct lat_elt *plat, guint32 *wconns) +{ + guint i, j; + struct ev_loop *ev_base; + struct itimerval itv; + struct rspamd_keypair_cache *c = NULL; + struct rspamd_cryptobox_keypair *client_key = NULL; + struct rspamd_cryptobox_pubkey *peer_key = NULL; + + if (server_key) { + peer_key = rspamd_pubkey_from_base32(server_key, 0, RSPAMD_KEYPAIR_KEX, + openssl_mode ? RSPAMD_CRYPTOBOX_MODE_NIST : RSPAMD_CRYPTOBOX_MODE_25519); + g_assert(peer_key != NULL); + client_key = rspamd_keypair_new(RSPAMD_KEYPAIR_KEX, + openssl_mode ? RSPAMD_CRYPTOBOX_MODE_NIST : RSPAMD_CRYPTOBOX_MODE_25519); + + if (cache_size > 0) { + c = rspamd_keypair_cache_new(cache_size); + } + } + + memset(&itv, 0, sizeof(itv)); + double_to_tv(test_time, &itv.it_value); + + ev_base = event_init(); + g_assert(setitimer(ITIMER_REAL, &itv, NULL) != -1); + + for (i = 0;; i = (i + 1) % store_latencies) { + for (j = 0; j < pconns; j++) { + rspamd_http_client_func(ev_base, &plat[i * pconns + j], + wconns, peer_key, client_key, c); + } + + conns_pending = pconns; + + event_base_loop(ev_base, 0); + } +} + +static int +cmpd(const void *p1, const void *p2) +{ + const struct lat_elt *d1 = p1, *d2 = p2; + + return (d1->lat) - (d2->lat); +} + +double +rspamd_http_calculate_mean(struct lat_elt *lats, double *std) +{ + guint i, cnt, checked = 0; + gdouble mean = 0., dev = 0.; + + cnt = store_latencies * pconns; + qsort(lats, cnt, sizeof(*lats), cmpd); + + for (i = 0; i < cnt; i++) { + if (lats[i].checked) { + mean += lats[i].lat; + checked++; + } + } + + g_assert(checked > 0); + mean /= checked; + + for (i = 0; i < cnt; i++) { + if (lats[i].checked) { + dev += pow((lats[i].lat - mean), 2); + } + } + + dev /= checked; + + *std = sqrt(dev); + return mean; +} + +static void +rspamd_http_start_workers(pid_t *sfd) +{ + guint i; + for (i = 0; i < nworkers; i++) { + sfd[i] = fork(); + g_assert(sfd[i] != -1); + + if (sfd[i] == 0) { + gperf_profiler_init(NULL, "http-bench"); + rspamd_worker_func(&latencies[i * pconns * store_latencies], + &conns_done[i]); + gperf_profiler_stop(); + exit(EXIT_SUCCESS); + } + + workers_left++; + } +} + +static void +rspamd_http_stop_workers(pid_t *sfd) +{ + guint i; + gint res; + + for (i = 0; i < nworkers; i++) { + kill(sfd[i], SIGTERM); + wait(&res); + } +} + +static void +rspamd_http_bench_term(int fd, short what, void *arg) +{ + pid_t *sfd = arg; + + rspamd_http_stop_workers(sfd); + event_loopexit(NULL); +} + +static void +rspamd_http_bench_cld(int fd, short what, void *arg) +{ + gint res; + + while (waitpid(-1, &res, WNOHANG) > 0) { + if (--workers_left == 0) { + event_loopexit(NULL); + } + } +} + + +int main(int argc, char **argv) +{ + GOptionContext *context; + GError *error = NULL; + pid_t *sfd; + struct ev_loop *ev_base; + rspamd_mempool_t *pool = rspamd_mempool_new(8192, "http-bench"); + struct event term_ev, int_ev, cld_ev; + guint64 total_done; + FILE *lat_file; + gdouble mean, std; + guint i; + + rspamd_init_libs(); + + context = g_option_context_new( + "rspamd-http-bench - test server for benchmarks"); + g_option_context_set_summary(context, + "Summary:\n Rspamd test HTTP benchmark " RVERSION + "\n Release id: " RID); + g_option_context_add_main_entries(context, entries, NULL); + + if (!g_option_context_parse(context, &argc, &argv, &error)) { + rspamd_fprintf(stderr, "option parsing failed: %s\n", error->message); + g_error_free(error); + exit(EXIT_FAILURE); + } + + rspamd_parse_inet_address(&addr, host, 0); + g_assert(addr != NULL); + rspamd_inet_address_set_port(addr, port); + + latencies = rspamd_mempool_alloc_shared(pool, + nworkers * pconns * store_latencies * sizeof(*latencies)); + sfd = g_malloc(sizeof(*sfd) * nworkers); + conns_done = rspamd_mempool_alloc_shared(pool, sizeof(guint32) * nworkers); + memset(conns_done, 0, sizeof(guint32) * nworkers); + + rspamd_http_start_workers(sfd); + + ev_base = event_init(); + + event_set(&term_ev, SIGTERM, EV_SIGNAL, rspamd_http_bench_term, sfd); + event_base_set(ev_base, &term_ev); + event_add(&term_ev, NULL); + event_set(&int_ev, SIGINT, EV_SIGNAL, rspamd_http_bench_term, sfd); + event_base_set(ev_base, &int_ev); + event_add(&int_ev, NULL); + event_set(&cld_ev, SIGCHLD, EV_SIGNAL | EV_PERSIST, + rspamd_http_bench_cld, NULL); + event_base_set(ev_base, &cld_ev); + event_add(&cld_ev, NULL); + + event_base_loop(ev_base, 0); + + total_done = 0; + for (i = 0; i < nworkers; i++) { + total_done += conns_done[i]; + } + + mean = rspamd_http_calculate_mean(latencies, &std); + + if (!csv_output) { + rspamd_printf( + "Made %L connections of size %d in %.6fs, %.6f cps, %.6f MB/sec\n", + total_done, + file_size, + test_time, + total_done / test_time, + total_done * file_size / test_time / (1024.0 * 1024.0)); + rspamd_printf("Latency: %.6f ms mean, %.6f dev\n", + mean * 1000.0, std * 1000.0); + } + else { + /* size,connections,time,mean,stddev,conns,workers */ + rspamd_printf("%ud,%L,%.1f,%.6f,%.6f,%ud,%ud\n", + file_size, + total_done, + test_time, + mean * 1000.0, + std * 1000.0, + pconns, + nworkers); + } + + if (latencies_file) { + lat_file = fopen(latencies_file, "w"); + + if (lat_file) { + for (i = 0; i < store_latencies * pconns; i++) { + if (latencies[i].checked) { + rspamd_fprintf(lat_file, "%.6f\n", latencies[i].lat); + } + } + + fclose(lat_file); + } + } + + rspamd_mempool_delete(pool); + + return 0; +} diff --git a/utils/rspamd_http_server.c b/utils/rspamd_http_server.c new file mode 100644 index 0000000..ecd1d38 --- /dev/null +++ b/utils/rspamd_http_server.c @@ -0,0 +1,300 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "config.h" +#include "rspamd.h" +#include "util.h" +#include "libutil/fstring.h" +#include "libutil/http.h" +#include "libutil/http_private.h" +#include "ottery.h" +#include "cryptobox.h" +#include "keypair.h" +#include "unix-std.h" +#include <math.h> + +#ifdef HAVE_SYS_WAIT_H +#include <sys/wait.h> +#endif + +static guint port = 43000; +static guint cache_size = 10; +static guint nworkers = 1; +static gboolean openssl_mode = FALSE; +static GHashTable *maps = NULL; +static gchar *key = NULL; +static struct rspamd_keypair_cache *c; +static struct rspamd_cryptobox_keypair *server_key; +static struct timeval io_tv = { + .tv_sec = 20, + .tv_usec = 0}; + +static GOptionEntry entries[] = { + {"port", 'p', 0, G_OPTION_ARG_INT, &port, + "Port number (default: 43000)", NULL}, + {"cache", 'c', 0, G_OPTION_ARG_INT, &cache_size, + "Keys cache size (default: 10)", NULL}, + {"workers", 'n', 0, G_OPTION_ARG_INT, &nworkers, + "Number of workers to start (default: 1)", NULL}, + {"openssl", 'o', 0, G_OPTION_ARG_NONE, &openssl_mode, + "Use openssl crypto", NULL}, + {"key", 'k', 0, G_OPTION_ARG_STRING, &key, + "Use static keypair instead of new one (base32 encoded sk || pk)", NULL}, + {NULL, 0, 0, G_OPTION_ARG_NONE, NULL, NULL, NULL}}; + +struct rspamd_http_server_session { + struct rspamd_http_connection *conn; + struct ev_loop *ev_base; + guint req_size; + gboolean reply; + gint fd; +}; + +static void +rspamd_server_error(struct rspamd_http_connection *conn, + GError *err) +{ + struct rspamd_http_server_session *session = conn->ud; + + rspamd_fprintf(stderr, "http error occurred: %s\n", err->message); + rspamd_http_connection_unref(conn); + close(session->fd); + g_slice_free1(sizeof(*session), session); +} + +static int +rspamd_server_finish(struct rspamd_http_connection *conn, + struct rspamd_http_message *msg) +{ + struct rspamd_http_server_session *session = conn->ud; + struct rspamd_http_message *reply; + gulong size; + const gchar *url_str; + guint url_len; + rspamd_fstring_t *body; + + if (!session->reply) { + session->reply = TRUE; + reply = rspamd_http_new_message(HTTP_RESPONSE); + url_str = msg->url->str; + url_len = msg->url->len; + + if (url_str[0] == '/') { + url_str++; + url_len--; + } + + if (rspamd_strtoul(url_str, url_len, &size)) { + session->req_size = size; + + reply->code = 200; + reply->status = rspamd_fstring_new_init("OK", 2); + body = rspamd_fstring_sized_new(size); + body->len = size; + memset(body->str, 0, size); + rspamd_http_message_set_body_from_fstring_steal(msg, body); + } + else { + reply->code = 404; + reply->status = rspamd_fstring_new_init("Not found", 9); + } + + rspamd_http_connection_reset(conn); + rspamd_http_connection_write_message(conn, reply, NULL, + "application/octet-stream", session, session->fd, + &io_tv, session->ev_base); + } + else { + /* Destroy session */ + rspamd_http_connection_unref(conn); + close(session->fd); + g_slice_free1(sizeof(*session), session); + } + + return 0; +} + +static void +rspamd_server_accept(gint fd, short what, void *arg) +{ + struct ev_loop *ev_base = arg; + struct rspamd_http_server_session *session; + rspamd_inet_addr_t *addr; + gint nfd; + + do { + if ((nfd = + rspamd_accept_from_socket(fd, &addr, NULL)) == -1) { + rspamd_fprintf(stderr, "accept failed: %s", strerror(errno)); + return; + } + /* Check for EAGAIN */ + if (nfd == 0) { + rspamd_inet_address_free(addr); + return; + } + + rspamd_inet_address_free(addr); + session = g_slice_alloc(sizeof(*session)); + session->conn = rspamd_http_connection_new(NULL, + rspamd_server_error, + rspamd_server_finish, + 0, + RSPAMD_HTTP_SERVER, + c, + NULL); + rspamd_http_connection_set_key(session->conn, server_key); + rspamd_http_connection_read_message(session->conn, + session, + nfd, + &io_tv, + ev_base); + session->reply = FALSE; + session->fd = nfd; + session->ev_base = ev_base; + } while (nfd > 0); +} + +static void +rspamd_http_term_handler(gint fd, short what, void *arg) +{ + struct ev_loop *ev_base = arg; + struct timeval tv = {0, 0}; + + event_base_loopexit(ev_base, &tv); +} + +static void +rspamd_http_server_func(gint fd, rspamd_inet_addr_t *addr) +{ + struct ev_loop *ev_base = event_init(); + struct event accept_ev, term_ev; + + event_set(&accept_ev, fd, EV_READ | EV_PERSIST, rspamd_server_accept, ev_base); + event_base_set(ev_base, &accept_ev); + event_add(&accept_ev, NULL); + + evsignal_set(&term_ev, SIGTERM, rspamd_http_term_handler, ev_base); + event_base_set(ev_base, &term_ev); + event_add(&term_ev, NULL); + + event_base_loop(ev_base, 0); +} + +static void +rspamd_http_start_servers(pid_t *sfd, rspamd_inet_addr_t *addr) +{ + guint i; + gint fd; + + fd = rspamd_inet_address_listen(addr, SOCK_STREAM, TRUE); + g_assert(fd != -1); + + for (i = 0; i < nworkers; i++) { + sfd[i] = fork(); + g_assert(sfd[i] != -1); + + if (sfd[i] == 0) { + rspamd_http_server_func(fd, addr); + exit(EXIT_SUCCESS); + } + } + + close(fd); +} + +static void +rspamd_http_stop_servers(pid_t *sfd) +{ + guint i; + gint res; + + for (i = 0; i < nworkers; i++) { + kill(sfd[i], SIGTERM); + wait(&res); + } +} + +static void +rspamd_http_server_term(int fd, short what, void *arg) +{ + pid_t *sfd = arg; + + rspamd_http_stop_servers(sfd); + event_loopexit(NULL); +} + +int main(int argc, gchar **argv) +{ + GOptionContext *context; + GError *error = NULL; + struct ev_loop *ev_base; + GString *b32_key; + pid_t *sfd; + rspamd_inet_addr_t *addr; + struct event term_ev, int_ev; + struct in_addr ina = {INADDR_ANY}; + + rspamd_init_libs(); + + context = g_option_context_new( + "rspamd-http-server - test server for benchmarks"); + g_option_context_set_summary(context, + "Summary:\n Rspamd test HTTP server " RVERSION + "\n Release id: " RID); + g_option_context_add_main_entries(context, entries, NULL); + + if (!g_option_context_parse(context, &argc, &argv, &error)) { + rspamd_fprintf(stderr, "option parsing failed: %s\n", error->message); + g_error_free(error); + exit(EXIT_FAILURE); + } + + maps = g_hash_table_new(g_int_hash, g_int_equal); + + if (key == NULL) { + server_key = rspamd_keypair_new(RSPAMD_KEYPAIR_KEX, + openssl_mode ? RSPAMD_CRYPTOBOX_MODE_NIST : RSPAMD_CRYPTOBOX_MODE_25519); + b32_key = rspamd_keypair_print(server_key, + RSPAMD_KEYPAIR_PUBKEY | RSPAMD_KEYPAIR_BASE32); + rspamd_printf("key: %v\n", b32_key); + } + else { + /* TODO: add key loading */ + } + + if (cache_size > 0) { + c = rspamd_keypair_cache_new(cache_size); + } + + sfd = g_alloca(sizeof(*sfd) * nworkers); + addr = rspamd_inet_address_new(AF_INET, &ina); + rspamd_inet_address_set_port(addr, port); + rspamd_http_start_servers(sfd, addr); + + /* Just wait for workers */ + ev_base = event_init(); + + event_set(&term_ev, SIGTERM, EV_SIGNAL, rspamd_http_server_term, sfd); + event_base_set(ev_base, &term_ev); + event_add(&term_ev, NULL); + event_set(&int_ev, SIGINT, EV_SIGNAL, rspamd_http_server_term, sfd); + event_base_set(ev_base, &int_ev); + event_add(&int_ev, NULL); + + event_base_loop(ev_base, 0); + + return 0; +} diff --git a/utils/rspamd_stats.pl b/utils/rspamd_stats.pl new file mode 100755 index 0000000..9c5f2ac --- /dev/null +++ b/utils/rspamd_stats.pl @@ -0,0 +1,1018 @@ +#!/usr/bin/env perl + +use 5.010; +use Data::Dumper; +use Getopt::Long; +use Pod::Usage; +use Time::Local; +use IO::Handle; +use warnings; +use strict; + +my @symbols_search; +my @symbols_exclude; +my @symbols_bidirectional; +my @symbols_groups; +my @symbols_ignored; +my %symbols_mult; +my %groups; +my $reject_score = 15.0; +my $junk_score = 6.0; +my $diff_alpha = 0.1; +my $correlations = 0; +my $nrelated = 10; +my $log_file = ""; +my $search_pattern = ""; +my $startTime = ""; +my $endTime; +my $num_logs; +my $exclude_logs = 0; +my $man = 0; +my $json = 0; +my $help = 0; + +# Associate file extensions with decompressors +my %decompressor = ( + 'bz2' => 'bzip2 -cd', + 'gz' => 'gzip -cd', + 'xz' => 'xz -cd', + 'zst' => 'zstd -cd', +); + +GetOptions( + "reject-score|r=f" => \$reject_score, + "junk-score|j=f" => \$junk_score, + "symbol|s=s@" => \@symbols_search, + "symbol-bidir|S=s@" => \@symbols_bidirectional, + "exclude|X=s@" => \@symbols_exclude, + "ignore=s@" => \@symbols_ignored, + "group|g=s@" => \@symbols_groups, + "log|l=s" => \$log_file, + "mult=s" => \%symbols_mult, + "alpha-score|alpha|a=f" => \$diff_alpha, + "correlations|c" => \$correlations, + "nrelated=i" => \$nrelated, + "search-pattern=s" => \$search_pattern, + "start=s" => \$startTime, + "end=s" => \$endTime, + "num-logs|n=i" => \$num_logs, + "exclude-logs|x=i" => \$exclude_logs, + "json|j" => \$json, + "help|?" => \$help, + "man" => \$man +) or pod2usage(2); + +pod2usage(1) if $help; +pod2usage( -exitval => 0, -verbose => 2 ) if $man; + +# Global vars +my $total = 0; +my $total_spam = 0; +my $total_junk = 0; +my $junk_symbols = 0; +my $spam_symbols = 0; +my $ham_symbols = 0; +my $ham_spam_change = 0; +my $ham_junk_change = 0; +my %sym_res; +my $rspamd_log; +my $enabled = 0; +my $log_file_num = 1; +my $spinner_update_time = 0; + +my %action; +my %timeStamp; +my %scanTime = ( + max => 0, + total => 0, +); +my %bidir_match; + +foreach ( $startTime, $endTime ) { $_ = &normalized_time($_) } + +# Convert bidirectional symbols +foreach my $s (@symbols_bidirectional) { + $bidir_match{$s} = { + spam => "${s}_SPAM", + ham => "${s}_HAM", + }; + push @symbols_search, $s unless grep /^$s$/, @symbols_search; +} + +# Deal with groups +my $group_id = 0; +foreach my $g (@symbols_groups) { + my @symbols = split /,/, $g; + my $group_name = "group$group_id"; + + foreach my $s (@symbols) { + $groups{$s} = $group_name; + push @symbols_search, $s unless grep /^$s$/, @symbols_search; + } +} + +@symbols_search = '.*' + unless @symbols_search; + +if ( $log_file eq '-' || $log_file eq '' ) { + $rspamd_log = \*STDIN; + &ProcessLog(); +} +elsif ( -d "$log_file" ) { + my $log_dir = "$log_file"; + + my @logs = &GetLogfilesList($log_dir); + + # Process logs + foreach (@logs) { + my $ext = (/[^.]+\.?([^.]*?)$/)[0]; + my $dc = $decompressor{$ext} || 'cat'; + + open( $rspamd_log, "-|", "$dc $log_dir/$_" ) + or die "cannot execute $dc $log_dir/$_ : $!"; + + printf { interactive(*STDERR) } "\033[J Parsing log files: [%d/%d] %s\033[G", $log_file_num++, scalar @logs, + $_; + $spinner_update_time = 0; # Force spinner update + &spinner; + + &ProcessLog; + + close($rspamd_log) + or warn "cannot close $dc $log_dir/$_: $!"; + } + print { interactive(*STDERR) } "\033[J\033[G"; # Progress indicator clean-up +} +else { + my $ext = ( $log_file =~ /[^.]+\.?([^.]*?)$/ )[0]; + my $dc = $decompressor{$ext} || 'cat'; + open( $rspamd_log, "-|", "$dc $log_file" ) + or die "cannot execute $dc $log_file : $!"; + $spinner_update_time = 0; # Force spinner update + &spinner; + &ProcessLog(); +} + +my $total_ham = $total - ( $total_spam + $total_junk ); + +if ($json) { + print "{"; + &Summary(); + print '"symbols":{'; + &SymbolsStat(); + print "}}\n"; +} +else { + &SymbolsStat(); + &Summary(); +} + +exit; + +sub IsIgnored { + my ($sym) = @_; + + foreach my $ex (@symbols_ignored) { + if ( $sym =~ /^$ex$/ ) { + return 1; + } + } + + return 0; +} + +sub GenRelated { + my ( $htb, $target_sym ) = @_; + + my @result; + my $i = 0; + foreach my $sym ( sort { $htb->{$b} <=> $htb->{$a} } keys %{$htb} ) { + if ( $sym ne $target_sym ) { + my @elt = ( $sym, $htb->{$sym} ); + push @result, \@elt; + $i++; + } + + last if $i > $nrelated; + } + + return \@result; +} + +sub StringifyRelated { + my ( $ar, $total ) = @_; + return + join( "\n", ( map { sprintf "\t%s(%s: %.1f%%)", $_->[0], $_->[1], $_->[1] / ( $total * 1.0 ) * 100.0 } @{$ar} ) ); +} + +sub SymbolsStat { + if ( $total > 0 ) { + my $has_comma = 0; + while ( my ( $s, $r ) = each(%sym_res) ) { + if ( $r->{hits} > 0 ) { + my $th = $r->{hits}; + my $sh = $r->{spam_hits}; + my $jh = $r->{junk_hits}; + my $hh = $r->{hits} - $sh - $jh; + my ( $htp, $stp, $jtp ); + $htp = $hh * 100.0 / $total_ham if $total_ham != 0; + $stp = $sh * 100.0 / $total_spam if $total_spam != 0; + $jtp = $jh * 100.0 / $total_junk if $total_junk != 0; + + if ($json) { + if ($has_comma) { + print ","; + } + else { + $has_comma = 1; + } + print "\"$s\":{"; + JsonObjectElt( "avg_weight", $r->{'weight'}, "%.4f" ); + print ","; + JsonObjectElt( "hits", $th, "%d" ); + print ","; + JsonObjectElt( "hits_percentage", $th / $total, "%.4f" ); + print ","; + JsonObjectElt( "spam_hits", $sh, "%d" ); + print ","; + JsonObjectElt( "spam_to_total", $sh / $th, "%.4f" ); + print ","; + JsonObjectElt( "spam_percentage", $stp / 100.0 || 0, "%.4f" ); + print ","; + JsonObjectElt( "ham_hits", $hh, "%d" ); + print ","; + JsonObjectElt( "ham_to_total", $hh / $th, "%.4f" ); + print ","; + JsonObjectElt( "ham_percentage", $htp / 100.0 || 0, "%.4f" ); + print ","; + JsonObjectElt( "junk_hits", $jh, "%d" ); + print ","; + JsonObjectElt( "junk_to_total", $jh / $th, "%.4f" ); + print ","; + JsonObjectElt( "junk_percentage", $jtp / 100.0 || 0, "%.4f" ); + } + else { + printf "%s avg. weight %.3f, hits %d(%.3f%%): + Ham %7.3f%%, %6d/%-6d (%7.3f%%) + Spam %7.3f%%, %6d/%-6d (%7.3f%%) + Junk %7.3f%%, %6d/%-6d (%7.3f%%) +", $s, $r->{weight} / $r->{hits}, $th, ( $th / $total * 100 ), + ( $hh / $th * 100 ), $hh, $total_ham, ( $htp or 0 ), + ( $sh / $th * 100 ), $sh, $total_spam, ( $stp or 0 ), + ( $jh / $th * 100 ), $jh, $total_junk, ( $jtp or 0 ); + } + my ( $schp, $jchp ); + $schp = $r->{spam_change} / $total_spam * 100.0 if $total_spam; + $jchp = $r->{junk_change} / $total_junk * 100.0 if $total_junk; + + if ( $r->{weight} != 0 ) { + if ( !$json ) { + if ( $r->{weight} > 0 ) { + printf " +Spam changes (ham/junk -> spam): %6d/%-6d (%7.3f%%) +Spam changes / total spam hits: %6d/%-6d (%7.3f%%) +Junk changes (ham -> junk): %6d/%-6d (%7.3f%%) +Junk changes / total junk hits: %6d/%-6d (%7.3f%%) +", + $r->{spam_change}, $th, ( $r->{spam_change} / $th * 100 ), + $r->{spam_change}, $total_spam, ( $schp or 0 ), + $r->{junk_change}, $th, ( $r->{junk_change} / $th * 100 ), + $r->{junk_change}, $total_junk, ( $jchp or 0 ); + } + else { + printf " +Spam changes (spam -> junk/ham): %6d/%-6d (%7.3f%%) +Spam changes / total spam hits : %6d/%-6d (%7.3f%%) +Junk changes (junk -> ham) : %6d/%-6d (%7.3f%%) +Junk changes / total junk hits : %6d/%-6d (%7.3f%%) +", + $r->{spam_change}, $th, ( $r->{spam_change} / $th * 100 ), + $r->{spam_change}, $total_spam, ( $schp or 0 ), + $r->{junk_change}, $th, ( $r->{junk_change} / $th * 100 ), + $r->{junk_change}, $total_junk, ( $jchp or 0 ); + } + } + else { + print ","; + JsonObjectElt( "spam_change", $r->{spam_change}, "%.4f" ); + print ","; + JsonObjectElt( "junk_change", $r->{junk_change}, "%.4f" ); + } + } + + if ($correlations) { + + my $spam_related = GenRelated( $r->{symbols_met_spam}, $s ); + my $junk_related = GenRelated( $r->{symbols_met_junk}, $s ); + my $ham_related = GenRelated( $r->{symbols_met_ham}, $s ); + + if ( !$json ) { + print "Correlations report:\n"; + + while ( my ( $cs, $hits ) = each %{ $r->{corr} } ) { + my $corr_prob = $r->{'hits'} / $total; + my $merged_hits = 0; + if ( $r->{symbols_met_spam}->{$cs} ) { + $merged_hits += $r->{symbols_met_spam}->{$cs}; + } + if ( $r->{symbols_met_junk}->{$cs} ) { + $merged_hits += $r->{symbols_met_junk}->{$cs}; + } + if ( $r->{symbols_met_ham}->{$cs} ) { + $merged_hits += $r->{symbols_met_ham}->{$cs}; + } + + if ( $merged_hits > 0 ) { + printf "Probability of %s when %s fires: %.3f\n", $cs, $s, + ( ( $merged_hits / $total ) / $corr_prob ); + } + } + + print "Related symbols report:\n"; + printf "Top related in spam:\n %s\n", StringifyRelated( $spam_related, $r->{spam_hits} ); + printf "Top related in junk:\n %s\n", StringifyRelated( $junk_related, $r->{junk_hits} ); + printf "Top related in ham:\n %s\n", + StringifyRelated( $ham_related, $r->{hits} - $r->{spam_hits} - $r->{junk_hits} ); + } + else { + print ","; + print "\"correllations\":{"; + + my $has_comma_ = 0; + while ( my ( $cs, $hits ) = each %{ $r->{corr} } ) { + if ($has_comma_) { + print ","; + } + else { + $has_comma_ = 1; + } + my $corr_prob = $hits / $total; + my $sym_prob = $r->{hits} / $total; + JsonObjectElt( $cs, ( $corr_prob / $sym_prob ), "%.4f" ); + } + + print "}"; + } + } + + print "}" if $json; + } + else { + print "Symbol $s has not been met\n" if !$json; + } + + print '-' x 80 . "\n" if !$json; + } + } +} + +sub Summary() { + if ( !$json ) { + print " +=== Summary ", '=' x 68, " +Messages scanned: $total"; + printf " [ %s / %s ] +", $timeStamp{'start'}, $timeStamp{'end'} + if defined $timeStamp{'start'}; + say ''; + printf "%11s: %6.2f%%, %d\n", $_, 100 * $action{$_} / $total, $action{$_} for sort keys %action; + say ''; + printf "scan time min/avg/max = %.2f/%.2f/%.2f s +", $scanTime{'min'} / 1000, ($total) ? $scanTime{'total'} / $total / 1000 : undef, $scanTime{'max'} / 1000 + if exists $scanTime{'min'}; + say '=' x 80; + } + else { + JsonObjectElt( "total", $total, "%d" ); + print ","; + + if ( defined $timeStamp{'start'} ) { + JsonObjectElt( "start", $timeStamp{'start'} ); + print ","; + } + + if ( defined $timeStamp{'end'} ) { + JsonObjectElt( "end", $timeStamp{'end'} ); + print ","; + } + + print "\"actions\":{"; + + my $has_comma = 0; + foreach my $a ( sort keys %action ) { + if ($has_comma) { + print ","; + } + else { + $has_comma = 1; + } + JsonObjectElt( $a, $action{$a}, "%d" ); + } + print "},"; + } +} + +sub ProcessRelated { + my ( $symbols, $target, $source ) = @_; + + foreach my $s ( @{$symbols} ) { + $s =~ /^([^\(]+)(\(([^\)]+)\))?/; + my $sym_name = $1; + my $sym_score = 0; + + if ( $groups{$sym_name} ) { + $sym_name = $groups{$sym_name}; + } + + next if ( $source eq $sym_name ); + + next if IsIgnored($sym_name); + + if ($2) { + $sym_score = $3 * ($symbols_mult{$sym_name} or 1.0); + + if ( abs($sym_score) < $diff_alpha ) { + next; + } + + my $bm = $bidir_match{$sym_name}; + if ($bm) { + if ( $sym_score >= 0 ) { + $sym_name = $bm->{'spam'}; + } + else { + $sym_name = $bm->{'ham'}; + } + } + } + + if ( exists( $target->{$sym_name} ) ) { + $target->{$sym_name}++; + } + else { + $target->{$sym_name} = 1; + } + } +} + +sub ProcessLog { + my ( $ts_format, @line ) = &log_time_format($rspamd_log); + + while () { + last if eof $rspamd_log; + $_ = (@line) ? shift @line : <$rspamd_log>; + + if ( !$enabled && ( $search_pattern eq "" || /$search_pattern/ ) ) { + $enabled = 1; + } + + next if !$enabled; + + if (/^.*rspamd_task_write_log.*$/) { + &spinner; + my $ts; + if ( $ts_format eq 'syslog' ) { + $ts = syslog2iso( join ' ', ( split /\s+/ )[ 0 .. 2 ] ); + } + elsif ( $ts_format eq 'syslog5424' ) { + /^([0-9-]+)T([0-9:]+)/; + $ts = "$1 $2"; + } + else { + $ts = join ' ', ( split /\s+/ )[ 0 .. 1 ]; + } + + next if ( $ts lt $startTime ); + next if ( defined $endTime && $ts gt $endTime ); + + if ( $_ !~ + /\(([^()]+)\): \[(NaN|-?\d+(?:\.\d+)?)\/(-?\d+(?:\.\d+)?)\]\s+\[([^\]]+)\].+? time: (\d+\.\d+)ms/ ) + { + #print "BAD: $_\n"; + next; + } + + my @symbols = split /(?:\{[^}]*\})?(?:$|,)/, $4; + my $scan_time = $5; + my $act = $1; + my $score = $2 * 1.0; + my $skip = 0; + + foreach my $ex (@symbols_exclude) { + my @found = grep { /^$ex/ } @symbols; + + if ( scalar(@found) > 0 ) { + $skip = 1; + last; + } + } + + next if ( $skip != 0 ); + + if ( defined( $timeStamp{'end'} ) ) { + $timeStamp{'end'} = $ts if ( $ts gt $timeStamp{'end'} ); + } + else { + $timeStamp{'end'} = $ts; + } + + if ( defined( $timeStamp{'start'} ) ) { + $timeStamp{'start'} = $ts if ( $ts lt $timeStamp{'start'} ); + } + else { + $timeStamp{'start'} = $ts; + } + + $scanTime{'min'} = $scan_time if ( !exists $scanTime{'min'} || $scanTime{'min'} > $scan_time ); + $scanTime{'max'} = $scan_time if ( $scanTime{'max'} < $scan_time ); + $scanTime{'total'} += $scan_time; + + $action{$act}++; + $total++; + + if ( $score >= $reject_score ) { + $total_spam++; + } + elsif ( $score >= $junk_score ) { + $total_junk++; + } + + my @sym_names; + + foreach my $s (@symbols_search) { + my @selected = grep /$s/, @symbols; + + if ( scalar(@selected) > 0 ) { + + foreach my $sym (@selected) { + $sym =~ /^([^\(]+)(\(([^\)]+)\))?/; + my $sym_name = $1; + my $sym_score = 0; + my $orig_name = $sym_name; + + if ($2) { + $sym_score = $3 * ($symbols_mult{$sym_name} or 1.0); + + if ( abs($sym_score) < $diff_alpha ) { + next; + } + + my $bm = $bidir_match{$sym_name}; + if ($bm) { + if ( $sym_score >= 0 ) { + $sym_name = $bm->{'spam'}; + } + else { + $sym_name = $bm->{'ham'}; + } + } + } + + next if $orig_name !~ /^$s/; + + if ( $groups{$s} ) { + + # Replace with group + $sym_name = $groups{$s}; + } + + push @sym_names, $sym_name; + + if ( !$sym_res{$sym_name} ) { + $sym_res{$sym_name} = { + hits => 0, + spam_hits => 0, + junk_hits => 0, + spam_change => 0, + junk_change => 0, + weight => 0, + corr => {}, + symbols_met_spam => {}, + symbols_met_ham => {}, + symbols_met_junk => {}, + }; + } + + my $r = $sym_res{$sym_name}; + + $r->{hits}++; + $r->{weight} += $sym_score; + my $is_spam = 0; + my $is_junk = 0; + + if ( $score >= $reject_score ) { + $is_spam = 1; + $r->{spam_hits}++; + if ($correlations) { + ProcessRelated( \@symbols, $r->{symbols_met_spam}, $sym_name ); + } + } + elsif ( $score >= $junk_score ) { + $is_junk = 1; + $r->{junk_hits}++; + if ($correlations) { + ProcessRelated( \@symbols, $r->{symbols_met_junk}, $sym_name ); + } + } + else { + if ($correlations) { + ProcessRelated( \@symbols, $r->{symbols_met_ham}, $sym_name ); + } + } + + if ( $sym_score != 0 ) { + my $score_without = $score - $sym_score; + + if ( $sym_score > 0 ) { + if ( $is_spam && $score_without < $reject_score ) { + $r->{spam_change}++; + } + if ( $is_junk && $score_without < $junk_score ) { + $r->{junk_change}++; + } + } + else { + if ( !$is_spam && $score_without >= $reject_score ) { + $r->{spam_change}++; + } + if ( !$is_junk && $score_without >= $junk_score ) { + $r->{junk_change}++; + } + } + } + } # End foreach symbols selected + } + } + + if ($correlations) { + foreach my $sym (@sym_names) { + next if IsIgnored($sym); + my $r = $sym_res{$sym}; + + foreach my $corr_sym (@sym_names) { + if ( $corr_sym ne $sym ) { + if ( $r->{'corr'}->{$corr_sym} ) { + $r->{'corr'}->{$corr_sym}++; + } + else { + $r->{'corr'}->{$corr_sym} = 1; + } + } + } + } # End of correlations check + } + } + } +} + +sub JsonObjectElt() { + my ( $k, $v ) = @_; + my $f = defined $_[2] ? $_[2] : '%s'; + + if ( $f eq "%s" ) { + $f = "\"%s\""; + } + + printf "\"%s\":$f", $k, $v; +} + +sub GetLogfilesList { + my ($dir) = @_; + opendir( my $fh, $dir ) or die $!; + + my $pattern = join( '|', keys %decompressor ); + my $re = qr/\.[0-9]+(?:\.(?:$pattern))?/; + + # Add unnumbered logs first + my @logs = + grep { -f "$dir/$_" && !/$re/ } readdir($fh); + + # Add numbered logs + rewinddir($fh); + push( @logs, ( sort numeric ( grep { -f "$dir/$_" && /$re/ } readdir($fh) ) ) ); + + closedir($fh); + + # Select required logs and revers their order + @logs = + reverse splice( @logs, $exclude_logs, $num_logs ||= @logs - $exclude_logs ); + + # Loop through array printing out filenames + print { interactive(*STDERR) } "\nLog files to process:\n"; + foreach my $file (@logs) { + print { interactive(*STDERR) } " $file\n"; + } + print { interactive(*STDERR) } "\n"; + + return @logs; +} + +sub log_time_format { + my $fh = shift; + my ( $format, $line ); + while (<$fh>) { + $line = $_; + + # 2017-08-08 00:00:01 #66984( + # 2017-08-08 00:00:01.001 #66984( + if (/^\d{4}-\d\d-\d\d \d\d:\d\d:\d\d(\.\d{3,5})? #\d+\(/) { + $format = 'rspamd'; + last; + } + + # Aug 8 00:02:50 #66986( + elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d #\d+\(/) { + $format = 'syslog'; + last; + } + + # Aug 8 00:02:50 hostname rspamd[66986] + elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d \S+ rspamd\[\d+\]/) { + $format = 'syslog'; + last; + } + + # 2018-04-16T06:25:46.012590+02:00 rspamd rspamd[12968] + elsif (/\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,6})?(Z|[-+]\d{2}:\d{2}) \S+ rspamd\[\d+\]/) { + $format = 'syslog5424'; + last; + } + + # Skip newsyslog messages + # Aug 8 00:00:00 hostname newsyslog[63284]: logfile turned over + elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d\ \S+ newsyslog\[\d+\]: logfile turned over$/) { + next; + } + + # Skip journalctl messages + # -- Logs begin at Mon 2018-01-15 11:16:24 MSK, end at Fri 2018-04-27 09:10:30 MSK. -- + elsif ( +/^-- Logs begin at \w{3} \d{4}-\d\d-\d\d \d\d:\d\d:\d\d [A-Z]{3}, end at \w{3} \d{4}-\d\d-\d\d \d\d:\d\d:\d\d [A-Z]{3}\. --$/ + ) + { + next; + } + else { + print "Unknown log format\n"; + exit 1; + } + } + return ( $format, $line ); +} + +sub normalized_time { + return + if !defined( $_ = shift ); + + /^\d\d(?::\d\d){0,2}$/ + ? sprintf '%04d-%02d-%02d %s', 1900 + (localtime)[5], 1 + (localtime)[4], (localtime)[3], $_ + : $_; +} + +sub numeric { + $a =~ /\.(\d+)\./; + my $a_num = $1; + $b =~ /\.(\d+)\./; + my $b_num = $1; + + $a_num <=> $b_num; +} + +sub spinner { + my @spinner = qw{/ - \ |}; + return + if ( ( time - $spinner_update_time ) < 1 ); + $spinner_update_time = time; + printf { interactive(*STDERR) } "%s\r", $spinner[ $spinner_update_time % @spinner ]; + select()->flush(); +} + +# Convert syslog timestamp to "ISO 8601 like" format +# using current year as syslog does not record the year (nor the timezone) +# or the last year if the guessed time is in the future. +sub syslog2iso { + my %month_map; + @month_map{qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)} = 0 .. 11; + + my ( $month, @t ) = $_[0] =~ m/^(\w{3}) \s\s? (\d\d?) \s (\d\d):(\d\d):(\d\d)/x; + my $epoch = + timelocal( ( reverse @t ), $month_map{$month}, 1900 + (localtime)[5] ); + sprintf '%04d-%02d-%02d %02d:%02d:%02d', 1900 + (localtime)[5] - ( $epoch > time ), $month_map{$month} + 1, @t; +} + +### Imported from IO::Interactive 1.022 Perl module +sub is_interactive { + ## no critic (ProhibitInteractiveTest) + + my ($out_handle) = ( @_, select ); # Default to default output handle + + # Not interactive if output is not to terminal... + return 0 if not -t $out_handle; + + # If *ARGV is opened, we're interactive if... + if ( tied(*ARGV) or defined( fileno(ARGV) ) ) { # this is what 'Scalar::Util::openhandle *ARGV' boils down to + + # ...it's currently opened to the magic '-' file + return -t *STDIN if defined $ARGV && $ARGV eq '-'; + + # ...it's at end-of-file and the next file is the magic '-' file + return @ARGV > 0 && $ARGV[0] eq '-' && -t *STDIN if eof *ARGV; + + # ...it's directly attached to the terminal + return -t *ARGV; + } + + # If *ARGV isn't opened, it will be interactive if *STDIN is attached + # to a terminal. + else { + return -t *STDIN; + } +} + +### Imported from IO::Interactive 1.022 Perl module +local ( *DEV_NULL, *DEV_NULL2 ); +my $dev_null; + +BEGIN { + pipe *DEV_NULL, *DEV_NULL2 + or die "Internal error: can't create null filehandle"; + $dev_null = \*DEV_NULL; +} + +### Imported from IO::Interactive 1.022 Perl module +sub interactive { + my ($out_handle) = ( @_, \*STDOUT ); # Default to STDOUT + return &is_interactive ? $out_handle : $dev_null; +} + +__END__ + +=head1 NAME + +rspamd_stats - analyze Rspamd rules by parsing log files + +=head1 SYNOPSIS + +rspamd_stats [options] [--symbol=SYM1 [--symbol=SYM2...]] [--log file] + + Options: + --log=file log file or directory to read (stdin by default) + --reject-score=score set reject threshold (15 by default) + --junk-score=score set junk score (6.0 by default) + --symbol=sym check specified symbol (perl regexps, '.*' by default) + --alpha-score=score set ignore score for symbols (0.1 by default) + --correlations enable correlations report + --nrelated=integer show that amount of related symbols (10 by default) + --search-pattern do not process input unless the desired pattern is found + --start starting time (oldest) for log parsing + --end ending time (newest) for log parsing + --num-logs=integer number of recent logfiles to analyze (all files in the directory by default) + --exclude-logs=integer number of latest logs to exclude (0 by default) + --json print json output instead of human readable + --help brief help message + --mult=sym=number multiply symbol score + --man full documentation + +=head1 OPTIONS + +=over 8 + +=item B<--log> + +Specifies log file or directory to read data from. If a directory is specified B<rspamd_stats> analyses files in the +directory including known compressed file types. Number of log files can be limited using B<--num-logs> and +B<--exclude-logs> options. This assumes that files in the log directory have B<newsyslog(8)>- or B<logrotate(8)>-like +name format with numeric indexes. Files without indexes (generally it is merely one file) are considered the most +recent and files with lower indexes are considered newer. + +=item B<--reject-score> + +Specifies the reject (spam) threshold. + +=item B<--junk-score> + +Specifies the junk (add header or rewrite subject) threshold. + +=item B<--alpha-score> + +Specifies the minimum score for a symbol to be considered by this script. + +=item B<--symbol> + +Add symbol or pattern (pcre format) to analyze. + +=item B<--num-logs> + +If set, limits number of analyzed logfiles in the directory to the specified value. + +=item B<--exclude-logs> + +Number of latest logs to exclude (0 by default). + +=item B<--correlations> + +Additionally print correlation rate for each symbol displayed. This routine calculates merely paired correlations +between symbols. + +=item B<--search-pattern> + +Do not process input unless finding the specified regular expression. Useful to skip logs to a certain position. + +=item B<--exclude> + +Exclude log lines if certain symbols are fired (e.g. GTUBE). You may specify this option multiple time to skip multiple +symbols. + +=item B<--start> + +Select log entries after this time. Format: C<YYYY-MM-DD HH:MM:SS> (can be truncated to any desired accuracy). If used +with B<--end> select entries between B<--start> and B<--end>. The omitted date defaults to the current date if you +supply the time. + +=item B<--end> + +Select log entries before this time. Format: C<YYYY-MM-DD HH:MM:SS> (can be truncated to any desired accuracy). If used +with B<--start> select entries between B<--start> and B<--end>. The omitted date defaults to the current date if you +supply the time. + +=item B<--mult=symbol=number> + +Multiplies score for the named symbol by the provided multiplier. + +=item B<--help> + +Print a brief help message and exits. + +=item B<--man> + +Prints the manual page and exits. + +=back + +=head1 DESCRIPTION + +B<rspamd_stats> will read the given log file (or standard input) and provide statistics for the specified symbols: + + Symbol: BAYES_SPAM (weight 3.763) (381985 hits, 26.827%) + Ham hits: 184557 (48.315%), total ham: 1095487 (ham with BAYES_SPAM: 16.847%) + Spam hits: 15134 (3.962%), total spam: 16688 (spam with BAYES_SPAM: 90.688%) + Junk hits: 182294 (47.723%), total junk: 311699 (junk with BAYES_SPAM: 58.484%) + Spam changes (ham/junk -> spam): 7026 (1.839%), total percentage (changes / spam hits): 42.102% + Junk changes (ham -> junk): 95192 (24.920%), total percentage (changes / junk hits): 30.540% + +Where there are the following attributes: + +=over 4 + +=item * + +B<Weight>: average score for a symbols + +=item * + +B<Total hits>: total number of hits and percentage of symbol hits divided by total number of messages + +=item * + +B<HAM hits>: provides the following information about B<HAM> messages with the specified symbol (from left to right): + +=over 4 + +=item 1. + +B<total symbol hits>: number of messages that has this symbol and are B<HAM> + +=item 2. + +B<ham percentage>: number of symbol hits divided by overall B<HAM> messages count + +=item 3. + +B<total ham hits>: overall number of B<HAM> messages + +=item 4. + +B<ham with symbol percentage>: percentage of number of hits with specified symbol in B<HAM> messages divided by total +number of B<HAM> messages. + +=back + +=item * + +B<SPAM hits>: provides the following information about B<SPAM> messages - same as previous but for B<SPAM> class. + +=item * + +B<Junk hits>: provides the following information about B<Junk> messages - same as previous but for B<JUNK> class. + +=item * + +B<Spam changes>: displays data about how much messages switched their class because of the specific symbol weight. + +=item * + +B<Junk changes>: displays data about how much messages switched their class because of the specific symbol weight. + +=back + +=cut diff --git a/utils/sa_trivial_convert.lua b/utils/sa_trivial_convert.lua new file mode 100644 index 0000000..2ea53be --- /dev/null +++ b/utils/sa_trivial_convert.lua @@ -0,0 +1,443 @@ +local fun = require "fun" +local rspamd_logger = require "rspamd_logger" +local util = require "rspamd_util" +local lua_util = require "lua_util" +local rspamd_regexp = require "rspamd_regexp" +local ucl = require "ucl" + +local complicated = {} +local rules = {} +local scores = {} + +local function words_to_re(words, start) + return table.concat(fun.totable(fun.drop_n(start, words)), " "); +end + +local function split(str, delim) + local result = {} + + if not delim then + delim = '[^%s]+' + end + + for token in string.gmatch(str, delim) do + table.insert(result, token) + end + + return result +end + +local function handle_header_def(hline, cur_rule) + --Now check for modifiers inside header's name + local hdrs = split(hline, '[^|]+') + local hdr_params = {} + local cur_param = {} + -- Check if an re is an ordinary re + local ordinary = true + + for _,h in ipairs(hdrs) do + if h == 'ALL' or h == 'ALL:raw' then + ordinary = false + else + local args = split(h, '[^:]+') + cur_param['strong'] = false + cur_param['raw'] = false + cur_param['header'] = args[1] + + if args[2] then + -- We have some ops that are required for the header, so it's not ordinary + ordinary = false + end + + fun.each(function(func) + if func == 'addr' then + cur_param['function'] = function(str) + local addr_parsed = util.parse_mail_address(str) + local ret = {} + if addr_parsed then + for _,elt in ipairs(addr_parsed) do + if elt['addr'] then + table.insert(ret, elt['addr']) + end + end + end + + return ret + end + elseif func == 'name' then + cur_param['function'] = function(str) + local addr_parsed = util.parse_mail_address(str) + local ret = {} + if addr_parsed then + for _,elt in ipairs(addr_parsed) do + if elt['name'] then + table.insert(ret, elt['name']) + end + end + end + + return ret + end + elseif func == 'raw' then + cur_param['raw'] = true + elseif func == 'case' then + cur_param['strong'] = true + else + rspamd_logger.warnx(rspamd_config, 'Function %1 is not supported in %2', + func, cur_rule['symbol']) + end + end, fun.tail(args)) + + -- Some header rules require splitting to check of multiple headers + if cur_param['header'] == 'MESSAGEID' then + -- Special case for spamassassin + ordinary = false + elseif cur_param['header'] == 'ToCc' then + ordinary = false + else + table.insert(hdr_params, cur_param) + end + end + + cur_rule['ordinary'] = ordinary and #hdr_params <= 1 + cur_rule['header'] = hdr_params + end +end + +local function process_sa_conf(f) + local cur_rule = {} + local valid_rule = false + + local function insert_cur_rule() + if not rules[cur_rule.type] then + rules[cur_rule.type] = {} + end + + local target = rules[cur_rule.type] + + if cur_rule.type == 'header' then + if not cur_rule.header[1].header then + rspamd_logger.errx(rspamd_config, 'bad rule definition: %1', cur_rule) + return + end + if not target[cur_rule.header[1].header] then + target[cur_rule.header[1].header] = {} + end + target = target[cur_rule.header[1].header] + end + + if not cur_rule['symbol'] then + rspamd_logger.errx(rspamd_config, 'bad rule definition: %1', cur_rule) + return + end + target[cur_rule['symbol']] = cur_rule + cur_rule = {} + valid_rule = false + end + + local function parse_score(words) + if #words == 3 then + -- score rule <x> + return tonumber(words[3]) + elseif #words == 6 then + -- score rule <x1> <x2> <x3> <x4> + -- we assume here that bayes and network are enabled and select <x4> + return tonumber(words[6]) + else + rspamd_logger.errx(rspamd_config, 'invalid score for %1', words[2]) + end + + return 0 + end + + local skip_to_endif = false + local if_nested = 0 + for l in f:lines() do + (function () + l = lua_util.rspamd_str_trim(l) + -- Replace bla=~/re/ with bla =~ /re/ (#2372) + l = l:gsub('([^%s])%s*([=!]~)%s*([^%s])', '%1 %2 %3') + + if string.len(l) == 0 or string.sub(l, 1, 1) == '#' then + return + end + + -- Unbalanced if/endif + if if_nested < 0 then if_nested = 0 end + if skip_to_endif then + if string.match(l, '^endif') then + if_nested = if_nested - 1 + + if if_nested == 0 then + skip_to_endif = false + end + elseif string.match(l, '^if') then + if_nested = if_nested + 1 + elseif string.match(l, '^else') then + -- Else counterpart for if + skip_to_endif = false + end + table.insert(complicated, l) + return + else + if string.match(l, '^ifplugin') then + skip_to_endif = true + if_nested = if_nested + 1 + table.insert(complicated, l) + elseif string.match(l, '^if !plugin%(') then + skip_to_endif = true + if_nested = if_nested + 1 + table.insert(complicated, l) + elseif string.match(l, '^if') then + -- Unknown if + skip_to_endif = true + if_nested = if_nested + 1 + table.insert(complicated, l) + elseif string.match(l, '^else') then + -- Else counterpart for if + skip_to_endif = true + table.insert(complicated, l) + elseif string.match(l, '^endif') then + if_nested = if_nested - 1 + table.insert(complicated, l) + end + end + + -- Skip comments + local words = fun.totable(fun.take_while( + function(w) return string.sub(w, 1, 1) ~= '#' end, + fun.filter(function(w) + return w ~= "" end, + fun.iter(split(l))))) + + if words[1] == "header" then + -- header SYMBOL Header ~= /regexp/ + if valid_rule then + insert_cur_rule() + end + if words[4] and (words[4] == '=~' or words[4] == '!~') then + cur_rule['type'] = 'header' + cur_rule['symbol'] = words[2] + + if words[4] == '!~' then + table.insert(complicated, l) + return + end + + cur_rule['re_expr'] = words_to_re(words, 4) + local unset_comp = string.find(cur_rule['re_expr'], '%s+%[if%-unset:') + if unset_comp then + table.insert(complicated, l) + return + end + + cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr']) + + if not cur_rule['re'] then + rspamd_logger.warnx(rspamd_config, "Cannot parse regexp '%1' for %2", + cur_rule['re_expr'], cur_rule['symbol']) + table.insert(complicated, l) + return + else + handle_header_def(words[3], cur_rule) + if not cur_rule['ordinary'] then + table.insert(complicated, l) + return + end + end + + valid_rule = true + else + table.insert(complicated, l) + return + end + elseif words[1] == "body" then + -- body SYMBOL /regexp/ + if valid_rule then + insert_cur_rule() + end + + cur_rule['symbol'] = words[2] + if words[3] and (string.sub(words[3], 1, 1) == '/' + or string.sub(words[3], 1, 1) == 'm') then + cur_rule['type'] = 'sabody' + cur_rule['re_expr'] = words_to_re(words, 2) + cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr']) + if cur_rule['re'] then + + valid_rule = true + end + else + -- might be function + table.insert(complicated, l) + return + end + elseif words[1] == "rawbody" then + -- body SYMBOL /regexp/ + if valid_rule then + insert_cur_rule() + end + + cur_rule['symbol'] = words[2] + if words[3] and (string.sub(words[3], 1, 1) == '/' + or string.sub(words[3], 1, 1) == 'm') then + cur_rule['type'] = 'sarawbody' + cur_rule['re_expr'] = words_to_re(words, 2) + cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr']) + if cur_rule['re'] then + valid_rule = true + end + else + table.insert(complicated, l) + return + end + elseif words[1] == "full" then + -- body SYMBOL /regexp/ + if valid_rule then + insert_cur_rule() + end + + cur_rule['symbol'] = words[2] + + if words[3] and (string.sub(words[3], 1, 1) == '/' + or string.sub(words[3], 1, 1) == 'm') then + cur_rule['type'] = 'message' + cur_rule['re_expr'] = words_to_re(words, 2) + cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr']) + cur_rule['raw'] = true + if cur_rule['re'] then + valid_rule = true + end + else + table.insert(complicated, l) + return + end + elseif words[1] == "uri" then + -- uri SYMBOL /regexp/ + if valid_rule then + insert_cur_rule() + end + cur_rule['type'] = 'uri' + cur_rule['symbol'] = words[2] + cur_rule['re_expr'] = words_to_re(words, 2) + cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr']) + if cur_rule['re'] and cur_rule['symbol'] then + valid_rule = true + else + table.insert(complicated, l) + return + end + elseif words[1] == "meta" then + -- meta SYMBOL expression + if valid_rule then + insert_cur_rule() + end + table.insert(complicated, l) + return + elseif words[1] == "describe" and valid_rule then + cur_rule['description'] = words_to_re(words, 2) + elseif words[1] == "score" then + scores[words[2]] = parse_score(words) + else + table.insert(complicated, l) + return + end + end)() + end + if valid_rule then + insert_cur_rule() + end +end + +for _,matched in ipairs(arg) do + local f = io.open(matched, "r") + if f then + rspamd_logger.messagex(rspamd_config, 'loading SA rules from %s', matched) + process_sa_conf(f) + else + rspamd_logger.errx(rspamd_config, "cannot open %1", matched) + end +end + +local multimap_conf = {} + +local function handle_rule(what, syms, hdr) + local mtype + local filter + local fname + local header + local sym = what:upper() + if what == 'sabody' then + mtype = 'content' + fname = 'body_re.map' + filter = 'oneline' + elseif what == 'sarawbody' then + fname = 'raw_body_re.map' + mtype = 'content' + filter = 'rawtext' + elseif what == 'full' then + fname = 'full_re.map' + mtype = 'content' + filter = 'full' + elseif what == 'uri' then + fname = 'uri_re.map' + mtype = 'url' + filter = 'full' + elseif what == 'header' then + fname = ('hdr_' .. hdr .. '_re.map'):lower() + mtype = 'header' + header = hdr + sym = sym .. '_' .. hdr:upper() + else + rspamd_logger.errx('unknown type: %s', what) + return + end + local conf = { + type = mtype, + filter = filter, + symbol = 'SA_MAP_AUTO_' .. sym, + regexp = true, + map = fname, + header = header, + symbols = {} + } + local re_file = io.open(fname, 'w') + + for k,r in pairs(syms) do + local score = 0.0 + if scores[k] then + score = scores[k] + end + re_file:write(string.format('/%s/ %s:%f\n', tostring(r.re), k, score)) + table.insert(conf.symbols, k) + end + + re_file:close() + + multimap_conf[sym:lower()] = conf + rspamd_logger.messagex('stored %s regexp in %s', sym:lower(), fname) +end + +for k,v in pairs(rules) do + if k == 'header' then + for h,r in pairs(v) do + handle_rule(k, r, h) + end + else + handle_rule(k, v) + end +end + +local out = ucl.to_format(multimap_conf, 'ucl') +local mmap_conf = io.open('auto_multimap.conf', 'w') +mmap_conf:write(out) +mmap_conf:close() +rspamd_logger.messagex('stored multimap conf in %s', 'auto_multimap.conf') + +local sa_remain = io.open('auto_sa.conf', 'w') +fun.each(function(l) + sa_remain:write(l) + sa_remain:write('\n') +end, fun.filter(function(l) return not string.match(l, '^%s+$') end, complicated)) +sa_remain:close() +rspamd_logger.messagex('stored sa remains conf in %s', 'auto_sa.conf') |