summaryrefslogtreecommitdiffstats
path: root/utils
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-10 21:30:40 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-10 21:30:40 +0000
commit133a45c109da5310add55824db21af5239951f93 (patch)
treeba6ac4c0a950a0dda56451944315d66409923918 /utils
parentInitial commit. (diff)
downloadrspamd-upstream.tar.xz
rspamd-upstream.zip
Adding upstream version 3.8.1.upstream/3.8.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'utils')
-rw-r--r--utils/CMakeLists.txt22
-rw-r--r--utils/asn.pl331
-rw-r--r--utils/base64.c89
-rw-r--r--utils/cgp_rspamd.pl357
-rw-r--r--utils/classifier_test.pl539
-rwxr-xr-xutils/fann_train.pl247
-rw-r--r--utils/rspamd_http_bench.c411
-rw-r--r--utils/rspamd_http_server.c300
-rwxr-xr-xutils/rspamd_stats.pl1018
-rw-r--r--utils/sa_trivial_convert.lua443
10 files changed, 3757 insertions, 0 deletions
diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt
new file mode 100644
index 0000000..f2c6c54
--- /dev/null
+++ b/utils/CMakeLists.txt
@@ -0,0 +1,22 @@
+SET(UTILSERVERSRC rspamd_http_server.c)
+SET(UTILBENCHSRC rspamd_http_bench.c)
+SET(BASE64SRC base64.c)
+SET(MIMESRC mime_tool.c)
+
+MACRO(ADD_UTIL NAME)
+ ADD_EXECUTABLE("${NAME}" "${ARGN}")
+ SET_TARGET_PROPERTIES("${NAME}" PROPERTIES LINKER_LANGUAGE CXX)
+ TARGET_LINK_LIBRARIES("${NAME}" rspamd-server)
+ IF (ENABLE_SNOWBALL MATCHES "ON")
+ TARGET_LINK_LIBRARIES("${NAME}" stemmer)
+ ENDIF()
+ TARGET_LINK_LIBRARIES("${NAME}" rspamd-hiredis)
+ TARGET_LINK_LIBRARIES("${NAME}" ${RSPAMD_REQUIRED_LIBRARIES})
+ENDMACRO()
+
+IF (ENABLE_UTILS MATCHES "ON")
+ ADD_UTIL(rspamd-http-server ${UTILSERVERSRC})
+ ADD_UTIL(rspamd-http-bench ${UTILBENCHSRC})
+ ADD_UTIL(rspamd-base64 ${BASE64SRC})
+ ADD_UTIL(rspamd-mime-tool ${MIMESRC})
+ENDIF()
diff --git a/utils/asn.pl b/utils/asn.pl
new file mode 100644
index 0000000..4d54bad
--- /dev/null
+++ b/utils/asn.pl
@@ -0,0 +1,331 @@
+#!/usr/bin/env perl
+#
+
+use warnings;
+use strict;
+use autodie;
+
+use File::Basename;
+use File::Fetch;
+use Getopt::Long;
+use Pod::Usage;
+
+use FindBin;
+use lib "$FindBin::Bin/extlib/lib/perl5";
+
+use URI;
+
+my %config = (
+ asn_sources => [
+ 'ftp://ftp.arin.net/pub/stats/arin/delegated-arin-extended-latest',
+ 'ftp://ftp.ripe.net/ripe/stats/delegated-ripencc-latest',
+ 'http://ftp.afrinic.net/pub/stats/afrinic/delegated-afrinic-latest',
+ 'ftp://ftp.apnic.net/pub/stats/apnic/delegated-apnic-latest',
+ 'ftp://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-latest'
+ ],
+ bgp_sources => ['http://data.ris.ripe.net/rrc00/latest-bview.gz']
+);
+
+my $download_asn = 0;
+my $download_bgp = 0;
+my $download_target = "./";
+my $help = 0;
+my $man = 0;
+my $v4 = 1;
+my $v6 = 1;
+my $parse = 1;
+my $v4_zone = "asn.rspamd.com";
+my $v6_zone = "asn6.rspamd.com";
+my $v4_file = "asn.zone";
+my $v6_file = "asn6.zone";
+my $ns_servers = [ "asn-ns.rspamd.com", "asn-ns2.rspamd.com" ];
+my $unknown_placeholder = "--";
+
+GetOptions(
+ "download-asn" => \$download_asn,
+ "download-bgp" => \$download_bgp,
+ "4!" => \$v4,
+ "6!" => \$v6,
+ "parse!" => \$parse,
+ "target=s" => \$download_target,
+ "zone-v4=s" => \$v4_zone,
+ "zone-v6=s" => \$v6_zone,
+ "file-v4=s" => \$v4_file,
+ "file-v6=s" => \$v6_file,
+ "ns-server=s@" => \$ns_servers,
+ "help|?" => \$help,
+ "man" => \$man,
+ "unknown-placeholder" => \$unknown_placeholder,
+) or
+ pod2usage(2);
+
+pod2usage(1) if $help;
+pod2usage(-exitval => 0, -verbose => 2) if $man;
+
+if ($download_asn) {
+ foreach my $u (@{ $config{'asn_sources'} }) {
+ download_file($u);
+ }
+}
+
+if ($download_bgp) {
+ foreach my $u (@{ $config{'bgp_sources'} }) {
+ download_file($u);
+ }
+}
+
+if (!$parse) {
+ exit 0;
+}
+
+# Prefix to ASN map
+my $networks = { 4 => {}, 6 => {} };
+
+foreach my $u (@{ $config{'bgp_sources'} }) {
+ my $parsed = URI->new($u);
+ my $fname = $download_target . '/' . basename($parsed->path);
+
+ use constant {
+ F_MARKER => 0,
+ F_TIMESTAMP => 1,
+ F_PEER_IP => 3,
+ F_PEER_AS => 4,
+ F_PREFIX => 5,
+ F_AS_PATH => 6,
+ F_ORIGIN => 7,
+ };
+
+ open(my $bgpd, '-|', "bgpdump -v -M $fname") or die "can't start bgpdump: $!";
+
+ while (<$bgpd>) {
+ chomp;
+ my @e = split /\|/;
+ if ($e[F_MARKER] ne 'TABLE_DUMP2') {
+ warn "bad line: $_\n";
+ next;
+ }
+
+ my $origin_as;
+ my $prefix = $e[F_PREFIX];
+ my $ip_ver = 6;
+
+ if ($prefix =~ /^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\/\d{1,2}$/) {
+ $ip_ver = 4;
+ }
+
+ if ($e[F_AS_PATH]) {
+
+ # not empty AS_PATH
+ my @as_path = split /\s/, $e[F_AS_PATH];
+ $origin_as = pop @as_path;
+
+ if (substr($origin_as, 0, 1) eq '{') {
+
+ # route is aggregated
+ if ($origin_as =~ /^{(\d+)}$/) {
+
+ # single AS aggregated, just remove { } around
+ $origin_as = $1;
+ } else {
+
+ # use previous AS from AS_PATH
+ $origin_as = pop @as_path;
+ }
+ }
+
+ # strip bogus AS
+ while (is_bougus_asn($origin_as)) {
+ $origin_as = pop @as_path;
+ last if scalar @as_path == 0;
+ }
+ }
+
+ # empty AS_PATH or all AS_PATH elements was stripped as bogus - use
+ # PEER_AS as origin AS
+ $origin_as //= $e[F_PEER_AS];
+
+ $networks->{$ip_ver}{$prefix} = int($origin_as);
+ }
+}
+
+# Remove default routes
+delete $networks->{4}{'0.0.0.0/0'};
+delete $networks->{6}{'::/0'};
+
+# Now roughly detect countries
+my $as_info = {};
+
+# RIR statistics exchange format
+# https://www.apnic.net/publications/media-library/documents/resource-guidelines/rir-statistics-exchange-format
+# https://www.arin.net/knowledge/statistics/nro_extended_stats_format.pdf
+# first 7 fields for this two formats are same
+use constant {
+ F_REGISTRY => 0, # {afrinic,apnic,arin,iana,lacnic,ripencc}
+ F_CC => 1, # ISO 3166 2-letter country code
+ F_TYPE => 2, # {asn,ipv4,ipv6}
+ F_START => 3,
+ F_VALUE => 4,
+ F_DATE => 5,
+ F_STATUS => 6,
+};
+
+foreach my $u (@{ $config{'asn_sources'} }) {
+ my $parsed = URI->new($u);
+ my $fname = $download_target . '/' . basename($parsed->path);
+ open(my $fh, "<", $fname) or die "Cannot open $fname: $!";
+
+ while (<$fh>) {
+ next if /^\#/;
+ chomp;
+ my @elts = split /\|/;
+
+ if ($elts[F_TYPE] eq 'asn' && $elts[F_START] ne '*') {
+ my $as_start = int($elts[F_START]);
+ my $as_end = $as_start + int($elts[F_VALUE]) - 1;
+
+ for my $as ($as_start .. $as_end) {
+ $as_info->{$as}{'country'} = $elts[F_CC];
+ $as_info->{$as}{'rir'} = $elts[F_REGISTRY];
+ }
+ }
+ }
+}
+
+# Write zone files
+my $ns_list = join ' ', @{$ns_servers};
+my $zone_header = << "EOH";
+\$SOA 43200 $ns_servers->[0] support.rspamd.com 0 600 300 86400 300
+\$NS 43200 $ns_list
+EOH
+
+if ($v4) {
+ # create temp file in the same dir so we can be sure that mv is atomic
+ my $out_dir = dirname($v4_file);
+ my $out_file = basename($v4_file);
+ my $temp_file = "$out_dir/.$out_file.tmp";
+ open my $v4_fh, '>', $temp_file;
+ print $v4_fh $zone_header;
+
+ while (my ($net, $asn) = each %{ $networks->{4} }) {
+ my $country = $as_info->{$asn}{'country'} || $unknown_placeholder;
+ my $rir = $as_info->{$asn}{'rir'} || $unknown_placeholder;
+
+ # "8.8.8.0/24 15169|8.8.8.0/24|US|arin|" for 8.8.8.8
+ printf $v4_fh "%s %s|%s|%s|%s|\n", $net, $asn, $net, $country, $rir;
+ }
+
+ close $v4_fh;
+ rename $temp_file, $v4_file;
+}
+
+if ($v6) {
+ my $out_dir = dirname($v6_file);
+ my $out_file = basename($v6_file);
+ my $temp_file = "$out_dir/.$out_file.tmp";
+ open my $v6_fh, '>', $temp_file;
+ print $v6_fh $zone_header;
+
+ while (my ($net, $asn) = each %{ $networks->{6} }) {
+ my $country = $as_info->{$asn}{'country'} || $unknown_placeholder;
+ my $rir = $as_info->{$asn}{'rir'} || $unknown_placeholder;
+
+ # "2606:4700:4700::/48 13335|2606:4700:4700::/48|US|arin|" for 2606:4700:4700::1111
+ printf $v6_fh "%s %s|%s|%s|%s|\n", $net, $asn, $net, $country, $rir;
+ }
+
+ close $v6_fh;
+ rename $temp_file, $v6_file;
+}
+
+exit 0;
+
+########################################################################
+
+sub download_file {
+ my ($url) = @_;
+
+ local $File::Fetch::WARN = 0;
+ local $File::Fetch::TIMEOUT = 180; # connectivity to ftp.lacnic.net is bad
+
+ my $ff = File::Fetch->new(uri => $url);
+ my $where = $ff->fetch(to => $download_target) or
+ die "$url: ", $ff->error;
+
+ return $where;
+}
+
+# Returns true if AS number is bogus
+# e. g. a private AS.
+# List of allocated and reserved AS:
+# https://www.iana.org/assignments/as-numbers/as-numbers.txt
+sub is_bougus_asn {
+ my $as = shift;
+
+ # 64496-64511 Reserved for use in documentation and sample code
+ # 64512-65534 Designated for private use
+ # 65535 Reserved
+ # 65536-65551 Reserved for use in documentation and sample code
+ # 65552-131071 Reserved
+ return 1 if $as >= 64496 && $as <= 131071;
+
+ # Reserved (RFC6996, RFC7300, RFC7607)
+ return 1 if $as == 0 || $as >= 4200000000;
+
+ return 0;
+}
+
+__END__
+
+=head1 NAME
+
+asn.pl - download and parse ASN data for Rspamd
+
+=head1 SYNOPSIS
+
+asn.pl [options]
+
+ Options:
+ --download-asn Download ASN data from RIRs
+ --download-bgp Download BGP full view dump from RIPE RIS
+ --target Where to download files (default: current dir)
+ --zone-v4 IPv4 zone (default: asn.rspamd.com)
+ --zone-v6 IPv6 zone (default: asn6.rspamd.com)
+ --file-v4 IPv4 zone file (default: ./asn.zone)
+ --file-v6 IPv6 zone (default: ./asn6.zone)
+ --unknown-placeholder Placeholder for unknown elements (default: --)
+ --help Brief help message
+ --man Full documentation
+
+=head1 OPTIONS
+
+=over 8
+
+=item B<--download-asn>
+
+Download ASN data from RIR.
+
+=item B<--download-bgp>
+
+Download GeoIP data from Ripe
+
+=item B<--target>
+
+Specifies where to download files.
+
+=item B<--help>
+
+Print a brief help message and exits.
+
+=item B<--man>
+
+Prints the manual page and exits.
+
+=back
+
+=head1 DESCRIPTION
+
+B<asn.pl> is intended to download ASN data and GeoIP data and create a rbldnsd zone.
+
+=cut
+
+# vim: et:ts=4:sw=4
diff --git a/utils/base64.c b/utils/base64.c
new file mode 100644
index 0000000..d1202db
--- /dev/null
+++ b/utils/base64.c
@@ -0,0 +1,89 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "config.h"
+#include "printf.h"
+#include "util.h"
+#include "cryptobox.h"
+#include "unix-std.h"
+
+static gdouble total_time = 0;
+
+
+static void
+rspamd_process_file(const gchar *fname, gint decode)
+{
+ gint fd;
+ gpointer map;
+ struct stat st;
+ guint8 *dest;
+ gsize destlen;
+
+ fd = open(fname, O_RDONLY);
+
+ if (fd == -1) {
+ rspamd_fprintf(stderr, "cannot open %s: %s", fname, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ if (fstat(fd, &st) == -1) {
+ rspamd_fprintf(stderr, "cannot stat %s: %s", fname, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
+ close(fd);
+
+ if (map == MAP_FAILED) {
+ rspamd_fprintf(stderr, "cannot mmap %s: %s", fname, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ if (decode) {
+ destlen = st.st_size / 4 * 3 + 10;
+ dest = g_malloc(destlen);
+ rspamd_cryptobox_base64_decode(map, st.st_size, dest, &destlen);
+ }
+ else {
+ dest = rspamd_encode_base64(map, st.st_size, 80, &destlen);
+ }
+
+ rspamd_printf("%*s", (gint) destlen, dest);
+ g_free(dest);
+
+ munmap(map, st.st_size);
+}
+
+int main(int argc, char **argv)
+{
+ gint i, start = 1, decode = 0;
+
+ if (argc > 2 && *argv[1] == '-') {
+ start = 2;
+
+ if (argv[1][1] == 'd') {
+ decode = 1;
+ }
+ }
+
+ for (i = start; i < argc; i++) {
+ if (argv[i]) {
+ rspamd_process_file(argv[i], decode);
+ }
+ }
+
+ return 0;
+}
diff --git a/utils/cgp_rspamd.pl b/utils/cgp_rspamd.pl
new file mode 100644
index 0000000..6898d26
--- /dev/null
+++ b/utils/cgp_rspamd.pl
@@ -0,0 +1,357 @@
+#!/usr/bin/env perl
+
+use warnings;
+use strict;
+use JSON::XS;
+use AnyEvent;
+use AnyEvent::HTTP;
+use AnyEvent::IO;
+use EV;
+use Pod::Usage;
+use Getopt::Long;
+use File::stat;
+
+my $rspamd_host = "localhost:11333";
+my $man = 0;
+my $help = 0;
+my $local = 0;
+my $header = "X-Spam: yes";
+my $max_size = 10 * 1024 * 1024; # 10 MB
+my $request_timeout = 15; # 15 seconds by default
+my $reject_message = "Spam message rejected";
+
+GetOptions(
+ "host=s" => \$rspamd_host,
+ "header=s" => \$header,
+ "reject-message=s" => \$reject_message,
+ "max-size=i" => \$max_size,
+ "timeout=f" => \$request_timeout,
+ "help|?" => \$help,
+ "man" => \$man
+) or pod2usage(2);
+
+pod2usage(1) if $help;
+pod2usage( -exitval => 0, -verbose => 2 ) if $man;
+
+my $main_domain = cgp_main_domain();
+my $scanned = 0;
+
+# Turn off bufferization as required by CGP
+$| = 1;
+
+sub cgp_main_domain {
+ if ( open( my $fh, 'Settings/Main.settings' ) ) {
+ while (<$fh>) {
+ if (/^\s+DomainName\s+=\s+([^;]+);/) {
+ return $1;
+ }
+ }
+ }
+}
+
+sub cgp_string {
+ my ($in) = @_;
+
+ $in =~ s/\"/\\"/g;
+ $in =~ s/\n/\\n/gms;
+ $in =~ s/\r/\\r/mgs;
+ $in =~ s/\t/ /g;
+
+ return "\"$in\"";
+}
+
+sub rspamd_scan {
+ my ( $tag, $file ) = @_;
+
+ my $http_callback = sub {
+ my ( $body, $hdr ) = @_;
+
+ if ( $hdr && $hdr->{Status} =~ /^2/ ) {
+ my $js = eval('decode_json($body)');
+ $scanned++;
+
+ if ( !$js ) {
+ print "* Rspamd: Bad response for $file: invalid JSON: parse error\n";
+ print "$tag FAILURE\n";
+ }
+ else {
+ my $def = $js;
+ my $headers = "";
+
+ if ( !$def ) {
+ print "* Rspamd: Bad response for $file: invalid JSON: default is missing\n";
+ print "$tag FAILURE\n";
+ }
+ else {
+ my $action = $def->{'action'};
+ my $id = $js->{'message-id'};
+
+ my $symbols = "";
+ while ( my ( $k, $s ) = each( %{ $def->{'symbols'} } ) ) {
+ $symbols .= sprintf "%s(%.2f);", $k, $s->{'score'};
+ }
+
+ printf
+ "* Rspamd: Scanned %s; id: <%s>; Score: %.2f / %.2f; Symbols: [%s]\n",
+ $file, $id, $def->{'score'}, $def->{'required_score'}, $symbols;
+
+ if ( $js->{'dkim-signature'} ) {
+ $headers .= "DKIM-Signature: " . $js->{'dkim-signature'};
+ }
+
+ if ( $js->{'milter'} ) {
+ my $block = $js->{'milter'};
+
+ if ( $block->{'add_headers'} ) {
+ while ( my ( $h, $v ) = each( %{ $block->{'add_headers'} } ) ) {
+ if ( ref($v) eq 'HASH' ) {
+ if ( $headers eq "" ) {
+ $headers .= "$h: $v->{value}";
+ }
+ else {
+ $headers .= "\\e$h: $v->{value}";
+ }
+ }
+ else {
+ if ( $headers eq "" ) {
+ $headers .= "$h: $v";
+ }
+ else {
+ $headers .= "\\e$h: $v";
+ }
+ }
+ }
+ }
+ }
+
+ if ( $action eq 'reject' ) {
+ print "$tag DISCARD\n";
+ return;
+ }
+ elsif ( $action eq 'add header' || $action eq 'rewrite subject' ) {
+ if ( $headers eq "" ) {
+ $headers .= "$header";
+ }
+ else {
+ $headers .= "\\e$header";
+ }
+ }
+ elsif ( $action eq 'soft reject' ) {
+ print "$tag REJECTED Try again later\n";
+ return;
+ }
+
+ if ( $headers eq "" ) {
+ print "$tag OK\n";
+ }
+ else {
+ print "$tag ADDHEADER " . cgp_string($headers) . " OK\n";
+ }
+ }
+ }
+ }
+ else {
+ if ($hdr) {
+ print "* Rspamd: Bad response for $file: HTTP error: $hdr->{Status} $hdr->{Reason}\n";
+ }
+ else {
+ print "* Rspamd: Bad response for $file: IO error: $!\n";
+ }
+ print "$tag FAILURE\n";
+ }
+ };
+
+ if ($local) {
+
+ # Use file scan
+ # XXX: not implemented now due to CGP queue format
+ http_get(
+ "http://$rspamd_host/symbols?file=$file",
+ timeout => $request_timeout,
+ $http_callback
+ );
+ }
+ else {
+ my $sb = stat($file);
+
+ if ( !$sb || $sb->size > $max_size ) {
+ if ($sb) {
+ print "* File $file is too large: " . $sb->size . "\n$tag FAILURE\n";
+
+ }
+ else {
+ print "* Cannot stat $file: $!\n$tag FAILURE\n";
+ }
+ return;
+ }
+ aio_load(
+ $file,
+ sub {
+ my ($data) = @_;
+
+ if ( !$data ) {
+ print "* Cannot open $file: $!\n$tag FAILURE\n";
+ return;
+ }
+
+ # Parse CGP format
+ $data =~ s/^((?:[^\n]*\n)*?)\n(.*)$/$2/ms;
+ my @envelope = split /\n/, $1;
+ chomp(@envelope);
+ my $from;
+ my @rcpts;
+ my $ip;
+ my $user;
+
+ foreach my $elt (@envelope) {
+ if ( $elt =~ /^P\s[^<]*(<[^>]*>).*$/ ) {
+ $from = $1;
+ }
+ elsif ( $elt =~ /^R\s[^<]*(<[^>]*>).*$/ ) {
+ push @rcpts, $1;
+ }
+ elsif ( $elt =~ /^S (?:<([^>]+)> )?(?:SMTP|HTTPU?|AIRSYNC|XIMSS) \[([0-9a-f.:]+)\]/ ) {
+ if ($1) {
+ $user = $1;
+ }
+ if ($2) {
+ $ip = $2;
+ }
+ }
+ elsif ( $elt =~ /^S (?:<([^>]+)> )?(?:DSN|GROUP|LIST|PBX|PIPE|RULE) \[0\.0\.0\.0\]/ ) {
+ if ($1) {
+ $user = $1;
+ }
+ $ip = '127.2.4.7';
+ }
+ }
+
+ my $headers = {};
+ if ( $file =~ /\/([^\/.]+)\.msg$/ ) {
+ $headers->{'Queue-ID'} = $1;
+ }
+ if ($from) {
+ $headers->{From} = $from;
+ }
+ if ( scalar(@rcpts) > 0 ) {
+
+ # XXX: Anyevent cannot parse headers with multiple values
+ $headers->{Rcpt} = join( ',', @rcpts );
+ }
+ if ($ip) {
+ $headers->{IP} = $ip;
+ }
+ if ($user) {
+ $headers->{User} = $user;
+ }
+ if ($main_domain) {
+ $headers->{'MTA-Tag'} = $main_domain;
+ }
+
+ http_post(
+ "http://$rspamd_host/checkv2", $data,
+ timeout => $request_timeout,
+ headers => $headers,
+ $http_callback
+ );
+ }
+ );
+ }
+}
+
+# Show informational message
+print "* Rspamd CGP filter has been started\n";
+
+my $w = AnyEvent->io(
+ fh => \*STDIN,
+ poll => 'r',
+ cb => sub {
+ chomp( my $input = <STDIN> );
+
+ if ( $input =~ /^(\d+)\s+(\S+)(\s+(\S+)\s*)?$/ ) {
+ my $tag = $1;
+ my $cmd = $2;
+
+ if ( $cmd eq "INTF" ) {
+ print "$input\n";
+ }
+ elsif ( $cmd eq "FILE" && $4 ) {
+ my $file = $4;
+ print "* Scanning file $file\n";
+ rspamd_scan $tag, $file;
+ }
+ elsif ( $cmd eq "QUIT" ) {
+ print "* Terminating after scanning of $scanned files\n";
+ print "$tag OK\n";
+ exit 0;
+ }
+ else {
+ print "* Unknown command $cmd\n";
+ print "$tag FAILURE\n";
+ }
+ }
+ }
+);
+
+EV::run;
+
+__END__
+
+=head1 NAME
+
+cgp_rspamd - implements Rspamd filter for CommunigatePro MTA
+
+=head1 SYNOPSIS
+
+cgp_rspamd [options]
+
+ Options:
+ --host=hostport Rspamd host to connect (localhost:11333 by default)
+ --header Add specific header for a spam message ("X-Spam: yes" by default)
+ --reject-message Rejection message for spam mail ("Spam message rejected" by default)
+ --timeout Timeout to read response from Rspamd (15 seconds by default)
+ --max-size Maximum size of message to scan (10 megabytes by default)
+ --help brief help message
+ --man full documentation
+
+=head1 OPTIONS
+
+=over 8
+
+=item B<--host>
+
+Specifies Rspamd host to use for scanning
+
+=item B<--header>
+
+Specifies the header that should be added when Rspamd action is B<add header> or B<rewrite subject>.
+
+=item B<--reject-message>
+
+Specifies the rejection message for spam.
+
+=item B<--timeout>
+
+Sets timeout in seconds for waiting Rspamd reply for a message.
+
+=item B<--max-size>
+
+Define the maximum messages size to be processed by Rspamd in bytes.
+
+=item B<--help>
+
+Print a brief help message and exits.
+
+=item B<--man>
+
+Prints the manual page and exits.
+
+=back
+
+=head1 DESCRIPTION
+
+B<cgp_rspamd> is intended to scan messages processed with B<CommunigatePro> MTA on some Rspamd scanner. It reads
+standard input and parses CGP helpers protocol. On scan requests, this filter can query Rspamd to process a message.
+B<cgp_rspamd> can tell CGP to add header or reject SPAM messages depending on Rspamd scan result.
+
+=cut
diff --git a/utils/classifier_test.pl b/utils/classifier_test.pl
new file mode 100644
index 0000000..238417f
--- /dev/null
+++ b/utils/classifier_test.pl
@@ -0,0 +1,539 @@
+#!/usr/bin/env perl
+
+use warnings;
+use strict;
+use Pod::Usage;
+use Getopt::Long;
+use Time::HiRes qw(gettimeofday tv_interval);
+use JSON::XS;
+use String::ShellQuote;
+use FileHandle;
+use IPC::Open2;
+use Data::Dumper;
+
+my $spam_dir;
+my $ham_dir;
+my $parallel = 1;
+my $classifier = "bayes";
+my $spam_symbol = "BAYES_SPAM";
+my $ham_symbol = "BAYES_HAM";
+my $timeout = 10;
+my $rspamc = $ENV{'RSPAMC'} || "rspamc";
+my $bogofilter = $ENV{'BOGOFILTER'} || "bogofilter";
+my $dspam = $ENV{'DSPAM'} || "dspam";
+my $train_fraction = 0.5;
+my $use_bogofilter = 0;
+my $use_dspam = 0;
+my $check_only = 0;
+my $rspamc_prob_trigger = 95;
+my $man;
+my $help;
+
+GetOptions(
+ "spam|s=s" => \$spam_dir,
+ "ham|h=s" => \$ham_dir,
+ "spam-symbol=s" => \$spam_symbol,
+ "ham-symbol=s" => \$ham_symbol,
+ "classifier|c=s" => \$classifier,
+ "timeout|t=f" => \$timeout,
+ "parallel|p=i" => \$parallel,
+ "train-fraction|t=f" => \$train_fraction,
+ "bogofilter|b" => \$use_bogofilter,
+ "dspam|d" => \$use_dspam,
+ "check-only" => \$check_only,
+ "help|?" => \$help,
+ "man" => \$man
+) or pod2usage(2);
+
+pod2usage(1) if $help;
+pod2usage( -exitval => 0, -verbose => 2 ) if $man;
+
+sub read_dir_files {
+ my ( $dir, $target ) = @_;
+ opendir( my $dh, $dir ) or die "cannot open dir $dir: $!";
+ while ( my $file = readdir $dh ) {
+ if ( -f "$dir/$file" ) {
+ push @{$target}, "$dir/$file";
+ }
+ }
+}
+
+sub shuffle_array {
+ my ($ar) = @_;
+
+ for ( my $i = 0 ; $i < scalar @{$ar} ; $i++ ) {
+ if ( $i > 1 ) {
+ my $sel = int( rand( $i - 1 ) );
+ ( @{$ar}[$i], @{$ar}[$sel] ) = ( @{$ar}[$sel], @{$ar}[$i] );
+ }
+ }
+}
+
+sub learn_rspamc {
+ my ( $files, $spam ) = @_;
+ my $processed = 0;
+
+ my $cmd = $spam ? "learn_spam" : "learn_ham";
+ my $args_quoted = shell_quote @{$files};
+ open( my $p, "$rspamc -t $timeout -c $classifier --compact -j -n $parallel $cmd $args_quoted |" )
+ or die "cannot spawn $rspamc: $!";
+
+ while (<$p>) {
+ my $res = eval('decode_json($_)');
+ if ( $res && $res->{'success'} ) {
+ $processed++;
+ }
+ }
+
+ return $processed;
+}
+
+sub learn_bogofilter {
+ my ( $files, $spam ) = @_;
+ my $processed = 0;
+
+ foreach my $f ( @{$files} ) {
+ my $args_quoted = shell_quote $f;
+ my $fl = $spam ? "-s" : "-n";
+ `$bogofilter -I $args_quoted $fl`;
+ if ( $? == 0 ) {
+ $processed++;
+ }
+ }
+
+ return $processed;
+}
+
+sub learn_dspam {
+ my ( $files, $spam ) = @_;
+ my $processed = 0;
+
+ foreach my $f ( @{$files} ) {
+ my $args_quoted = shell_quote $f;
+ my $fl = $spam ? "--class=spam" : "--class=innocent";
+ open( my $p, "|$dspam --user nobody --source=corpus --stdout --mode=toe $fl" )
+ or die "cannot run $dspam: $!";
+
+ open( my $inp, "< $f" );
+ while (<$inp>) {
+ print $p $_;
+ }
+ }
+
+ return $processed;
+}
+
+sub learn_samples {
+ my ( $ar_ham, $ar_spam ) = @_;
+ my $len;
+ my $processed = 0;
+ my $total = 0;
+ my $learn_func;
+
+ my @files_spam;
+ my @files_ham;
+
+ if ($use_dspam) {
+ $learn_func = \&learn_dspam;
+ }
+ elsif ($use_bogofilter) {
+ $learn_func = \&learn_bogofilter;
+ }
+ else {
+ $learn_func = \&learn_rspamc;
+ }
+
+ $len = int( scalar @{$ar_ham} * $train_fraction );
+ my @cur_vec;
+
+ # Shuffle spam and ham samples
+ for ( my $i = 0 ; $i < $len ; $i++ ) {
+ if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) {
+ push @cur_vec, @{$ar_ham}[$i];
+ push @files_ham, [@cur_vec];
+ @cur_vec = ();
+ $total++;
+ }
+ else {
+ push @cur_vec, @{$ar_ham}[$i];
+ }
+ }
+
+ $len = int( scalar @{$ar_spam} * $train_fraction );
+ @cur_vec = ();
+ for ( my $i = 0 ; $i < $len ; $i++ ) {
+ if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) {
+ push @cur_vec, @{$ar_spam}[$i];
+ push @files_spam, [@cur_vec];
+ @cur_vec = ();
+ $total++;
+ }
+ else {
+ push @cur_vec, @{$ar_spam}[$i];
+ }
+ }
+
+ for ( my $i = 0 ; $i < $total ; $i++ ) {
+ my $args;
+ my $spam;
+
+ if ( $i % 2 == 0 ) {
+ $args = pop @files_spam;
+
+ if ( !$args ) {
+ $args = pop @files_ham;
+ $spam = 0;
+ }
+ else {
+ $spam = 1;
+ }
+ }
+ else {
+ $args = pop @files_ham;
+ if ( !$args ) {
+ $args = pop @files_spam;
+ $spam = 1;
+ }
+ else {
+ $spam = 0;
+ }
+ }
+
+ my $r = $learn_func->( $args, $spam );
+ if ($r) {
+ $processed += $r;
+ }
+ }
+
+ return $processed;
+}
+
+sub check_rspamc {
+ my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
+
+ my $args_quoted = shell_quote @{$files};
+ my $processed = 0;
+
+ open(
+ my $p,
+"$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted |"
+ ) or die "cannot spawn $rspamc: $!";
+
+ while (<$p>) {
+ my $res = eval('decode_json($_)');
+ if ( $res && $res->{'default'} ) {
+ $processed++;
+
+ if ($spam) {
+ if ( $res->{'default'}->{$ham_symbol} ) {
+ my $m = $res->{'default'}->{$ham_symbol}->{'options'}->[0];
+ if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) {
+ my $percentage = int($1);
+ if ( $percentage >= $rspamc_prob_trigger ) {
+ $$fp_cnt++;
+ }
+ }
+ else {
+ $$fp_cnt++;
+ }
+ }
+ elsif ( !$res->{'default'}->{$spam_symbol} ) {
+ $$fn_cnt++;
+ }
+ else {
+ $$detected_cnt++;
+ }
+ }
+ else {
+ if ( $res->{'default'}->{$spam_symbol} ) {
+ my $m = $res->{'default'}->{$spam_symbol}->{'options'}->[0];
+ if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) {
+
+ my $percentage = int($1);
+ if ( $percentage >= $rspamc_prob_trigger ) {
+ $$fp_cnt++;
+ }
+ }
+ else {
+ $$fp_cnt++;
+ }
+ }
+ elsif ( !$res->{'default'}->{$ham_symbol} ) {
+ $$fn_cnt++;
+ }
+ else {
+ $$detected_cnt++;
+ }
+ }
+ }
+ }
+
+ return $processed;
+}
+
+sub check_bogofilter {
+ my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
+ my $processed = 0;
+
+ foreach my $f ( @{$files} ) {
+ my $args_quoted = shell_quote $f;
+
+ open( my $p, "$bogofilter -t -I $args_quoted |" )
+ or die "cannot spawn $bogofilter: $!";
+
+ while (<$p>) {
+ if ( $_ =~ /^([SHU])\s+.*$/ ) {
+ $processed++;
+
+ if ($spam) {
+ if ( $1 eq 'H' ) {
+ $$fp_cnt++;
+ }
+ elsif ( $1 eq 'U' ) {
+ $$fn_cnt++;
+ }
+ else {
+ $$detected_cnt++;
+ }
+ }
+ else {
+ if ( $1 eq 'S' ) {
+ $$fp_cnt++;
+ }
+ elsif ( $1 eq 'U' ) {
+ $$fn_cnt++;
+ }
+ else {
+ $$detected_cnt++;
+ }
+ }
+ }
+ }
+ }
+
+ return $processed;
+}
+
+sub check_dspam {
+ my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
+ my $processed = 0;
+
+ foreach my $f ( @{$files} ) {
+ my $args_quoted = shell_quote $f;
+
+ my $pid = open2( *Reader, *Writer, "$dspam --user nobody --classify --stdout --mode=notrain" );
+ open( my $inp, "< $f" );
+ while (<$inp>) {
+ print Writer $_;
+ }
+ close Writer;
+
+ while (<Reader>) {
+ if ( $_ =~ qr(^X-DSPAM-Result: nobody; result="([^"]+)"; class="[^"]+"; probability=(\d+(?:\.\d+)?).*$) ) {
+ $processed++;
+ my $percentage = int( $2 * 100.0 );
+
+ if ($spam) {
+ if ( $1 eq 'Innocent' ) {
+ if ( $percentage <= ( 100 - $rspamc_prob_trigger ) ) {
+ $$fp_cnt++;
+ }
+ }
+ elsif ( $1 ne 'Spam' ) {
+ $$fn_cnt++;
+ }
+ else {
+ $$detected_cnt++;
+ }
+ }
+ else {
+ if ( $1 eq 'Spam' ) {
+ if ( $percentage >= $rspamc_prob_trigger ) {
+ $$fp_cnt++;
+ }
+ }
+ elsif ( $1 ne 'Innocent' ) {
+ $$fn_cnt++;
+ }
+ else {
+ $$detected_cnt++;
+ }
+ }
+ }
+ }
+ close Reader;
+ waitpid( $pid, 0 );
+ }
+
+ return $processed;
+}
+
+sub cross_validate {
+ my ($hr) = @_;
+ my $args = "";
+ my $processed = 0;
+ my $fp_spam = 0;
+ my $fn_spam = 0;
+ my $fp_ham = 0;
+ my $fn_ham = 0;
+ my $total_spam = 0;
+ my $total_ham = 0;
+ my $detected_spam = 0;
+ my $detected_ham = 0;
+ my $i = 0;
+ my $len = scalar keys %{$hr};
+ my @files_spam;
+ my @files_ham;
+ my @cur_spam;
+ my @cur_ham;
+ my $check_func;
+
+ if ($use_dspam) {
+ $check_func = \&check_dspam;
+ }
+ elsif ($use_bogofilter) {
+ $check_func = \&check_bogofilter;
+ }
+ else {
+ $check_func = \&check_rspamc;
+ }
+
+ while ( my ( $fn, $spam ) = each( %{$hr} ) ) {
+ if ($spam) {
+ if ( scalar @cur_spam >= $parallel || $i == $len - 1 ) {
+ push @cur_spam, $fn;
+ push @files_spam, [@cur_spam];
+ @cur_spam = ();
+ }
+ else {
+ push @cur_spam, $fn;
+ }
+ }
+ else {
+ if ( scalar @cur_ham >= $parallel || $i == $len - 1 ) {
+ push @cur_ham, $fn;
+ push @files_ham, [@cur_ham];
+ @cur_ham = ();
+ }
+ else {
+ push @cur_ham, $fn;
+ }
+ }
+ }
+
+ shuffle_array( \@files_spam );
+
+ foreach my $fn (@files_spam) {
+ my $r = $check_func->( $fn, 1, \$fp_ham, \$fn_spam, \$detected_spam );
+ $total_spam += $r;
+ $processed += $r;
+ }
+
+ shuffle_array( \@files_ham );
+
+ foreach my $fn (@files_ham) {
+ my $r = $check_func->( $fn, 0, \$fp_spam, \$fn_ham, \$detected_ham );
+ $total_ham += $r;
+ $processed += $r;
+ }
+
+ printf "Scanned %d messages
+%d spam messages (%d detected)
+%d ham messages (%d detected)\n", $processed, $total_spam, $detected_spam, $total_ham, $detected_ham;
+
+ printf "\nHam FP rate: %.2f%% (%d messages)
+Ham FN rate: %.2f%% (%d messages)\n", $fp_ham / $total_ham * 100.0, $fp_ham, $fn_ham / $total_ham * 100.0, $fn_ham;
+
+ printf "\nSpam FP rate: %.2f%% (%d messages)
+Spam FN rate: %.2f%% (%d messages)\n",
+ $fp_spam / $total_spam * 100.0, $fp_spam,
+ $fn_spam / $total_spam * 100.0, $fn_spam;
+}
+
+if ( !$spam_dir || !$ham_dir ) {
+ die "spam or/and ham directories are not specified";
+}
+
+my @spam_samples;
+my @ham_samples;
+
+read_dir_files( $spam_dir, \@spam_samples );
+read_dir_files( $ham_dir, \@ham_samples );
+shuffle_array( \@spam_samples );
+shuffle_array( \@ham_samples );
+
+if ( !$check_only ) {
+ my $learned = 0;
+ my $t0 = [gettimeofday];
+ $learned = learn_samples( \@ham_samples, \@spam_samples );
+ my $t1 = [gettimeofday];
+
+ printf "Learned classifier, %d items processed, %.2f seconds elapsed\n", $learned, tv_interval( $t0, $t1 );
+}
+
+my %validation_set;
+my $len = int( scalar @spam_samples * $train_fraction );
+for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) {
+ $validation_set{ $spam_samples[$i] } = 1;
+}
+
+$len = int( scalar @ham_samples * $train_fraction );
+for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) {
+ $validation_set{ $ham_samples[$i] } = 0;
+}
+
+cross_validate( \%validation_set );
+
+__END__
+
+=head1 NAME
+
+classifier_test.pl - test various parameters for a classifier
+
+=head1 SYNOPSIS
+
+classifier_test.pl [options]
+
+ Options:
+ --spam Directory with spam files
+ --ham Directory with ham files
+ --spam-symbol Symbol for spam (default: BAYES_SPAM)
+ --ham-symbol Symbol for ham (default: BAYES_HAM)
+ --classifier Classifier to test (default: bayes)
+ --timeout Timeout for rspamc (default: 10)
+ --parallel Parallel execution (default: 1)
+ --help Brief help message
+ --man Full documentation
+
+=head1 OPTIONS
+
+=over 8
+
+=item B<--spam>
+
+Directory with spam files.
+
+=item B<--ham>
+
+Directory with ham files.
+
+=item B<--classifier>
+
+Specifies classifier name to test.
+
+=item B<--help>
+
+Print a brief help message and exits.
+
+=item B<--man>
+
+Prints the manual page and exits.
+
+=back
+
+=head1 DESCRIPTION
+
+B<classifier_test.pl> is intended to test Rspamd classifier for false positives, false negatives and other parameters.
+It uses half of the corpus for training and half for cross-validation.
+
+=cut
diff --git a/utils/fann_train.pl b/utils/fann_train.pl
new file mode 100755
index 0000000..2ce422e
--- /dev/null
+++ b/utils/fann_train.pl
@@ -0,0 +1,247 @@
+#!/usr/bin/env perl
+
+# This script is a very simple prototype to learn fann from rspamd logs
+# For now, it is intended for internal use only
+
+use strict;
+use warnings FATAL => 'all';
+use AI::FANN qw(:all);
+use Getopt::Std;
+
+my %sym_idx; # Symbols by index
+my %sym_names; # Symbols by name
+my $num = 1; # Number of symbols
+my @spam;
+my @ham;
+my $max_samples = -1;
+my $split = 1;
+my $preprocessed = 0; # output is in format <score>:<0|1>:<SYM1,...SYMN>
+my $score_spam = 12;
+my $score_ham = -6;
+
+sub process {
+ my ( $input, $spam, $ham ) = @_;
+ my $samples = 0;
+
+ while (<$input>) {
+ if ( !$preprocessed ) {
+ if (/^.*rspamd_task_write_log.*: \[(-?\d+\.?\d*)\/(\d+\.?\d*)\]\s*\[(.+)\].*$/) {
+ if ( $1 > $score_spam ) {
+ $_ = "$1:1: $3";
+ }
+ elsif ( $1 < $score_ham ) {
+ $_ = "$1:0: $3\n";
+ }
+ else {
+ # Out of boundary
+ next;
+ }
+ }
+ else {
+ # Not our log message
+ next;
+ }
+ }
+
+ $_ =~ /^(-?\d+\.?\d*):([01]):\s*(\S.*)$/;
+
+ my $is_spam = 0;
+
+ if ( $2 == 1 ) {
+ $is_spam = 1;
+ }
+
+ my @ar = split /,/, $3;
+ my %sample;
+
+ foreach my $sym (@ar) {
+ chomp $sym;
+ if ( !$sym_idx{$sym} ) {
+ $sym_idx{$sym} = $num;
+ $sym_names{$num} = $sym;
+ $num++;
+ }
+
+ $sample{ $sym_idx{$sym} } = 1;
+ }
+
+ if ($is_spam) {
+ push @{$spam}, \%sample;
+ }
+ else {
+ push @{$ham}, \%sample;
+ }
+
+ $samples++;
+ if ( $max_samples > 0 && $samples > $max_samples ) {
+ return;
+ }
+ }
+}
+
+# Shuffle array
+sub fisher_yates_shuffle {
+ my $array = shift;
+ my $i = @$array;
+
+ while ( --$i ) {
+ my $j = int rand( $i + 1 );
+ @$array[ $i, $j ] = @$array[ $j, $i ];
+ }
+}
+
+# Train network
+sub train {
+ my ( $ann, $sample, $result ) = @_;
+
+ my @row;
+
+ for ( my $i = 1 ; $i < $num ; $i++ ) {
+ if ( $sample->{$i} ) {
+ push @row, 1;
+ }
+ else {
+ push @row, 0;
+ }
+ }
+
+ #print "@row -> @{$result}\n";
+
+ $ann->train( \@row, \@{$result} );
+}
+
+sub test {
+ my ( $ann, $sample ) = @_;
+
+ my @row;
+
+ for ( my $i = 1 ; $i < $num ; $i++ ) {
+ if ( $sample->{$i} ) {
+ push @row, 1;
+ }
+ else {
+ push @row, 0;
+ }
+ }
+
+ my $ret = $ann->run( \@row );
+
+ return $ret;
+}
+
+my %opts;
+getopts( 'o:i:s:n:t:hpS:H:', \%opts );
+
+if ( $opts{'h'} ) {
+ print "$0 [-i input] [-o output] [-s scores] [-n max_samples] [-S spam_score] [-H ham_score] [-ph]\n";
+ exit;
+}
+
+my $input = *STDIN;
+
+if ( $opts{'i'} ) {
+ open( $input, '<', $opts{'i'} ) or die "cannot open $opts{i}";
+}
+
+if ( $opts{'n'} ) {
+ $max_samples = $opts{'n'};
+}
+
+if ( $opts{'t'} ) {
+
+ # Test split
+ $split = $opts{'t'};
+}
+if ( $opts{'p'} ) {
+ $preprocessed = 1;
+}
+
+if ( $opts{'H'} ) {
+ $score_ham = $opts{'H'};
+}
+
+if ( $opts{'S'} ) {
+ $score_spam = $opts{'S'};
+}
+
+# ham_prob, spam_prob
+my @spam_out = (1);
+my @ham_out = (0);
+
+process( $input, \@spam, \@ham );
+fisher_yates_shuffle( \@spam );
+fisher_yates_shuffle( \@ham );
+
+my $nspam = int( scalar(@spam) / $split );
+my $nham = int( scalar(@ham) / $split );
+
+my $ann = AI::FANN->new_standard( $num - 1, ( $num + 2 ) / 2, 1 );
+
+my @train_data;
+
+# Train ANN
+for ( my $i = 0 ; $i < $nham ; $i++ ) {
+ push @train_data, [ $ham[$i], \@ham_out ];
+}
+
+for ( my $i = 0 ; $i < $nspam ; $i++ ) {
+ push @train_data, [ $spam[$i], \@spam_out ];
+}
+
+fisher_yates_shuffle( \@train_data );
+
+foreach my $train_row (@train_data) {
+ train( $ann, @{$train_row}[0], @{$train_row}[1] );
+}
+
+print "Trained $nspam SPAM and $nham HAM samples\n";
+
+# Now run fann
+if ( $split > 1 ) {
+ my $sample = 0.0;
+ my $correct = 0.0;
+ for ( my $i = $nham ; $i < $nham * $split ; $i++ ) {
+ my $ret = test( $ann, $ham[$i] );
+
+ #print "@{$ret}\n";
+ if ( @{$ret}[0] < 0.5 ) {
+ $correct++;
+ }
+ $sample++;
+ }
+
+ print "Tested $sample HAM samples, correct matched: $correct, rate: " . ( $correct / $sample ) . "\n";
+
+ $sample = 0.0;
+ $correct = 0.0;
+
+ for ( my $i = $nspam ; $i < $nspam * $split ; $i++ ) {
+ my $ret = test( $ann, $spam[$i] );
+
+ #print "@{$ret}\n";
+ if ( @{$ret}[0] > 0.5 ) {
+ $correct++;
+ }
+ $sample++;
+ }
+
+ print "Tested $sample SPAM samples, correct matched: $correct, rate: " . ( $correct / $sample ) . "\n";
+}
+
+if ( $opts{'o'} ) {
+ $ann->save( $opts{'o'} ) or die "cannot save ann into $opts{o}";
+}
+
+if ( $opts{'s'} ) {
+ open( my $scores, '>', $opts{'s'} ) or die "cannot open score file $opts{'s'}";
+ print $scores "{";
+ for ( my $i = 1 ; $i < $num ; $i++ ) {
+ my $n = $i - 1;
+ if ( $i != $num - 1 ) {
+ print $scores "\"$sym_names{$i}\":$n,";
+ }
+ else {
+ print $scores "\"$sym_names{$i}\":$n}\n";
+ }
+ }
+}
diff --git a/utils/rspamd_http_bench.c b/utils/rspamd_http_bench.c
new file mode 100644
index 0000000..232fc8a
--- /dev/null
+++ b/utils/rspamd_http_bench.c
@@ -0,0 +1,411 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "config.h"
+#include "rspamd.h"
+#include "util.h"
+#include "libutil/http.h"
+#include "libutil/http_private.h"
+#include "ottery.h"
+#include "cryptobox.h"
+#include "unix-std.h"
+#include <math.h>
+#include <netinet/tcp.h>
+
+#ifdef HAVE_SYS_WAIT_H
+#include <sys/wait.h>
+#endif
+
+static guint port = 43000;
+static gchar *host = "127.0.0.1";
+static gchar *server_key = NULL;
+static guint cache_size = 10;
+static guint nworkers = 1;
+static gboolean openssl_mode = FALSE;
+static guint file_size = 500;
+static guint pconns = 100;
+static gdouble test_time = 10.0;
+static gchar *latencies_file = NULL;
+static gboolean csv_output = FALSE;
+
+/* Dynamic vars */
+static rspamd_inet_addr_t *addr;
+static guint32 workers_left = 0;
+static guint32 *conns_done = NULL;
+static const guint store_latencies = 1000;
+static guint32 conns_pending = 0;
+
+static GOptionEntry entries[] = {
+ {"port", 'p', 0, G_OPTION_ARG_INT, &port,
+ "Port number (default: 43000)", NULL},
+ {"cache", 'c', 0, G_OPTION_ARG_INT, &cache_size,
+ "Keys cache size (default: 10)", NULL},
+ {"workers", 'n', 0, G_OPTION_ARG_INT, &nworkers,
+ "Number of workers to start (default: 1)", NULL},
+ {"size", 's', 0, G_OPTION_ARG_INT, &file_size,
+ "Size of payload to transfer (default: 500)", NULL},
+ {"conns", 'C', 0, G_OPTION_ARG_INT, &pconns,
+ "Number of parallel connections (default: 100)", NULL},
+ {"time", 't', 0, G_OPTION_ARG_DOUBLE, &test_time,
+ "Time to run tests (default: 10.0 sec)", NULL},
+ {"openssl", 'o', 0, G_OPTION_ARG_NONE, &openssl_mode,
+ "Use openssl crypto", NULL},
+ {"host", 'h', 0, G_OPTION_ARG_STRING, &host,
+ "Connect to the specified host (default: localhost)", NULL},
+ {"key", 'k', 0, G_OPTION_ARG_STRING, &server_key,
+ "Use the specified key (base32 encoded)", NULL},
+ {"latency", 'l', 0, G_OPTION_ARG_FILENAME, &latencies_file,
+ "Write latencies to the specified file", NULL},
+ {"csv", 0, 0, G_OPTION_ARG_NONE, &csv_output,
+ "Output CSV", NULL},
+ {NULL, 0, 0, G_OPTION_ARG_NONE, NULL, NULL, NULL}};
+
+struct lat_elt {
+ gdouble lat;
+ guchar checked;
+};
+
+static struct lat_elt *latencies;
+
+static gint
+rspamd_client_body(struct rspamd_http_connection *conn,
+ struct rspamd_http_message *msg,
+ const gchar *chunk, gsize len)
+{
+ g_assert(chunk[0] == '\0');
+
+ return 0;
+}
+
+struct client_cbdata {
+ struct lat_elt *lat;
+ guint32 *wconns;
+ gdouble ts;
+ struct ev_loop *ev_base;
+};
+
+static void
+rspamd_client_err(struct rspamd_http_connection *conn, GError *err)
+{
+ msg_info("abnormally closing connection from: error: %s",
+ err->message);
+
+ g_assert(0);
+ close(conn->fd);
+ rspamd_http_connection_unref(conn);
+}
+
+static gint
+rspamd_client_finish(struct rspamd_http_connection *conn,
+ struct rspamd_http_message *msg)
+{
+ struct client_cbdata *cb = conn->ud;
+
+ cb->lat->lat = rspamd_get_ticks() - cb->ts;
+ cb->lat->checked = TRUE;
+ (*cb->wconns)++;
+ conns_pending--;
+ close(conn->fd);
+ rspamd_http_connection_unref(conn);
+ g_free(cb);
+
+ if (conns_pending == 0) {
+ event_base_loopexit(cb->ev_base, NULL);
+ }
+
+ return 0;
+}
+
+static void
+rspamd_http_client_func(struct ev_loop *ev_base, struct lat_elt *latency,
+ guint32 *wconns,
+ struct rspamd_cryptobox_pubkey *peer_key,
+ struct rspamd_cryptobox_keypair *client_key,
+ struct rspamd_keypair_cache *c)
+{
+ struct rspamd_http_message *msg;
+ struct rspamd_http_connection *conn;
+ gchar urlbuf[PATH_MAX];
+ struct client_cbdata *cb;
+ gint fd, flags;
+
+ fd = rspamd_inet_address_connect(addr, SOCK_STREAM, TRUE);
+ g_assert(fd != -1);
+ flags = 1;
+ (void) setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &flags, sizeof(flags));
+ conn = rspamd_http_connection_new(rspamd_client_body,
+ rspamd_client_err,
+ rspamd_client_finish,
+ RSPAMD_HTTP_CLIENT_SIMPLE,
+ RSPAMD_HTTP_CLIENT,
+ c,
+ NULL);
+ rspamd_snprintf(urlbuf, sizeof(urlbuf), "http://%s/%d", host, file_size);
+ msg = rspamd_http_message_from_url(urlbuf);
+
+ g_assert(conn != NULL && msg != NULL);
+
+ if (peer_key != NULL) {
+ g_assert(client_key != NULL);
+ rspamd_http_connection_set_key(conn, client_key);
+ msg->peer_key = rspamd_pubkey_ref(peer_key);
+ }
+
+ cb = g_malloc(sizeof(*cb));
+ cb->ts = rspamd_get_ticks();
+ cb->lat = latency;
+ cb->ev_base = ev_base;
+ cb->wconns = wconns;
+ latency->checked = FALSE;
+ rspamd_http_connection_write_message(conn, msg, NULL, NULL, cb,
+ fd, NULL, ev_base);
+}
+
+static void
+rspamd_worker_func(struct lat_elt *plat, guint32 *wconns)
+{
+ guint i, j;
+ struct ev_loop *ev_base;
+ struct itimerval itv;
+ struct rspamd_keypair_cache *c = NULL;
+ struct rspamd_cryptobox_keypair *client_key = NULL;
+ struct rspamd_cryptobox_pubkey *peer_key = NULL;
+
+ if (server_key) {
+ peer_key = rspamd_pubkey_from_base32(server_key, 0, RSPAMD_KEYPAIR_KEX,
+ openssl_mode ? RSPAMD_CRYPTOBOX_MODE_NIST : RSPAMD_CRYPTOBOX_MODE_25519);
+ g_assert(peer_key != NULL);
+ client_key = rspamd_keypair_new(RSPAMD_KEYPAIR_KEX,
+ openssl_mode ? RSPAMD_CRYPTOBOX_MODE_NIST : RSPAMD_CRYPTOBOX_MODE_25519);
+
+ if (cache_size > 0) {
+ c = rspamd_keypair_cache_new(cache_size);
+ }
+ }
+
+ memset(&itv, 0, sizeof(itv));
+ double_to_tv(test_time, &itv.it_value);
+
+ ev_base = event_init();
+ g_assert(setitimer(ITIMER_REAL, &itv, NULL) != -1);
+
+ for (i = 0;; i = (i + 1) % store_latencies) {
+ for (j = 0; j < pconns; j++) {
+ rspamd_http_client_func(ev_base, &plat[i * pconns + j],
+ wconns, peer_key, client_key, c);
+ }
+
+ conns_pending = pconns;
+
+ event_base_loop(ev_base, 0);
+ }
+}
+
+static int
+cmpd(const void *p1, const void *p2)
+{
+ const struct lat_elt *d1 = p1, *d2 = p2;
+
+ return (d1->lat) - (d2->lat);
+}
+
+double
+rspamd_http_calculate_mean(struct lat_elt *lats, double *std)
+{
+ guint i, cnt, checked = 0;
+ gdouble mean = 0., dev = 0.;
+
+ cnt = store_latencies * pconns;
+ qsort(lats, cnt, sizeof(*lats), cmpd);
+
+ for (i = 0; i < cnt; i++) {
+ if (lats[i].checked) {
+ mean += lats[i].lat;
+ checked++;
+ }
+ }
+
+ g_assert(checked > 0);
+ mean /= checked;
+
+ for (i = 0; i < cnt; i++) {
+ if (lats[i].checked) {
+ dev += pow((lats[i].lat - mean), 2);
+ }
+ }
+
+ dev /= checked;
+
+ *std = sqrt(dev);
+ return mean;
+}
+
+static void
+rspamd_http_start_workers(pid_t *sfd)
+{
+ guint i;
+ for (i = 0; i < nworkers; i++) {
+ sfd[i] = fork();
+ g_assert(sfd[i] != -1);
+
+ if (sfd[i] == 0) {
+ gperf_profiler_init(NULL, "http-bench");
+ rspamd_worker_func(&latencies[i * pconns * store_latencies],
+ &conns_done[i]);
+ gperf_profiler_stop();
+ exit(EXIT_SUCCESS);
+ }
+
+ workers_left++;
+ }
+}
+
+static void
+rspamd_http_stop_workers(pid_t *sfd)
+{
+ guint i;
+ gint res;
+
+ for (i = 0; i < nworkers; i++) {
+ kill(sfd[i], SIGTERM);
+ wait(&res);
+ }
+}
+
+static void
+rspamd_http_bench_term(int fd, short what, void *arg)
+{
+ pid_t *sfd = arg;
+
+ rspamd_http_stop_workers(sfd);
+ event_loopexit(NULL);
+}
+
+static void
+rspamd_http_bench_cld(int fd, short what, void *arg)
+{
+ gint res;
+
+ while (waitpid(-1, &res, WNOHANG) > 0) {
+ if (--workers_left == 0) {
+ event_loopexit(NULL);
+ }
+ }
+}
+
+
+int main(int argc, char **argv)
+{
+ GOptionContext *context;
+ GError *error = NULL;
+ pid_t *sfd;
+ struct ev_loop *ev_base;
+ rspamd_mempool_t *pool = rspamd_mempool_new(8192, "http-bench");
+ struct event term_ev, int_ev, cld_ev;
+ guint64 total_done;
+ FILE *lat_file;
+ gdouble mean, std;
+ guint i;
+
+ rspamd_init_libs();
+
+ context = g_option_context_new(
+ "rspamd-http-bench - test server for benchmarks");
+ g_option_context_set_summary(context,
+ "Summary:\n Rspamd test HTTP benchmark " RVERSION
+ "\n Release id: " RID);
+ g_option_context_add_main_entries(context, entries, NULL);
+
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ rspamd_fprintf(stderr, "option parsing failed: %s\n", error->message);
+ g_error_free(error);
+ exit(EXIT_FAILURE);
+ }
+
+ rspamd_parse_inet_address(&addr, host, 0);
+ g_assert(addr != NULL);
+ rspamd_inet_address_set_port(addr, port);
+
+ latencies = rspamd_mempool_alloc_shared(pool,
+ nworkers * pconns * store_latencies * sizeof(*latencies));
+ sfd = g_malloc(sizeof(*sfd) * nworkers);
+ conns_done = rspamd_mempool_alloc_shared(pool, sizeof(guint32) * nworkers);
+ memset(conns_done, 0, sizeof(guint32) * nworkers);
+
+ rspamd_http_start_workers(sfd);
+
+ ev_base = event_init();
+
+ event_set(&term_ev, SIGTERM, EV_SIGNAL, rspamd_http_bench_term, sfd);
+ event_base_set(ev_base, &term_ev);
+ event_add(&term_ev, NULL);
+ event_set(&int_ev, SIGINT, EV_SIGNAL, rspamd_http_bench_term, sfd);
+ event_base_set(ev_base, &int_ev);
+ event_add(&int_ev, NULL);
+ event_set(&cld_ev, SIGCHLD, EV_SIGNAL | EV_PERSIST,
+ rspamd_http_bench_cld, NULL);
+ event_base_set(ev_base, &cld_ev);
+ event_add(&cld_ev, NULL);
+
+ event_base_loop(ev_base, 0);
+
+ total_done = 0;
+ for (i = 0; i < nworkers; i++) {
+ total_done += conns_done[i];
+ }
+
+ mean = rspamd_http_calculate_mean(latencies, &std);
+
+ if (!csv_output) {
+ rspamd_printf(
+ "Made %L connections of size %d in %.6fs, %.6f cps, %.6f MB/sec\n",
+ total_done,
+ file_size,
+ test_time,
+ total_done / test_time,
+ total_done * file_size / test_time / (1024.0 * 1024.0));
+ rspamd_printf("Latency: %.6f ms mean, %.6f dev\n",
+ mean * 1000.0, std * 1000.0);
+ }
+ else {
+ /* size,connections,time,mean,stddev,conns,workers */
+ rspamd_printf("%ud,%L,%.1f,%.6f,%.6f,%ud,%ud\n",
+ file_size,
+ total_done,
+ test_time,
+ mean * 1000.0,
+ std * 1000.0,
+ pconns,
+ nworkers);
+ }
+
+ if (latencies_file) {
+ lat_file = fopen(latencies_file, "w");
+
+ if (lat_file) {
+ for (i = 0; i < store_latencies * pconns; i++) {
+ if (latencies[i].checked) {
+ rspamd_fprintf(lat_file, "%.6f\n", latencies[i].lat);
+ }
+ }
+
+ fclose(lat_file);
+ }
+ }
+
+ rspamd_mempool_delete(pool);
+
+ return 0;
+}
diff --git a/utils/rspamd_http_server.c b/utils/rspamd_http_server.c
new file mode 100644
index 0000000..ecd1d38
--- /dev/null
+++ b/utils/rspamd_http_server.c
@@ -0,0 +1,300 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "config.h"
+#include "rspamd.h"
+#include "util.h"
+#include "libutil/fstring.h"
+#include "libutil/http.h"
+#include "libutil/http_private.h"
+#include "ottery.h"
+#include "cryptobox.h"
+#include "keypair.h"
+#include "unix-std.h"
+#include <math.h>
+
+#ifdef HAVE_SYS_WAIT_H
+#include <sys/wait.h>
+#endif
+
+static guint port = 43000;
+static guint cache_size = 10;
+static guint nworkers = 1;
+static gboolean openssl_mode = FALSE;
+static GHashTable *maps = NULL;
+static gchar *key = NULL;
+static struct rspamd_keypair_cache *c;
+static struct rspamd_cryptobox_keypair *server_key;
+static struct timeval io_tv = {
+ .tv_sec = 20,
+ .tv_usec = 0};
+
+static GOptionEntry entries[] = {
+ {"port", 'p', 0, G_OPTION_ARG_INT, &port,
+ "Port number (default: 43000)", NULL},
+ {"cache", 'c', 0, G_OPTION_ARG_INT, &cache_size,
+ "Keys cache size (default: 10)", NULL},
+ {"workers", 'n', 0, G_OPTION_ARG_INT, &nworkers,
+ "Number of workers to start (default: 1)", NULL},
+ {"openssl", 'o', 0, G_OPTION_ARG_NONE, &openssl_mode,
+ "Use openssl crypto", NULL},
+ {"key", 'k', 0, G_OPTION_ARG_STRING, &key,
+ "Use static keypair instead of new one (base32 encoded sk || pk)", NULL},
+ {NULL, 0, 0, G_OPTION_ARG_NONE, NULL, NULL, NULL}};
+
+struct rspamd_http_server_session {
+ struct rspamd_http_connection *conn;
+ struct ev_loop *ev_base;
+ guint req_size;
+ gboolean reply;
+ gint fd;
+};
+
+static void
+rspamd_server_error(struct rspamd_http_connection *conn,
+ GError *err)
+{
+ struct rspamd_http_server_session *session = conn->ud;
+
+ rspamd_fprintf(stderr, "http error occurred: %s\n", err->message);
+ rspamd_http_connection_unref(conn);
+ close(session->fd);
+ g_slice_free1(sizeof(*session), session);
+}
+
+static int
+rspamd_server_finish(struct rspamd_http_connection *conn,
+ struct rspamd_http_message *msg)
+{
+ struct rspamd_http_server_session *session = conn->ud;
+ struct rspamd_http_message *reply;
+ gulong size;
+ const gchar *url_str;
+ guint url_len;
+ rspamd_fstring_t *body;
+
+ if (!session->reply) {
+ session->reply = TRUE;
+ reply = rspamd_http_new_message(HTTP_RESPONSE);
+ url_str = msg->url->str;
+ url_len = msg->url->len;
+
+ if (url_str[0] == '/') {
+ url_str++;
+ url_len--;
+ }
+
+ if (rspamd_strtoul(url_str, url_len, &size)) {
+ session->req_size = size;
+
+ reply->code = 200;
+ reply->status = rspamd_fstring_new_init("OK", 2);
+ body = rspamd_fstring_sized_new(size);
+ body->len = size;
+ memset(body->str, 0, size);
+ rspamd_http_message_set_body_from_fstring_steal(msg, body);
+ }
+ else {
+ reply->code = 404;
+ reply->status = rspamd_fstring_new_init("Not found", 9);
+ }
+
+ rspamd_http_connection_reset(conn);
+ rspamd_http_connection_write_message(conn, reply, NULL,
+ "application/octet-stream", session, session->fd,
+ &io_tv, session->ev_base);
+ }
+ else {
+ /* Destroy session */
+ rspamd_http_connection_unref(conn);
+ close(session->fd);
+ g_slice_free1(sizeof(*session), session);
+ }
+
+ return 0;
+}
+
+static void
+rspamd_server_accept(gint fd, short what, void *arg)
+{
+ struct ev_loop *ev_base = arg;
+ struct rspamd_http_server_session *session;
+ rspamd_inet_addr_t *addr;
+ gint nfd;
+
+ do {
+ if ((nfd =
+ rspamd_accept_from_socket(fd, &addr, NULL)) == -1) {
+ rspamd_fprintf(stderr, "accept failed: %s", strerror(errno));
+ return;
+ }
+ /* Check for EAGAIN */
+ if (nfd == 0) {
+ rspamd_inet_address_free(addr);
+ return;
+ }
+
+ rspamd_inet_address_free(addr);
+ session = g_slice_alloc(sizeof(*session));
+ session->conn = rspamd_http_connection_new(NULL,
+ rspamd_server_error,
+ rspamd_server_finish,
+ 0,
+ RSPAMD_HTTP_SERVER,
+ c,
+ NULL);
+ rspamd_http_connection_set_key(session->conn, server_key);
+ rspamd_http_connection_read_message(session->conn,
+ session,
+ nfd,
+ &io_tv,
+ ev_base);
+ session->reply = FALSE;
+ session->fd = nfd;
+ session->ev_base = ev_base;
+ } while (nfd > 0);
+}
+
+static void
+rspamd_http_term_handler(gint fd, short what, void *arg)
+{
+ struct ev_loop *ev_base = arg;
+ struct timeval tv = {0, 0};
+
+ event_base_loopexit(ev_base, &tv);
+}
+
+static void
+rspamd_http_server_func(gint fd, rspamd_inet_addr_t *addr)
+{
+ struct ev_loop *ev_base = event_init();
+ struct event accept_ev, term_ev;
+
+ event_set(&accept_ev, fd, EV_READ | EV_PERSIST, rspamd_server_accept, ev_base);
+ event_base_set(ev_base, &accept_ev);
+ event_add(&accept_ev, NULL);
+
+ evsignal_set(&term_ev, SIGTERM, rspamd_http_term_handler, ev_base);
+ event_base_set(ev_base, &term_ev);
+ event_add(&term_ev, NULL);
+
+ event_base_loop(ev_base, 0);
+}
+
+static void
+rspamd_http_start_servers(pid_t *sfd, rspamd_inet_addr_t *addr)
+{
+ guint i;
+ gint fd;
+
+ fd = rspamd_inet_address_listen(addr, SOCK_STREAM, TRUE);
+ g_assert(fd != -1);
+
+ for (i = 0; i < nworkers; i++) {
+ sfd[i] = fork();
+ g_assert(sfd[i] != -1);
+
+ if (sfd[i] == 0) {
+ rspamd_http_server_func(fd, addr);
+ exit(EXIT_SUCCESS);
+ }
+ }
+
+ close(fd);
+}
+
+static void
+rspamd_http_stop_servers(pid_t *sfd)
+{
+ guint i;
+ gint res;
+
+ for (i = 0; i < nworkers; i++) {
+ kill(sfd[i], SIGTERM);
+ wait(&res);
+ }
+}
+
+static void
+rspamd_http_server_term(int fd, short what, void *arg)
+{
+ pid_t *sfd = arg;
+
+ rspamd_http_stop_servers(sfd);
+ event_loopexit(NULL);
+}
+
+int main(int argc, gchar **argv)
+{
+ GOptionContext *context;
+ GError *error = NULL;
+ struct ev_loop *ev_base;
+ GString *b32_key;
+ pid_t *sfd;
+ rspamd_inet_addr_t *addr;
+ struct event term_ev, int_ev;
+ struct in_addr ina = {INADDR_ANY};
+
+ rspamd_init_libs();
+
+ context = g_option_context_new(
+ "rspamd-http-server - test server for benchmarks");
+ g_option_context_set_summary(context,
+ "Summary:\n Rspamd test HTTP server " RVERSION
+ "\n Release id: " RID);
+ g_option_context_add_main_entries(context, entries, NULL);
+
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ rspamd_fprintf(stderr, "option parsing failed: %s\n", error->message);
+ g_error_free(error);
+ exit(EXIT_FAILURE);
+ }
+
+ maps = g_hash_table_new(g_int_hash, g_int_equal);
+
+ if (key == NULL) {
+ server_key = rspamd_keypair_new(RSPAMD_KEYPAIR_KEX,
+ openssl_mode ? RSPAMD_CRYPTOBOX_MODE_NIST : RSPAMD_CRYPTOBOX_MODE_25519);
+ b32_key = rspamd_keypair_print(server_key,
+ RSPAMD_KEYPAIR_PUBKEY | RSPAMD_KEYPAIR_BASE32);
+ rspamd_printf("key: %v\n", b32_key);
+ }
+ else {
+ /* TODO: add key loading */
+ }
+
+ if (cache_size > 0) {
+ c = rspamd_keypair_cache_new(cache_size);
+ }
+
+ sfd = g_alloca(sizeof(*sfd) * nworkers);
+ addr = rspamd_inet_address_new(AF_INET, &ina);
+ rspamd_inet_address_set_port(addr, port);
+ rspamd_http_start_servers(sfd, addr);
+
+ /* Just wait for workers */
+ ev_base = event_init();
+
+ event_set(&term_ev, SIGTERM, EV_SIGNAL, rspamd_http_server_term, sfd);
+ event_base_set(ev_base, &term_ev);
+ event_add(&term_ev, NULL);
+ event_set(&int_ev, SIGINT, EV_SIGNAL, rspamd_http_server_term, sfd);
+ event_base_set(ev_base, &int_ev);
+ event_add(&int_ev, NULL);
+
+ event_base_loop(ev_base, 0);
+
+ return 0;
+}
diff --git a/utils/rspamd_stats.pl b/utils/rspamd_stats.pl
new file mode 100755
index 0000000..9c5f2ac
--- /dev/null
+++ b/utils/rspamd_stats.pl
@@ -0,0 +1,1018 @@
+#!/usr/bin/env perl
+
+use 5.010;
+use Data::Dumper;
+use Getopt::Long;
+use Pod::Usage;
+use Time::Local;
+use IO::Handle;
+use warnings;
+use strict;
+
+my @symbols_search;
+my @symbols_exclude;
+my @symbols_bidirectional;
+my @symbols_groups;
+my @symbols_ignored;
+my %symbols_mult;
+my %groups;
+my $reject_score = 15.0;
+my $junk_score = 6.0;
+my $diff_alpha = 0.1;
+my $correlations = 0;
+my $nrelated = 10;
+my $log_file = "";
+my $search_pattern = "";
+my $startTime = "";
+my $endTime;
+my $num_logs;
+my $exclude_logs = 0;
+my $man = 0;
+my $json = 0;
+my $help = 0;
+
+# Associate file extensions with decompressors
+my %decompressor = (
+ 'bz2' => 'bzip2 -cd',
+ 'gz' => 'gzip -cd',
+ 'xz' => 'xz -cd',
+ 'zst' => 'zstd -cd',
+);
+
+GetOptions(
+ "reject-score|r=f" => \$reject_score,
+ "junk-score|j=f" => \$junk_score,
+ "symbol|s=s@" => \@symbols_search,
+ "symbol-bidir|S=s@" => \@symbols_bidirectional,
+ "exclude|X=s@" => \@symbols_exclude,
+ "ignore=s@" => \@symbols_ignored,
+ "group|g=s@" => \@symbols_groups,
+ "log|l=s" => \$log_file,
+ "mult=s" => \%symbols_mult,
+ "alpha-score|alpha|a=f" => \$diff_alpha,
+ "correlations|c" => \$correlations,
+ "nrelated=i" => \$nrelated,
+ "search-pattern=s" => \$search_pattern,
+ "start=s" => \$startTime,
+ "end=s" => \$endTime,
+ "num-logs|n=i" => \$num_logs,
+ "exclude-logs|x=i" => \$exclude_logs,
+ "json|j" => \$json,
+ "help|?" => \$help,
+ "man" => \$man
+) or pod2usage(2);
+
+pod2usage(1) if $help;
+pod2usage( -exitval => 0, -verbose => 2 ) if $man;
+
+# Global vars
+my $total = 0;
+my $total_spam = 0;
+my $total_junk = 0;
+my $junk_symbols = 0;
+my $spam_symbols = 0;
+my $ham_symbols = 0;
+my $ham_spam_change = 0;
+my $ham_junk_change = 0;
+my %sym_res;
+my $rspamd_log;
+my $enabled = 0;
+my $log_file_num = 1;
+my $spinner_update_time = 0;
+
+my %action;
+my %timeStamp;
+my %scanTime = (
+ max => 0,
+ total => 0,
+);
+my %bidir_match;
+
+foreach ( $startTime, $endTime ) { $_ = &normalized_time($_) }
+
+# Convert bidirectional symbols
+foreach my $s (@symbols_bidirectional) {
+ $bidir_match{$s} = {
+ spam => "${s}_SPAM",
+ ham => "${s}_HAM",
+ };
+ push @symbols_search, $s unless grep /^$s$/, @symbols_search;
+}
+
+# Deal with groups
+my $group_id = 0;
+foreach my $g (@symbols_groups) {
+ my @symbols = split /,/, $g;
+ my $group_name = "group$group_id";
+
+ foreach my $s (@symbols) {
+ $groups{$s} = $group_name;
+ push @symbols_search, $s unless grep /^$s$/, @symbols_search;
+ }
+}
+
+@symbols_search = '.*'
+ unless @symbols_search;
+
+if ( $log_file eq '-' || $log_file eq '' ) {
+ $rspamd_log = \*STDIN;
+ &ProcessLog();
+}
+elsif ( -d "$log_file" ) {
+ my $log_dir = "$log_file";
+
+ my @logs = &GetLogfilesList($log_dir);
+
+ # Process logs
+ foreach (@logs) {
+ my $ext = (/[^.]+\.?([^.]*?)$/)[0];
+ my $dc = $decompressor{$ext} || 'cat';
+
+ open( $rspamd_log, "-|", "$dc $log_dir/$_" )
+ or die "cannot execute $dc $log_dir/$_ : $!";
+
+ printf { interactive(*STDERR) } "\033[J Parsing log files: [%d/%d] %s\033[G", $log_file_num++, scalar @logs,
+ $_;
+ $spinner_update_time = 0; # Force spinner update
+ &spinner;
+
+ &ProcessLog;
+
+ close($rspamd_log)
+ or warn "cannot close $dc $log_dir/$_: $!";
+ }
+ print { interactive(*STDERR) } "\033[J\033[G"; # Progress indicator clean-up
+}
+else {
+ my $ext = ( $log_file =~ /[^.]+\.?([^.]*?)$/ )[0];
+ my $dc = $decompressor{$ext} || 'cat';
+ open( $rspamd_log, "-|", "$dc $log_file" )
+ or die "cannot execute $dc $log_file : $!";
+ $spinner_update_time = 0; # Force spinner update
+ &spinner;
+ &ProcessLog();
+}
+
+my $total_ham = $total - ( $total_spam + $total_junk );
+
+if ($json) {
+ print "{";
+ &Summary();
+ print '"symbols":{';
+ &SymbolsStat();
+ print "}}\n";
+}
+else {
+ &SymbolsStat();
+ &Summary();
+}
+
+exit;
+
+sub IsIgnored {
+ my ($sym) = @_;
+
+ foreach my $ex (@symbols_ignored) {
+ if ( $sym =~ /^$ex$/ ) {
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+sub GenRelated {
+ my ( $htb, $target_sym ) = @_;
+
+ my @result;
+ my $i = 0;
+ foreach my $sym ( sort { $htb->{$b} <=> $htb->{$a} } keys %{$htb} ) {
+ if ( $sym ne $target_sym ) {
+ my @elt = ( $sym, $htb->{$sym} );
+ push @result, \@elt;
+ $i++;
+ }
+
+ last if $i > $nrelated;
+ }
+
+ return \@result;
+}
+
+sub StringifyRelated {
+ my ( $ar, $total ) = @_;
+ return
+ join( "\n", ( map { sprintf "\t%s(%s: %.1f%%)", $_->[0], $_->[1], $_->[1] / ( $total * 1.0 ) * 100.0 } @{$ar} ) );
+}
+
+sub SymbolsStat {
+ if ( $total > 0 ) {
+ my $has_comma = 0;
+ while ( my ( $s, $r ) = each(%sym_res) ) {
+ if ( $r->{hits} > 0 ) {
+ my $th = $r->{hits};
+ my $sh = $r->{spam_hits};
+ my $jh = $r->{junk_hits};
+ my $hh = $r->{hits} - $sh - $jh;
+ my ( $htp, $stp, $jtp );
+ $htp = $hh * 100.0 / $total_ham if $total_ham != 0;
+ $stp = $sh * 100.0 / $total_spam if $total_spam != 0;
+ $jtp = $jh * 100.0 / $total_junk if $total_junk != 0;
+
+ if ($json) {
+ if ($has_comma) {
+ print ",";
+ }
+ else {
+ $has_comma = 1;
+ }
+ print "\"$s\":{";
+ JsonObjectElt( "avg_weight", $r->{'weight'}, "%.4f" );
+ print ",";
+ JsonObjectElt( "hits", $th, "%d" );
+ print ",";
+ JsonObjectElt( "hits_percentage", $th / $total, "%.4f" );
+ print ",";
+ JsonObjectElt( "spam_hits", $sh, "%d" );
+ print ",";
+ JsonObjectElt( "spam_to_total", $sh / $th, "%.4f" );
+ print ",";
+ JsonObjectElt( "spam_percentage", $stp / 100.0 || 0, "%.4f" );
+ print ",";
+ JsonObjectElt( "ham_hits", $hh, "%d" );
+ print ",";
+ JsonObjectElt( "ham_to_total", $hh / $th, "%.4f" );
+ print ",";
+ JsonObjectElt( "ham_percentage", $htp / 100.0 || 0, "%.4f" );
+ print ",";
+ JsonObjectElt( "junk_hits", $jh, "%d" );
+ print ",";
+ JsonObjectElt( "junk_to_total", $jh / $th, "%.4f" );
+ print ",";
+ JsonObjectElt( "junk_percentage", $jtp / 100.0 || 0, "%.4f" );
+ }
+ else {
+ printf "%s avg. weight %.3f, hits %d(%.3f%%):
+ Ham %7.3f%%, %6d/%-6d (%7.3f%%)
+ Spam %7.3f%%, %6d/%-6d (%7.3f%%)
+ Junk %7.3f%%, %6d/%-6d (%7.3f%%)
+", $s, $r->{weight} / $r->{hits}, $th, ( $th / $total * 100 ),
+ ( $hh / $th * 100 ), $hh, $total_ham, ( $htp or 0 ),
+ ( $sh / $th * 100 ), $sh, $total_spam, ( $stp or 0 ),
+ ( $jh / $th * 100 ), $jh, $total_junk, ( $jtp or 0 );
+ }
+ my ( $schp, $jchp );
+ $schp = $r->{spam_change} / $total_spam * 100.0 if $total_spam;
+ $jchp = $r->{junk_change} / $total_junk * 100.0 if $total_junk;
+
+ if ( $r->{weight} != 0 ) {
+ if ( !$json ) {
+ if ( $r->{weight} > 0 ) {
+ printf "
+Spam changes (ham/junk -> spam): %6d/%-6d (%7.3f%%)
+Spam changes / total spam hits: %6d/%-6d (%7.3f%%)
+Junk changes (ham -> junk): %6d/%-6d (%7.3f%%)
+Junk changes / total junk hits: %6d/%-6d (%7.3f%%)
+",
+ $r->{spam_change}, $th, ( $r->{spam_change} / $th * 100 ),
+ $r->{spam_change}, $total_spam, ( $schp or 0 ),
+ $r->{junk_change}, $th, ( $r->{junk_change} / $th * 100 ),
+ $r->{junk_change}, $total_junk, ( $jchp or 0 );
+ }
+ else {
+ printf "
+Spam changes (spam -> junk/ham): %6d/%-6d (%7.3f%%)
+Spam changes / total spam hits : %6d/%-6d (%7.3f%%)
+Junk changes (junk -> ham) : %6d/%-6d (%7.3f%%)
+Junk changes / total junk hits : %6d/%-6d (%7.3f%%)
+",
+ $r->{spam_change}, $th, ( $r->{spam_change} / $th * 100 ),
+ $r->{spam_change}, $total_spam, ( $schp or 0 ),
+ $r->{junk_change}, $th, ( $r->{junk_change} / $th * 100 ),
+ $r->{junk_change}, $total_junk, ( $jchp or 0 );
+ }
+ }
+ else {
+ print ",";
+ JsonObjectElt( "spam_change", $r->{spam_change}, "%.4f" );
+ print ",";
+ JsonObjectElt( "junk_change", $r->{junk_change}, "%.4f" );
+ }
+ }
+
+ if ($correlations) {
+
+ my $spam_related = GenRelated( $r->{symbols_met_spam}, $s );
+ my $junk_related = GenRelated( $r->{symbols_met_junk}, $s );
+ my $ham_related = GenRelated( $r->{symbols_met_ham}, $s );
+
+ if ( !$json ) {
+ print "Correlations report:\n";
+
+ while ( my ( $cs, $hits ) = each %{ $r->{corr} } ) {
+ my $corr_prob = $r->{'hits'} / $total;
+ my $merged_hits = 0;
+ if ( $r->{symbols_met_spam}->{$cs} ) {
+ $merged_hits += $r->{symbols_met_spam}->{$cs};
+ }
+ if ( $r->{symbols_met_junk}->{$cs} ) {
+ $merged_hits += $r->{symbols_met_junk}->{$cs};
+ }
+ if ( $r->{symbols_met_ham}->{$cs} ) {
+ $merged_hits += $r->{symbols_met_ham}->{$cs};
+ }
+
+ if ( $merged_hits > 0 ) {
+ printf "Probability of %s when %s fires: %.3f\n", $cs, $s,
+ ( ( $merged_hits / $total ) / $corr_prob );
+ }
+ }
+
+ print "Related symbols report:\n";
+ printf "Top related in spam:\n %s\n", StringifyRelated( $spam_related, $r->{spam_hits} );
+ printf "Top related in junk:\n %s\n", StringifyRelated( $junk_related, $r->{junk_hits} );
+ printf "Top related in ham:\n %s\n",
+ StringifyRelated( $ham_related, $r->{hits} - $r->{spam_hits} - $r->{junk_hits} );
+ }
+ else {
+ print ",";
+ print "\"correllations\":{";
+
+ my $has_comma_ = 0;
+ while ( my ( $cs, $hits ) = each %{ $r->{corr} } ) {
+ if ($has_comma_) {
+ print ",";
+ }
+ else {
+ $has_comma_ = 1;
+ }
+ my $corr_prob = $hits / $total;
+ my $sym_prob = $r->{hits} / $total;
+ JsonObjectElt( $cs, ( $corr_prob / $sym_prob ), "%.4f" );
+ }
+
+ print "}";
+ }
+ }
+
+ print "}" if $json;
+ }
+ else {
+ print "Symbol $s has not been met\n" if !$json;
+ }
+
+ print '-' x 80 . "\n" if !$json;
+ }
+ }
+}
+
+sub Summary() {
+ if ( !$json ) {
+ print "
+=== Summary ", '=' x 68, "
+Messages scanned: $total";
+ printf " [ %s / %s ]
+", $timeStamp{'start'}, $timeStamp{'end'}
+ if defined $timeStamp{'start'};
+ say '';
+ printf "%11s: %6.2f%%, %d\n", $_, 100 * $action{$_} / $total, $action{$_} for sort keys %action;
+ say '';
+ printf "scan time min/avg/max = %.2f/%.2f/%.2f s
+", $scanTime{'min'} / 1000, ($total) ? $scanTime{'total'} / $total / 1000 : undef, $scanTime{'max'} / 1000
+ if exists $scanTime{'min'};
+ say '=' x 80;
+ }
+ else {
+ JsonObjectElt( "total", $total, "%d" );
+ print ",";
+
+ if ( defined $timeStamp{'start'} ) {
+ JsonObjectElt( "start", $timeStamp{'start'} );
+ print ",";
+ }
+
+ if ( defined $timeStamp{'end'} ) {
+ JsonObjectElt( "end", $timeStamp{'end'} );
+ print ",";
+ }
+
+ print "\"actions\":{";
+
+ my $has_comma = 0;
+ foreach my $a ( sort keys %action ) {
+ if ($has_comma) {
+ print ",";
+ }
+ else {
+ $has_comma = 1;
+ }
+ JsonObjectElt( $a, $action{$a}, "%d" );
+ }
+ print "},";
+ }
+}
+
+sub ProcessRelated {
+ my ( $symbols, $target, $source ) = @_;
+
+ foreach my $s ( @{$symbols} ) {
+ $s =~ /^([^\(]+)(\(([^\)]+)\))?/;
+ my $sym_name = $1;
+ my $sym_score = 0;
+
+ if ( $groups{$sym_name} ) {
+ $sym_name = $groups{$sym_name};
+ }
+
+ next if ( $source eq $sym_name );
+
+ next if IsIgnored($sym_name);
+
+ if ($2) {
+ $sym_score = $3 * ($symbols_mult{$sym_name} or 1.0);
+
+ if ( abs($sym_score) < $diff_alpha ) {
+ next;
+ }
+
+ my $bm = $bidir_match{$sym_name};
+ if ($bm) {
+ if ( $sym_score >= 0 ) {
+ $sym_name = $bm->{'spam'};
+ }
+ else {
+ $sym_name = $bm->{'ham'};
+ }
+ }
+ }
+
+ if ( exists( $target->{$sym_name} ) ) {
+ $target->{$sym_name}++;
+ }
+ else {
+ $target->{$sym_name} = 1;
+ }
+ }
+}
+
+sub ProcessLog {
+ my ( $ts_format, @line ) = &log_time_format($rspamd_log);
+
+ while () {
+ last if eof $rspamd_log;
+ $_ = (@line) ? shift @line : <$rspamd_log>;
+
+ if ( !$enabled && ( $search_pattern eq "" || /$search_pattern/ ) ) {
+ $enabled = 1;
+ }
+
+ next if !$enabled;
+
+ if (/^.*rspamd_task_write_log.*$/) {
+ &spinner;
+ my $ts;
+ if ( $ts_format eq 'syslog' ) {
+ $ts = syslog2iso( join ' ', ( split /\s+/ )[ 0 .. 2 ] );
+ }
+ elsif ( $ts_format eq 'syslog5424' ) {
+ /^([0-9-]+)T([0-9:]+)/;
+ $ts = "$1 $2";
+ }
+ else {
+ $ts = join ' ', ( split /\s+/ )[ 0 .. 1 ];
+ }
+
+ next if ( $ts lt $startTime );
+ next if ( defined $endTime && $ts gt $endTime );
+
+ if ( $_ !~
+ /\(([^()]+)\): \[(NaN|-?\d+(?:\.\d+)?)\/(-?\d+(?:\.\d+)?)\]\s+\[([^\]]+)\].+? time: (\d+\.\d+)ms/ )
+ {
+ #print "BAD: $_\n";
+ next;
+ }
+
+ my @symbols = split /(?:\{[^}]*\})?(?:$|,)/, $4;
+ my $scan_time = $5;
+ my $act = $1;
+ my $score = $2 * 1.0;
+ my $skip = 0;
+
+ foreach my $ex (@symbols_exclude) {
+ my @found = grep { /^$ex/ } @symbols;
+
+ if ( scalar(@found) > 0 ) {
+ $skip = 1;
+ last;
+ }
+ }
+
+ next if ( $skip != 0 );
+
+ if ( defined( $timeStamp{'end'} ) ) {
+ $timeStamp{'end'} = $ts if ( $ts gt $timeStamp{'end'} );
+ }
+ else {
+ $timeStamp{'end'} = $ts;
+ }
+
+ if ( defined( $timeStamp{'start'} ) ) {
+ $timeStamp{'start'} = $ts if ( $ts lt $timeStamp{'start'} );
+ }
+ else {
+ $timeStamp{'start'} = $ts;
+ }
+
+ $scanTime{'min'} = $scan_time if ( !exists $scanTime{'min'} || $scanTime{'min'} > $scan_time );
+ $scanTime{'max'} = $scan_time if ( $scanTime{'max'} < $scan_time );
+ $scanTime{'total'} += $scan_time;
+
+ $action{$act}++;
+ $total++;
+
+ if ( $score >= $reject_score ) {
+ $total_spam++;
+ }
+ elsif ( $score >= $junk_score ) {
+ $total_junk++;
+ }
+
+ my @sym_names;
+
+ foreach my $s (@symbols_search) {
+ my @selected = grep /$s/, @symbols;
+
+ if ( scalar(@selected) > 0 ) {
+
+ foreach my $sym (@selected) {
+ $sym =~ /^([^\(]+)(\(([^\)]+)\))?/;
+ my $sym_name = $1;
+ my $sym_score = 0;
+ my $orig_name = $sym_name;
+
+ if ($2) {
+ $sym_score = $3 * ($symbols_mult{$sym_name} or 1.0);
+
+ if ( abs($sym_score) < $diff_alpha ) {
+ next;
+ }
+
+ my $bm = $bidir_match{$sym_name};
+ if ($bm) {
+ if ( $sym_score >= 0 ) {
+ $sym_name = $bm->{'spam'};
+ }
+ else {
+ $sym_name = $bm->{'ham'};
+ }
+ }
+ }
+
+ next if $orig_name !~ /^$s/;
+
+ if ( $groups{$s} ) {
+
+ # Replace with group
+ $sym_name = $groups{$s};
+ }
+
+ push @sym_names, $sym_name;
+
+ if ( !$sym_res{$sym_name} ) {
+ $sym_res{$sym_name} = {
+ hits => 0,
+ spam_hits => 0,
+ junk_hits => 0,
+ spam_change => 0,
+ junk_change => 0,
+ weight => 0,
+ corr => {},
+ symbols_met_spam => {},
+ symbols_met_ham => {},
+ symbols_met_junk => {},
+ };
+ }
+
+ my $r = $sym_res{$sym_name};
+
+ $r->{hits}++;
+ $r->{weight} += $sym_score;
+ my $is_spam = 0;
+ my $is_junk = 0;
+
+ if ( $score >= $reject_score ) {
+ $is_spam = 1;
+ $r->{spam_hits}++;
+ if ($correlations) {
+ ProcessRelated( \@symbols, $r->{symbols_met_spam}, $sym_name );
+ }
+ }
+ elsif ( $score >= $junk_score ) {
+ $is_junk = 1;
+ $r->{junk_hits}++;
+ if ($correlations) {
+ ProcessRelated( \@symbols, $r->{symbols_met_junk}, $sym_name );
+ }
+ }
+ else {
+ if ($correlations) {
+ ProcessRelated( \@symbols, $r->{symbols_met_ham}, $sym_name );
+ }
+ }
+
+ if ( $sym_score != 0 ) {
+ my $score_without = $score - $sym_score;
+
+ if ( $sym_score > 0 ) {
+ if ( $is_spam && $score_without < $reject_score ) {
+ $r->{spam_change}++;
+ }
+ if ( $is_junk && $score_without < $junk_score ) {
+ $r->{junk_change}++;
+ }
+ }
+ else {
+ if ( !$is_spam && $score_without >= $reject_score ) {
+ $r->{spam_change}++;
+ }
+ if ( !$is_junk && $score_without >= $junk_score ) {
+ $r->{junk_change}++;
+ }
+ }
+ }
+ } # End foreach symbols selected
+ }
+ }
+
+ if ($correlations) {
+ foreach my $sym (@sym_names) {
+ next if IsIgnored($sym);
+ my $r = $sym_res{$sym};
+
+ foreach my $corr_sym (@sym_names) {
+ if ( $corr_sym ne $sym ) {
+ if ( $r->{'corr'}->{$corr_sym} ) {
+ $r->{'corr'}->{$corr_sym}++;
+ }
+ else {
+ $r->{'corr'}->{$corr_sym} = 1;
+ }
+ }
+ }
+ } # End of correlations check
+ }
+ }
+ }
+}
+
+sub JsonObjectElt() {
+ my ( $k, $v ) = @_;
+ my $f = defined $_[2] ? $_[2] : '%s';
+
+ if ( $f eq "%s" ) {
+ $f = "\"%s\"";
+ }
+
+ printf "\"%s\":$f", $k, $v;
+}
+
+sub GetLogfilesList {
+ my ($dir) = @_;
+ opendir( my $fh, $dir ) or die $!;
+
+ my $pattern = join( '|', keys %decompressor );
+ my $re = qr/\.[0-9]+(?:\.(?:$pattern))?/;
+
+ # Add unnumbered logs first
+ my @logs =
+ grep { -f "$dir/$_" && !/$re/ } readdir($fh);
+
+ # Add numbered logs
+ rewinddir($fh);
+ push( @logs, ( sort numeric ( grep { -f "$dir/$_" && /$re/ } readdir($fh) ) ) );
+
+ closedir($fh);
+
+ # Select required logs and revers their order
+ @logs =
+ reverse splice( @logs, $exclude_logs, $num_logs ||= @logs - $exclude_logs );
+
+ # Loop through array printing out filenames
+ print { interactive(*STDERR) } "\nLog files to process:\n";
+ foreach my $file (@logs) {
+ print { interactive(*STDERR) } " $file\n";
+ }
+ print { interactive(*STDERR) } "\n";
+
+ return @logs;
+}
+
+sub log_time_format {
+ my $fh = shift;
+ my ( $format, $line );
+ while (<$fh>) {
+ $line = $_;
+
+ # 2017-08-08 00:00:01 #66984(
+ # 2017-08-08 00:00:01.001 #66984(
+ if (/^\d{4}-\d\d-\d\d \d\d:\d\d:\d\d(\.\d{3,5})? #\d+\(/) {
+ $format = 'rspamd';
+ last;
+ }
+
+ # Aug 8 00:02:50 #66986(
+ elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d #\d+\(/) {
+ $format = 'syslog';
+ last;
+ }
+
+ # Aug 8 00:02:50 hostname rspamd[66986]
+ elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d \S+ rspamd\[\d+\]/) {
+ $format = 'syslog';
+ last;
+ }
+
+ # 2018-04-16T06:25:46.012590+02:00 rspamd rspamd[12968]
+ elsif (/\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,6})?(Z|[-+]\d{2}:\d{2}) \S+ rspamd\[\d+\]/) {
+ $format = 'syslog5424';
+ last;
+ }
+
+ # Skip newsyslog messages
+ # Aug 8 00:00:00 hostname newsyslog[63284]: logfile turned over
+ elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d\ \S+ newsyslog\[\d+\]: logfile turned over$/) {
+ next;
+ }
+
+ # Skip journalctl messages
+ # -- Logs begin at Mon 2018-01-15 11:16:24 MSK, end at Fri 2018-04-27 09:10:30 MSK. --
+ elsif (
+/^-- Logs begin at \w{3} \d{4}-\d\d-\d\d \d\d:\d\d:\d\d [A-Z]{3}, end at \w{3} \d{4}-\d\d-\d\d \d\d:\d\d:\d\d [A-Z]{3}\. --$/
+ )
+ {
+ next;
+ }
+ else {
+ print "Unknown log format\n";
+ exit 1;
+ }
+ }
+ return ( $format, $line );
+}
+
+sub normalized_time {
+ return
+ if !defined( $_ = shift );
+
+ /^\d\d(?::\d\d){0,2}$/
+ ? sprintf '%04d-%02d-%02d %s', 1900 + (localtime)[5], 1 + (localtime)[4], (localtime)[3], $_
+ : $_;
+}
+
+sub numeric {
+ $a =~ /\.(\d+)\./;
+ my $a_num = $1;
+ $b =~ /\.(\d+)\./;
+ my $b_num = $1;
+
+ $a_num <=> $b_num;
+}
+
+sub spinner {
+ my @spinner = qw{/ - \ |};
+ return
+ if ( ( time - $spinner_update_time ) < 1 );
+ $spinner_update_time = time;
+ printf { interactive(*STDERR) } "%s\r", $spinner[ $spinner_update_time % @spinner ];
+ select()->flush();
+}
+
+# Convert syslog timestamp to "ISO 8601 like" format
+# using current year as syslog does not record the year (nor the timezone)
+# or the last year if the guessed time is in the future.
+sub syslog2iso {
+ my %month_map;
+ @month_map{qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)} = 0 .. 11;
+
+ my ( $month, @t ) = $_[0] =~ m/^(\w{3}) \s\s? (\d\d?) \s (\d\d):(\d\d):(\d\d)/x;
+ my $epoch =
+ timelocal( ( reverse @t ), $month_map{$month}, 1900 + (localtime)[5] );
+ sprintf '%04d-%02d-%02d %02d:%02d:%02d', 1900 + (localtime)[5] - ( $epoch > time ), $month_map{$month} + 1, @t;
+}
+
+### Imported from IO::Interactive 1.022 Perl module
+sub is_interactive {
+ ## no critic (ProhibitInteractiveTest)
+
+ my ($out_handle) = ( @_, select ); # Default to default output handle
+
+ # Not interactive if output is not to terminal...
+ return 0 if not -t $out_handle;
+
+ # If *ARGV is opened, we're interactive if...
+ if ( tied(*ARGV) or defined( fileno(ARGV) ) ) { # this is what 'Scalar::Util::openhandle *ARGV' boils down to
+
+ # ...it's currently opened to the magic '-' file
+ return -t *STDIN if defined $ARGV && $ARGV eq '-';
+
+ # ...it's at end-of-file and the next file is the magic '-' file
+ return @ARGV > 0 && $ARGV[0] eq '-' && -t *STDIN if eof *ARGV;
+
+ # ...it's directly attached to the terminal
+ return -t *ARGV;
+ }
+
+ # If *ARGV isn't opened, it will be interactive if *STDIN is attached
+ # to a terminal.
+ else {
+ return -t *STDIN;
+ }
+}
+
+### Imported from IO::Interactive 1.022 Perl module
+local ( *DEV_NULL, *DEV_NULL2 );
+my $dev_null;
+
+BEGIN {
+ pipe *DEV_NULL, *DEV_NULL2
+ or die "Internal error: can't create null filehandle";
+ $dev_null = \*DEV_NULL;
+}
+
+### Imported from IO::Interactive 1.022 Perl module
+sub interactive {
+ my ($out_handle) = ( @_, \*STDOUT ); # Default to STDOUT
+ return &is_interactive ? $out_handle : $dev_null;
+}
+
+__END__
+
+=head1 NAME
+
+rspamd_stats - analyze Rspamd rules by parsing log files
+
+=head1 SYNOPSIS
+
+rspamd_stats [options] [--symbol=SYM1 [--symbol=SYM2...]] [--log file]
+
+ Options:
+ --log=file log file or directory to read (stdin by default)
+ --reject-score=score set reject threshold (15 by default)
+ --junk-score=score set junk score (6.0 by default)
+ --symbol=sym check specified symbol (perl regexps, '.*' by default)
+ --alpha-score=score set ignore score for symbols (0.1 by default)
+ --correlations enable correlations report
+ --nrelated=integer show that amount of related symbols (10 by default)
+ --search-pattern do not process input unless the desired pattern is found
+ --start starting time (oldest) for log parsing
+ --end ending time (newest) for log parsing
+ --num-logs=integer number of recent logfiles to analyze (all files in the directory by default)
+ --exclude-logs=integer number of latest logs to exclude (0 by default)
+ --json print json output instead of human readable
+ --help brief help message
+ --mult=sym=number multiply symbol score
+ --man full documentation
+
+=head1 OPTIONS
+
+=over 8
+
+=item B<--log>
+
+Specifies log file or directory to read data from. If a directory is specified B<rspamd_stats> analyses files in the
+directory including known compressed file types. Number of log files can be limited using B<--num-logs> and
+B<--exclude-logs> options. This assumes that files in the log directory have B<newsyslog(8)>- or B<logrotate(8)>-like
+name format with numeric indexes. Files without indexes (generally it is merely one file) are considered the most
+recent and files with lower indexes are considered newer.
+
+=item B<--reject-score>
+
+Specifies the reject (spam) threshold.
+
+=item B<--junk-score>
+
+Specifies the junk (add header or rewrite subject) threshold.
+
+=item B<--alpha-score>
+
+Specifies the minimum score for a symbol to be considered by this script.
+
+=item B<--symbol>
+
+Add symbol or pattern (pcre format) to analyze.
+
+=item B<--num-logs>
+
+If set, limits number of analyzed logfiles in the directory to the specified value.
+
+=item B<--exclude-logs>
+
+Number of latest logs to exclude (0 by default).
+
+=item B<--correlations>
+
+Additionally print correlation rate for each symbol displayed. This routine calculates merely paired correlations
+between symbols.
+
+=item B<--search-pattern>
+
+Do not process input unless finding the specified regular expression. Useful to skip logs to a certain position.
+
+=item B<--exclude>
+
+Exclude log lines if certain symbols are fired (e.g. GTUBE). You may specify this option multiple time to skip multiple
+symbols.
+
+=item B<--start>
+
+Select log entries after this time. Format: C<YYYY-MM-DD HH:MM:SS> (can be truncated to any desired accuracy). If used
+with B<--end> select entries between B<--start> and B<--end>. The omitted date defaults to the current date if you
+supply the time.
+
+=item B<--end>
+
+Select log entries before this time. Format: C<YYYY-MM-DD HH:MM:SS> (can be truncated to any desired accuracy). If used
+with B<--start> select entries between B<--start> and B<--end>. The omitted date defaults to the current date if you
+supply the time.
+
+=item B<--mult=symbol=number>
+
+Multiplies score for the named symbol by the provided multiplier.
+
+=item B<--help>
+
+Print a brief help message and exits.
+
+=item B<--man>
+
+Prints the manual page and exits.
+
+=back
+
+=head1 DESCRIPTION
+
+B<rspamd_stats> will read the given log file (or standard input) and provide statistics for the specified symbols:
+
+ Symbol: BAYES_SPAM (weight 3.763) (381985 hits, 26.827%)
+ Ham hits: 184557 (48.315%), total ham: 1095487 (ham with BAYES_SPAM: 16.847%)
+ Spam hits: 15134 (3.962%), total spam: 16688 (spam with BAYES_SPAM: 90.688%)
+ Junk hits: 182294 (47.723%), total junk: 311699 (junk with BAYES_SPAM: 58.484%)
+ Spam changes (ham/junk -> spam): 7026 (1.839%), total percentage (changes / spam hits): 42.102%
+ Junk changes (ham -> junk): 95192 (24.920%), total percentage (changes / junk hits): 30.540%
+
+Where there are the following attributes:
+
+=over 4
+
+=item *
+
+B<Weight>: average score for a symbols
+
+=item *
+
+B<Total hits>: total number of hits and percentage of symbol hits divided by total number of messages
+
+=item *
+
+B<HAM hits>: provides the following information about B<HAM> messages with the specified symbol (from left to right):
+
+=over 4
+
+=item 1.
+
+B<total symbol hits>: number of messages that has this symbol and are B<HAM>
+
+=item 2.
+
+B<ham percentage>: number of symbol hits divided by overall B<HAM> messages count
+
+=item 3.
+
+B<total ham hits>: overall number of B<HAM> messages
+
+=item 4.
+
+B<ham with symbol percentage>: percentage of number of hits with specified symbol in B<HAM> messages divided by total
+number of B<HAM> messages.
+
+=back
+
+=item *
+
+B<SPAM hits>: provides the following information about B<SPAM> messages - same as previous but for B<SPAM> class.
+
+=item *
+
+B<Junk hits>: provides the following information about B<Junk> messages - same as previous but for B<JUNK> class.
+
+=item *
+
+B<Spam changes>: displays data about how much messages switched their class because of the specific symbol weight.
+
+=item *
+
+B<Junk changes>: displays data about how much messages switched their class because of the specific symbol weight.
+
+=back
+
+=cut
diff --git a/utils/sa_trivial_convert.lua b/utils/sa_trivial_convert.lua
new file mode 100644
index 0000000..2ea53be
--- /dev/null
+++ b/utils/sa_trivial_convert.lua
@@ -0,0 +1,443 @@
+local fun = require "fun"
+local rspamd_logger = require "rspamd_logger"
+local util = require "rspamd_util"
+local lua_util = require "lua_util"
+local rspamd_regexp = require "rspamd_regexp"
+local ucl = require "ucl"
+
+local complicated = {}
+local rules = {}
+local scores = {}
+
+local function words_to_re(words, start)
+ return table.concat(fun.totable(fun.drop_n(start, words)), " ");
+end
+
+local function split(str, delim)
+ local result = {}
+
+ if not delim then
+ delim = '[^%s]+'
+ end
+
+ for token in string.gmatch(str, delim) do
+ table.insert(result, token)
+ end
+
+ return result
+end
+
+local function handle_header_def(hline, cur_rule)
+ --Now check for modifiers inside header's name
+ local hdrs = split(hline, '[^|]+')
+ local hdr_params = {}
+ local cur_param = {}
+ -- Check if an re is an ordinary re
+ local ordinary = true
+
+ for _,h in ipairs(hdrs) do
+ if h == 'ALL' or h == 'ALL:raw' then
+ ordinary = false
+ else
+ local args = split(h, '[^:]+')
+ cur_param['strong'] = false
+ cur_param['raw'] = false
+ cur_param['header'] = args[1]
+
+ if args[2] then
+ -- We have some ops that are required for the header, so it's not ordinary
+ ordinary = false
+ end
+
+ fun.each(function(func)
+ if func == 'addr' then
+ cur_param['function'] = function(str)
+ local addr_parsed = util.parse_mail_address(str)
+ local ret = {}
+ if addr_parsed then
+ for _,elt in ipairs(addr_parsed) do
+ if elt['addr'] then
+ table.insert(ret, elt['addr'])
+ end
+ end
+ end
+
+ return ret
+ end
+ elseif func == 'name' then
+ cur_param['function'] = function(str)
+ local addr_parsed = util.parse_mail_address(str)
+ local ret = {}
+ if addr_parsed then
+ for _,elt in ipairs(addr_parsed) do
+ if elt['name'] then
+ table.insert(ret, elt['name'])
+ end
+ end
+ end
+
+ return ret
+ end
+ elseif func == 'raw' then
+ cur_param['raw'] = true
+ elseif func == 'case' then
+ cur_param['strong'] = true
+ else
+ rspamd_logger.warnx(rspamd_config, 'Function %1 is not supported in %2',
+ func, cur_rule['symbol'])
+ end
+ end, fun.tail(args))
+
+ -- Some header rules require splitting to check of multiple headers
+ if cur_param['header'] == 'MESSAGEID' then
+ -- Special case for spamassassin
+ ordinary = false
+ elseif cur_param['header'] == 'ToCc' then
+ ordinary = false
+ else
+ table.insert(hdr_params, cur_param)
+ end
+ end
+
+ cur_rule['ordinary'] = ordinary and #hdr_params <= 1
+ cur_rule['header'] = hdr_params
+ end
+end
+
+local function process_sa_conf(f)
+ local cur_rule = {}
+ local valid_rule = false
+
+ local function insert_cur_rule()
+ if not rules[cur_rule.type] then
+ rules[cur_rule.type] = {}
+ end
+
+ local target = rules[cur_rule.type]
+
+ if cur_rule.type == 'header' then
+ if not cur_rule.header[1].header then
+ rspamd_logger.errx(rspamd_config, 'bad rule definition: %1', cur_rule)
+ return
+ end
+ if not target[cur_rule.header[1].header] then
+ target[cur_rule.header[1].header] = {}
+ end
+ target = target[cur_rule.header[1].header]
+ end
+
+ if not cur_rule['symbol'] then
+ rspamd_logger.errx(rspamd_config, 'bad rule definition: %1', cur_rule)
+ return
+ end
+ target[cur_rule['symbol']] = cur_rule
+ cur_rule = {}
+ valid_rule = false
+ end
+
+ local function parse_score(words)
+ if #words == 3 then
+ -- score rule <x>
+ return tonumber(words[3])
+ elseif #words == 6 then
+ -- score rule <x1> <x2> <x3> <x4>
+ -- we assume here that bayes and network are enabled and select <x4>
+ return tonumber(words[6])
+ else
+ rspamd_logger.errx(rspamd_config, 'invalid score for %1', words[2])
+ end
+
+ return 0
+ end
+
+ local skip_to_endif = false
+ local if_nested = 0
+ for l in f:lines() do
+ (function ()
+ l = lua_util.rspamd_str_trim(l)
+ -- Replace bla=~/re/ with bla =~ /re/ (#2372)
+ l = l:gsub('([^%s])%s*([=!]~)%s*([^%s])', '%1 %2 %3')
+
+ if string.len(l) == 0 or string.sub(l, 1, 1) == '#' then
+ return
+ end
+
+ -- Unbalanced if/endif
+ if if_nested < 0 then if_nested = 0 end
+ if skip_to_endif then
+ if string.match(l, '^endif') then
+ if_nested = if_nested - 1
+
+ if if_nested == 0 then
+ skip_to_endif = false
+ end
+ elseif string.match(l, '^if') then
+ if_nested = if_nested + 1
+ elseif string.match(l, '^else') then
+ -- Else counterpart for if
+ skip_to_endif = false
+ end
+ table.insert(complicated, l)
+ return
+ else
+ if string.match(l, '^ifplugin') then
+ skip_to_endif = true
+ if_nested = if_nested + 1
+ table.insert(complicated, l)
+ elseif string.match(l, '^if !plugin%(') then
+ skip_to_endif = true
+ if_nested = if_nested + 1
+ table.insert(complicated, l)
+ elseif string.match(l, '^if') then
+ -- Unknown if
+ skip_to_endif = true
+ if_nested = if_nested + 1
+ table.insert(complicated, l)
+ elseif string.match(l, '^else') then
+ -- Else counterpart for if
+ skip_to_endif = true
+ table.insert(complicated, l)
+ elseif string.match(l, '^endif') then
+ if_nested = if_nested - 1
+ table.insert(complicated, l)
+ end
+ end
+
+ -- Skip comments
+ local words = fun.totable(fun.take_while(
+ function(w) return string.sub(w, 1, 1) ~= '#' end,
+ fun.filter(function(w)
+ return w ~= "" end,
+ fun.iter(split(l)))))
+
+ if words[1] == "header" then
+ -- header SYMBOL Header ~= /regexp/
+ if valid_rule then
+ insert_cur_rule()
+ end
+ if words[4] and (words[4] == '=~' or words[4] == '!~') then
+ cur_rule['type'] = 'header'
+ cur_rule['symbol'] = words[2]
+
+ if words[4] == '!~' then
+ table.insert(complicated, l)
+ return
+ end
+
+ cur_rule['re_expr'] = words_to_re(words, 4)
+ local unset_comp = string.find(cur_rule['re_expr'], '%s+%[if%-unset:')
+ if unset_comp then
+ table.insert(complicated, l)
+ return
+ end
+
+ cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr'])
+
+ if not cur_rule['re'] then
+ rspamd_logger.warnx(rspamd_config, "Cannot parse regexp '%1' for %2",
+ cur_rule['re_expr'], cur_rule['symbol'])
+ table.insert(complicated, l)
+ return
+ else
+ handle_header_def(words[3], cur_rule)
+ if not cur_rule['ordinary'] then
+ table.insert(complicated, l)
+ return
+ end
+ end
+
+ valid_rule = true
+ else
+ table.insert(complicated, l)
+ return
+ end
+ elseif words[1] == "body" then
+ -- body SYMBOL /regexp/
+ if valid_rule then
+ insert_cur_rule()
+ end
+
+ cur_rule['symbol'] = words[2]
+ if words[3] and (string.sub(words[3], 1, 1) == '/'
+ or string.sub(words[3], 1, 1) == 'm') then
+ cur_rule['type'] = 'sabody'
+ cur_rule['re_expr'] = words_to_re(words, 2)
+ cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr'])
+ if cur_rule['re'] then
+
+ valid_rule = true
+ end
+ else
+ -- might be function
+ table.insert(complicated, l)
+ return
+ end
+ elseif words[1] == "rawbody" then
+ -- body SYMBOL /regexp/
+ if valid_rule then
+ insert_cur_rule()
+ end
+
+ cur_rule['symbol'] = words[2]
+ if words[3] and (string.sub(words[3], 1, 1) == '/'
+ or string.sub(words[3], 1, 1) == 'm') then
+ cur_rule['type'] = 'sarawbody'
+ cur_rule['re_expr'] = words_to_re(words, 2)
+ cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr'])
+ if cur_rule['re'] then
+ valid_rule = true
+ end
+ else
+ table.insert(complicated, l)
+ return
+ end
+ elseif words[1] == "full" then
+ -- body SYMBOL /regexp/
+ if valid_rule then
+ insert_cur_rule()
+ end
+
+ cur_rule['symbol'] = words[2]
+
+ if words[3] and (string.sub(words[3], 1, 1) == '/'
+ or string.sub(words[3], 1, 1) == 'm') then
+ cur_rule['type'] = 'message'
+ cur_rule['re_expr'] = words_to_re(words, 2)
+ cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr'])
+ cur_rule['raw'] = true
+ if cur_rule['re'] then
+ valid_rule = true
+ end
+ else
+ table.insert(complicated, l)
+ return
+ end
+ elseif words[1] == "uri" then
+ -- uri SYMBOL /regexp/
+ if valid_rule then
+ insert_cur_rule()
+ end
+ cur_rule['type'] = 'uri'
+ cur_rule['symbol'] = words[2]
+ cur_rule['re_expr'] = words_to_re(words, 2)
+ cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr'])
+ if cur_rule['re'] and cur_rule['symbol'] then
+ valid_rule = true
+ else
+ table.insert(complicated, l)
+ return
+ end
+ elseif words[1] == "meta" then
+ -- meta SYMBOL expression
+ if valid_rule then
+ insert_cur_rule()
+ end
+ table.insert(complicated, l)
+ return
+ elseif words[1] == "describe" and valid_rule then
+ cur_rule['description'] = words_to_re(words, 2)
+ elseif words[1] == "score" then
+ scores[words[2]] = parse_score(words)
+ else
+ table.insert(complicated, l)
+ return
+ end
+ end)()
+ end
+ if valid_rule then
+ insert_cur_rule()
+ end
+end
+
+for _,matched in ipairs(arg) do
+ local f = io.open(matched, "r")
+ if f then
+ rspamd_logger.messagex(rspamd_config, 'loading SA rules from %s', matched)
+ process_sa_conf(f)
+ else
+ rspamd_logger.errx(rspamd_config, "cannot open %1", matched)
+ end
+end
+
+local multimap_conf = {}
+
+local function handle_rule(what, syms, hdr)
+ local mtype
+ local filter
+ local fname
+ local header
+ local sym = what:upper()
+ if what == 'sabody' then
+ mtype = 'content'
+ fname = 'body_re.map'
+ filter = 'oneline'
+ elseif what == 'sarawbody' then
+ fname = 'raw_body_re.map'
+ mtype = 'content'
+ filter = 'rawtext'
+ elseif what == 'full' then
+ fname = 'full_re.map'
+ mtype = 'content'
+ filter = 'full'
+ elseif what == 'uri' then
+ fname = 'uri_re.map'
+ mtype = 'url'
+ filter = 'full'
+ elseif what == 'header' then
+ fname = ('hdr_' .. hdr .. '_re.map'):lower()
+ mtype = 'header'
+ header = hdr
+ sym = sym .. '_' .. hdr:upper()
+ else
+ rspamd_logger.errx('unknown type: %s', what)
+ return
+ end
+ local conf = {
+ type = mtype,
+ filter = filter,
+ symbol = 'SA_MAP_AUTO_' .. sym,
+ regexp = true,
+ map = fname,
+ header = header,
+ symbols = {}
+ }
+ local re_file = io.open(fname, 'w')
+
+ for k,r in pairs(syms) do
+ local score = 0.0
+ if scores[k] then
+ score = scores[k]
+ end
+ re_file:write(string.format('/%s/ %s:%f\n', tostring(r.re), k, score))
+ table.insert(conf.symbols, k)
+ end
+
+ re_file:close()
+
+ multimap_conf[sym:lower()] = conf
+ rspamd_logger.messagex('stored %s regexp in %s', sym:lower(), fname)
+end
+
+for k,v in pairs(rules) do
+ if k == 'header' then
+ for h,r in pairs(v) do
+ handle_rule(k, r, h)
+ end
+ else
+ handle_rule(k, v)
+ end
+end
+
+local out = ucl.to_format(multimap_conf, 'ucl')
+local mmap_conf = io.open('auto_multimap.conf', 'w')
+mmap_conf:write(out)
+mmap_conf:close()
+rspamd_logger.messagex('stored multimap conf in %s', 'auto_multimap.conf')
+
+local sa_remain = io.open('auto_sa.conf', 'w')
+fun.each(function(l)
+ sa_remain:write(l)
+ sa_remain:write('\n')
+end, fun.filter(function(l) return not string.match(l, '^%s+$') end, complicated))
+sa_remain:close()
+rspamd_logger.messagex('stored sa remains conf in %s', 'auto_sa.conf')