summaryrefslogtreecommitdiffstats
path: root/tests/mail_to_db.pl
diff options
context:
space:
mode:
Diffstat (limited to 'tests/mail_to_db.pl')
-rwxr-xr-xtests/mail_to_db.pl624
1 files changed, 624 insertions, 0 deletions
diff --git a/tests/mail_to_db.pl b/tests/mail_to_db.pl
new file mode 100755
index 00000000..b160582d
--- /dev/null
+++ b/tests/mail_to_db.pl
@@ -0,0 +1,624 @@
+#!/usr/bin/env perl
+# Copyright Abandoned 1998 TCX DataKonsult AB & Monty Program KB & Detron HB
+# This file is public domain and comes with NO WARRANTY of any kind
+#
+# This program is brought to you by Janne-Petteri Koilo with the
+# administration of Michael Widenius.
+#
+# Rewritten with a lot of bug fixes by Jani Tolonen and Thimble Smith
+# 15.12.2000
+#
+# This program takes your mails and puts them into your database. It ignores
+# messages with the same from, date and message text.
+# You can use mail-files that are compressed or gzipped and ends with
+# -.gz or -.Z.
+
+use DBI;
+use Getopt::Long;
+
+$| = 1;
+$VER = "3.0";
+
+$opt_help = 0;
+$opt_version = 0;
+$opt_debug = 0;
+$opt_host = undef();
+$opt_port = undef();
+$opt_socket = undef();
+$opt_db = "mail";
+$opt_user = undef();
+$opt_password = undef();
+$opt_max_mail_size = 65536;
+$opt_create = 0;
+$opt_test = 0;
+$opt_no_path = 0;
+$opt_stop_on_error = 0;
+$opt_stdin = 0;
+
+my ($dbh, $progname, $mail_no_from_f, $mail_no_txt_f, $mail_too_big,
+ $mail_forwarded, $mail_duplicates, $mail_no_subject_f, $mail_inserted);
+
+$mail_no_from_f = $mail_no_txt_f = $mail_too_big = $mail_forwarded =
+$mail_duplicates = $mail_no_subject_f = $mail_inserted = 0;
+$mail_fixed=0;
+
+#
+# Remove the following message-ends from message
+#
+@remove_tail= (
+"\n-*\nSend a mail to .*\n.*\n.*\$",
+"\n-*\nPlease check .*\n.*\n\nTo unsubscribe, .*\n.*\n.*\nIf you have a broken.*\n.*\n.*\$",
+"\n-*\nPlease check .*\n(.*\n){1,3}\nTo unsubscribe.*\n.*\n.*\$",
+"\n-*\nPlease check .*\n.*\n\nTo unsubscribe.*\n.*\$",
+"\n-*\nTo request this thread.*\nTo unsubscribe.*\n.*\.*\n.*\$",
+"\n -*\n.*Send a mail to.*\n.*\n.*unsubscribe.*\$",
+"\n-*\nTo request this thread.*\n\nTo unsubscribe.*\n.*\$"
+);
+
+# Generate regexp to remove tails where the unsubscribed is quoted
+{
+ my (@tmp, $tail);
+ @tmp=();
+ foreach $tail (@remove_tail)
+ {
+ $tail =~ s/\n/\n[> ]*/g;
+ push(@tmp, $tail);
+ }
+ push @remove_tail,@tmp;
+}
+
+my %months = ('Jan' => 1, 'Feb' => 2, 'Mar' => 3, 'Apr' => 4, 'May' => 5,
+ 'Jun' => 6, 'Jul' => 7, 'Aug' => 8, 'Sep' => 9, 'Oct' => 10,
+ 'Nov' => 11, 'Dec' => 12);
+
+$progname = $0;
+$progname =~ s/.*[\/]//;
+
+main();
+
+####
+#### main sub routine
+####
+
+sub main
+{
+ my ($connect_arg, @args, $ignored, @defops, $i);
+
+ if (defined(my_which("my_print_defaults")))
+ {
+ @defops = `my_print_defaults mail_to_db`;
+ chop @defops;
+ splice @ARGV, 0, 0, @defops;
+ }
+ else
+ {
+ print "WARNING: No command 'my_print_defaults' found; unable to read\n";
+ print "the my.cnf file. This command is available from the latest MySQL\n";
+ print "distribution.\n";
+ }
+ GetOptions("help","version","host=s","port=i","socket=s","db=s",
+ "user=s","password=s","max_mail_size=i","create","test",
+ "no_path","debug","stop_on_error","stdin")
+ || die "Wrong option! See $progname --help\n";
+
+ usage($VER) if ($opt_help || $opt_version ||
+ (!$ARGV[0] && !$opt_create && !$opt_stdin));
+
+ # Check that the given inbox files exist and are regular files
+ for ($i = 0; ! $opt_stdin && defined($ARGV[$i]); $i++)
+ {
+ die "FATAL: Can't find inbox file: $ARGV[$i]\n" if (! -f $ARGV[$i]);
+ }
+
+ $connect_arg = "DBI:MariaDB:";
+ push @args, "database=$opt_db" if defined($opt_db);
+ push @args, "host=$opt_host" if defined($opt_host);
+ push @args, "port=$opt_port" if defined($opt_port);
+ push @args, "mariadb_socket=$opt_socket" if defined($opt_socket);
+ push @args, "mariadb_read_default_group=mail_to_db";
+ $connect_arg .= join ';', @args;
+ $dbh = DBI->connect("$connect_arg", $opt_user, $opt_password,
+ { PrintError => 0})
+ || die "Couldn't connect: $DBI::errstr\n";
+
+ die "You must specify the database; use --db=" if (!defined($opt_db));
+
+ create_table($dbh) if ($opt_create);
+
+ if ($opt_stdin)
+ {
+ open(FILE, "-");
+ process_mail_file($dbh, "READ-FROM-STDIN");
+ }
+ else
+ {
+ foreach (@ARGV)
+ {
+ # Check if the file is compressed
+ if (/^(.*)\.(gz|Z)$/)
+ {
+ open(FILE, "zcat $_ |");
+ process_mail_file($dbh, $1);
+ }
+ else
+ {
+ open(FILE, $_);
+ process_mail_file($dbh, $_);
+ }
+ }
+ }
+ $dbh->disconnect if (!$opt_test);
+
+ $ignored = ($mail_no_from_f + $mail_no_subject_f + $mail_no_txt_f +
+ $mail_too_big + $mail_duplicates + $mail_fixed);
+ print "################################ Mail Report #################################\n\n";
+ print "Mails inserted:\t\t\t\t\t$mail_inserted\n";
+ print "--------------- ";
+ print "=" . "=" x length("$mail_inserted") . "=\n\n";
+ if ($ignored)
+ {
+ print "Ignored mails\n";
+ print "-------------\n";
+ if ($mail_no_from_f)
+ {
+ print "Reason: mail without \"From:\" -field:\t\t$mail_no_from_f\n";
+ }
+ else
+ {
+ print "";
+ }
+ if ($mail_no_txt_f)
+ {
+ print "Reason: mail without message:\t\t\t$mail_no_txt_f\n";
+ }
+ else
+ {
+ print "";
+ }
+ if ($mail_no_subject_f)
+ {
+ print "Reason: mail without subject:\t\t\t$mail_no_subject_f\n";
+ }
+ else
+ {
+ print "";
+ }
+ if ($mail_too_big)
+ {
+ print "Reason: mail too big, over $opt_max_mail_size bytes:\t\t";
+ print $mail_too_big;
+ print " (see --max_mail_size=#)\n";
+ }
+ else
+ {
+ print "";
+ }
+ if ($mail_duplicates)
+ {
+ print "Reason: duplicate mail, or in db already:\t$mail_duplicates\n";
+ }
+ else
+ {
+ print "";
+ }
+ if ($mail_fixed)
+ {
+ print "Reason: mail was an unsubscribe - mail:\t\t$mail_fixed\n";
+ }
+ else
+ {
+ print "";
+ }
+ print " ";
+ print "=" . "=" x length("$ignored") . "=\n";
+ print "Total number of ignored mails:\t\t\t$ignored\n\n";
+ }
+ print "Total number of mails:\t\t\t\t";
+ print $mail_inserted + $ignored;
+ print " (OK: ";
+ print sprintf("%.1f", ($mail_inserted + $ignored) ? (($mail_inserted / ($mail_inserted+$ignored)) * 100) : 0.0);
+ print "% Ignored: ";
+ print sprintf("%.1f", ($mail_inserted + $ignored) ? (($ignored / ($mail_inserted + $ignored)) * 100) : 0);
+ print "%)\n";
+ print "################################ End Report ##################################\n";
+ exit(0);
+}
+
+####
+#### table creation
+####
+
+sub create_table
+{
+ my ($dbh)= @_;
+ my ($sth, $query);
+
+ $query= <<EOF;
+CREATE TABLE my_mail
+(
+ mail_id MEDIUMINT UNSIGNED NOT NULL auto_increment,
+ message_id VARCHAR(255),
+ in_reply_to VARCHAR(255),
+ date DATETIME NOT NULL,
+ time_zone VARCHAR(20),
+ mail_from VARCHAR(120) NOT NULL,
+ reply VARCHAR(120),
+ mail_to TEXT,
+ cc TEXT,
+ sbj VARCHAR(200),
+ txt MEDIUMTEXT NOT NULL,
+ file VARCHAR(64) NOT NULL,
+ hash INTEGER NOT NULL,
+ KEY (mail_id),
+ KEY (message_id),
+ KEY (in_reply_to),
+ PRIMARY KEY (mail_from, date, hash))
+ ENGINE=MyISAM COMMENT=''
+EOF
+ $sth = $dbh->prepare($query) or die $DBI::errstr;
+ $sth->execute() or die "Couldn't create table: $DBI::errstr\n";
+}
+
+####
+#### inbox processing. Can be either a real file, or standard input.
+####
+
+sub process_mail_file
+{
+ my ($dbh, $file_name) = @_;
+ my (%values, $type, $check);
+
+ $file_name =~ s/.*[\/]// if ($opt_no_path);
+
+ %values = ();
+ $type = "";
+ $check = 0;
+ while (<FILE>)
+ {
+ chop;
+ chop if (substr($_, -1, 1) eq "\r");
+ if ($type ne "message")
+ {
+ if (/^Reply-To:\s*(.*)/i)
+ {
+ $type = "reply";
+ $values{$type} = $1;
+ }
+ elsif (/^From: (.*)/i)
+ {
+ $type = "from";
+ $values{$type} = $1;
+ }
+ elsif (/^To: (.*)/i)
+ {
+ $type = "to";
+ $values{$type} = $1;
+ }
+ elsif (/^Cc: (.*)/i)
+ {
+ $type = "cc";
+ $values{$type} = $1;
+ }
+ elsif (/^Subject: (.*)/i)
+ {
+ $type = "subject";
+ $values{$type} = $1;
+ }
+ elsif (/^Message-Id:\s*(.*)/i)
+ {
+ $type = "message_id";
+ s/^\s*(<.*>)\s*/$1/;
+ $values{$type} = $1;
+ }
+ elsif (/^In-Reply-To:\s*(.*)/i)
+ {
+ $type = "in_reply_to";
+ s/^\s*(<.*>)\s*/$1/;
+ $values{$type} = $1;
+ }
+ elsif (/^Date: (.*)/i)
+ {
+ date_parser($1, \%values, $file_name);
+ $type = "rubbish";
+ }
+ # Catch those fields that we don't or can't handle (yet)
+ elsif (/^[\w\W-]+:/)
+ {
+ $type = "rubbish";
+ }
+ elsif ($_ eq "")
+ {
+ $type = "message";
+ $values{$type} = "";
+ }
+ else
+ {
+ s/^\s*/ /;
+ if ($type eq 'message_id' || $type eq 'in_reply_to')
+ {
+ s/^\s*(<.*>)\s*/$1/;
+ }
+ $values{$type} .= $_;
+ }
+ }
+ elsif ($check != 0 && $_ ne "") # in case of forwarded messages
+ {
+ $values{$type} .= "\n" . $_;
+ $check--;
+ }
+ elsif (/^From .* \d\d:\d\d:\d\d\s\d\d\d\d/ ||
+ /^From .* \d\d\d\d\s\d\d:\d\d:\d\d/)
+ {
+ $values{'hash'} = checksum("$values{'message'}");
+ update_table($dbh, $file_name, \%values);
+ %values = ();
+ $type = "";
+ $check = 0;
+ }
+ elsif (/-* forwarded message .*-*/i) # in case of forwarded messages
+ {
+ $values{$type} .= "\n" . $_;
+ $check++;
+ $mail_forwarded++;
+ }
+ else
+ {
+ $values{$type} .= "\n" . $_;
+ }
+ }
+ if (defined($values{'message'}))
+ {
+ $values{'hash'} = checksum("$values{'message'}");
+ update_table($dbh, $file_name, \%values);
+ }
+}
+
+####
+#### get date and timezone
+####
+
+sub date_parser
+{
+ my ($date_raw, $values, $file_name, $tmp) = @_;
+
+ # If you ever need to change this test, be especially careful with
+ # the timezone; it may be just a number (-0600), or just a name (EET), or
+ # both (-0600 (EET), or -0600 (EET GMT)), or without parenthesis: GMT.
+ # You probably should use a 'greedy' regexp in the end
+ $date_raw =~ /^\D*(\d{1,2})\s+(\w+)\s+(\d{2,4})\s+(\d+:\d+)(:\d+)?\s*(\S+.*)?/;
+
+ if (!defined($1) || !defined($2) || !defined($3) || !defined($4) ||
+ !defined($months{$2}))
+ {
+ if ($opt_debug || $opt_stop_on_error)
+ {
+ print "FAILED: date_parser: 1: $1 2: $2 3: $3 4: $4 5: $5\n";
+ print "months{2}: $months{$2}\n";
+ print "date_raw: $date_raw\n";
+ print "Inbox filename: $file_name\n";
+ }
+ exit(1) if ($opt_stop_on_error);
+ $values->{'date'} = "";
+ $values->{'time_zone'} = "";
+ return;
+ }
+ $tmp = $3 . "-" . $months{$2} . "-" . "$1 $4";
+ $tmp.= defined($5) ? $5 : ":00";
+ $values->{'date'} = $tmp;
+ print "INSERTING DATE: $tmp\n" if ($opt_debug);
+ $values->{'time_zone'} = $6;
+}
+
+####
+#### Insert to table
+####
+
+sub update_table
+{
+ my($dbh, $file_name, $values) = @_;
+ my($q, $tail, $message);
+
+ if (!defined($values->{'subject'}) || !defined($values->{'to'}))
+ {
+ $mail_no_subject_f++;
+ return; # Ignore these
+ }
+ $message = $values->{'message'};
+ $message =~ s/^\s*//; # removes whitespaces from the beginning
+
+ restart:
+ $message =~ s/[\s\n>]*$//; # removes whitespaces and '>' from the end
+ $values->{'message'} = $message;
+ foreach $tail (@remove_tail)
+ {
+ $message =~ s/$tail//;
+ }
+ if ($message ne $values->{'message'})
+ {
+ $message =~ s/\s*$//; # removes whitespaces from the end
+ $mail_fixed++;
+ goto restart; # Some mails may have duplicated messages
+ }
+
+ $q = "INSERT INTO my_mail (";
+ $q.= "mail_id,";
+ $q.= "message_id,";
+ $q.= "in_reply_to,";
+ $q.= "date,";
+ $q.= "time_zone,";
+ $q.= "mail_from,";
+ $q.= "reply,";
+ $q.= "mail_to,";
+ $q.= "cc,";
+ $q.= "sbj,";
+ $q.= "txt,";
+ $q.= "file,";
+ $q.= "hash";
+ $q.= ") VALUES (";
+ $q.= "NULL,";
+ $q.= (defined($values->{'message_id'}) ?
+ $dbh->quote($values->{'message_id'}) : "NULL");
+ $q.= ",";
+ $q.= (defined($values->{'in_reply_to'}) ?
+ $dbh->quote($values->{'in_reply_to'}) : "NULL");
+ $q.= ",";
+ $q.= "'" . $values->{'date'} . "',";
+ $q.= (defined($values->{'time_zone'}) ?
+ $dbh->quote($values->{'time_zone'}) : "NULL");
+ $q.= ",";
+ $q.= defined($values->{'from'}) ? $dbh->quote($values->{'from'}) : "NULL";
+ $q.= ",";
+ $q.= defined($values->{'reply'}) ? $dbh->quote($values->{'reply'}) : "NULL";
+ $q.= ",";
+ $q.= defined($values->{'to'}) ? $dbh->quote($values->{'to'}) : "NULL";
+ $q.= ",";
+ $q.= defined($values->{'cc'}) ? $dbh->quote($values->{'cc'}) : "NULL";
+ $q.= ",";
+ $q.= $dbh->quote($values->{'subject'});
+ $q.= ",";
+ $q.= $dbh->quote($message);
+ $q.= ",";
+ $q.= $dbh->quote($file_name);
+ $q.= ",";
+ $q.= "'" . $values->{'hash'} . "'";
+ $q.= ")";
+
+ # Don't insert mails bigger than $opt_max_mail_size
+ if (length($message) > $opt_max_mail_size)
+ {
+ $mail_too_big++;
+ }
+ # Don't insert mails without 'From' field
+ elsif (!defined($values->{'from'}) || $values->{'from'} eq "")
+ {
+ $mail_no_from_f++;
+ }
+ elsif ($opt_test)
+ {
+ print "$q\n";
+ $mail_inserted++;
+ }
+ # Don't insert mails without the 'message'
+ elsif ($message eq "")
+ {
+ $mail_no_txt_f++;
+ }
+ elsif ($dbh->do($q))
+ {
+ $mail_inserted++;
+ }
+ # This should never happen. This means that the above q failed,
+ # but it wasn't because of a duplicate mail entry
+ elsif (!($DBI::errstr =~ /Duplicate entry /))
+ {
+ die "FATAL: Got error :$DBI::errstr\nAttempted query was: $q\n";
+ }
+ else
+ {
+ $mail_duplicates++;
+ print "Duplicate mail: query: $q\n" if ($opt_debug);
+ }
+ $q = "";
+}
+
+####
+#### In case you have two identical messages we wanted to identify them
+#### and remove additionals; We do this by calculating a hash number of the
+#### message and ignoring messages with the same from, date and hash.
+#### This function calculates a simple 32 bit hash value for the message.
+####
+
+sub checksum
+{
+ my ($txt)= @_;
+ my ($crc, $i, $count);
+ $count = length($txt);
+ for ($crc = $i = 0; $i < $count ; $i++)
+ {
+ $crc = (($crc << 1) + (ord (substr ($txt, $i, 1)))) +
+ (($crc & (1 << 30)) ? 1 : 0);
+ $crc &= ((1 << 31) -1);
+ }
+ return $crc;
+}
+
+####
+#### my_which is used, because we can't assume that every system has the
+#### which -command. my_which can take only one argument at a time.
+#### Return values: requested system command with the first found path,
+#### or undefined, if not found.
+####
+
+sub my_which
+{
+ my ($command) = @_;
+ my (@paths, $path);
+
+ return $command if (-f $command && -x $command);
+ @paths = split(':', $ENV{'PATH'});
+ foreach $path (@paths)
+ {
+ $path = "." if ($path eq "");
+ $path .= "/$command";
+ return $path if (-f $path && -x $path);
+ }
+ return undef();
+}
+
+####
+#### usage and version
+####
+
+sub usage
+{
+ my ($VER)= @_;
+
+ if ($opt_version)
+ {
+ print "$progname version $VER\n";
+ }
+ else
+ {
+ print <<EOF;
+$progname version $VER
+
+Description: Insert mails from inbox file(s) into a table. This program
+can read group [mail_to_db] from the my.cnf file. You may want to have db
+and table set there at least.
+
+Usage: $progname [options] file1 [file2 file3 ...]
+or: $progname [options] --create [file1 file2...]
+or: cat inbox | $progname [options] --stdin
+
+The last example can be used to read mails from standard input and can
+useful when inserting mails to database via a program 'on-the-fly'.
+The filename will be 'READ-FROM-STDIN' in this case.
+
+Options:
+--help Show this help and exit.
+--version Show the version number and exit.
+--debug Print some extra information during the run.
+--host=... Hostname to be used.
+--port=# TCP/IP port to be used with connection.
+--socket=... MySQL UNIX socket to be used with connection.
+--db=... Database to be used.
+--user=... Username for connecting.
+--password=... Password for the user.
+--stdin Read mails from stdin.
+--max_mail_size=# Maximum size of a mail in bytes.
+ Beware of the downside letting this variable be too big;
+ you may easily end up inserting a lot of attached
+ binary files (like MS Word documents etc), which take
+ space, make the database slower and are not really
+ searchable anyway. (Default $opt_max_mail_size)
+--create Create the mails table. This can be done with the first run.
+--test Dry run. Print the queries and the result as it would be.
+--no_path When inserting the file name, leave out any paths of
+ the name.
+--stop_on_error Stop the run, if an unexpected, but not fatal error occurs
+ during the run. Without this option some fields may get
+ unwanted values. --debug will also report about these.
+EOF
+ }
+ exit(0);
+}