Adding upstream version 3.8.1.upstream/3.8.1 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 21:30:40 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 21:30:40 +0000
commit: 133a45c109da5310add55824db21af5239951f93 (patch)
tree: ba6ac4c0a950a0dda56451944315d66409923918 /contrib/snowball/libstemmer
parent: Initial commit. (diff)
download: rspamd-upstream.tar.xz
rspamd-upstream.zip
4 files changed, 470 insertions, 0 deletions
diff --git a/contrib/snowball/libstemmer/libstemmer_c.in b/contrib/snowball/libstemmer/libstemmer_c.in
new file mode 100644
index 0000000..2aa918d
--- /dev/null
+++ b/contrib/snowball/libstemmer/libstemmer_c.in
@@ -0,0 +1,96 @@
+
+#include <stdlib.h>
+#include <string.h>
+#include "../include/libstemmer.h"
+#include "../runtime/api.h"
+#include "@MODULES_H@"
+
+struct sb_stemmer {   /* one stemmer instance: a module's three hooks plus its environment */
+    struct SN_env * (*create)(void);   /* returns the environment stored in env; NULL is treated as failure */
+    void (*close)(struct SN_env *);    /* called with env by sb_stemmer_delete(), which then clears it */
+    int (*stem)(struct SN_env *);      /* runs on env; a negative result makes sb_stemmer_stem() return NULL */
+
+    struct SN_env * env;               /* environment obtained from create() in sb_stemmer_new() */
+};
+
+extern const char **
+sb_stemmer_list(void)
+{
+    return algorithm_names;   /* not defined in this file; presumably the 0-terminated table mkmodules.pl emits into @MODULES_H@ */
+}
+
+static stemmer_encoding_t
+sb_getenc(const char * charenc)
+{
+    const struct stemmer_encoding * entry = encodings;
+    if (charenc == NULL) return ENC_UTF_8;   /* no name given: default to UTF-8 */
+    while (entry->name != NULL) {            /* the table ends at the entry whose name is 0 */
+        if (strcmp(entry->name, charenc) == 0) return entry->enc;
+        entry++;
+    }
+    return ENC_UNKNOWN;                      /* name not present in the table */
+}
+
+extern struct sb_stemmer *
+sb_stemmer_new(const char * algorithm, const char * charenc)
+{
+    stemmer_encoding_t enc;
+    const struct stemmer_modules * module;
+    struct sb_stemmer * stemmer;
+
+    /* A NULL algorithm would reach strcmp() below, which is undefined. */
+    if (algorithm == NULL) return NULL;
+    /* A NULL charenc selects UTF-8; an unrecognised name is an error. */
+    enc = sb_getenc(charenc);
+    if (enc == ENC_UNKNOWN) return NULL;
+
+    /* Find the table entry that matches both the name and the encoding. */
+    for (module = modules; module->name != 0; module++) {
+        if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
+    }
+    if (module->name == NULL) return NULL;   /* reached the terminating entry */
+
+    stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
+    if (stemmer == NULL) return NULL;
+    stemmer->create = module->create;
+    stemmer->close = module->close;
+    stemmer->stem = module->stem;
+    stemmer->env = stemmer->create();
+    if (stemmer->env == NULL)
+    {
+        sb_stemmer_delete(stemmer);   /* runs the close hook on the NULL env, then frees */
+        return NULL;
+    }
+    return stemmer;
+}
+
+void
+sb_stemmer_delete(struct sb_stemmer * stemmer)
+{
+    if (stemmer != NULL) {   /* deleting a NULL stemmer is a no-op */
+        /* Hand the environment back through the close hook when one is set. */
+        if (stemmer->close != NULL) stemmer->close(stemmer->env);
+        stemmer->close = NULL;
+        free(stemmer);
+    }
+}
+
+const sb_symbol *
+sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size)
+{
+    struct SN_env * const z = stemmer->env;
+
+    /* Load the word; on failure leave a zero length behind and give up. */
+    if (SN_set_current(z, size, (const symbol *)(word)) != 0) {
+        z->l = 0;
+        return NULL;
+    }
+    if (stemmer->stem(z) < 0) return NULL;   /* negative result: stemming failed */
+    z->p[z->l] = 0;                          /* zero-terminate the result in place */
+    return (const sb_symbol *)(z->p);
+}
+
+int
+sb_stemmer_length(struct sb_stemmer * stemmer)
+{
+    return stemmer->env->l;   /* the same length sb_stemmer_stem() uses to place the terminating 0; stemmer must not be NULL */
+}
diff --git a/contrib/snowball/libstemmer/mkmodules.pl b/contrib/snowball/libstemmer/mkmodules.pl
new file mode 100755
index 0000000..dd66787
--- /dev/null
+++ b/contrib/snowball/libstemmer/mkmodules.pl
@@ -0,0 +1,267 @@
+#!/usr/bin/env perl
+use strict;
+use 5.006;
+use warnings;
+
+my $progname = $0;
+
+# Four arguments are mandatory; a fifth (an encoding filter) is optional.
+unless (@ARGV == 4 || @ARGV == 5) {
+  print "Usage: $progname <outfile> <C source directory> <modules description file> <source list file> [<enc>]\n";
+  exit 1;
+}
+
+# Take the positional arguments in order; this leaves only the optional one.
+my ($outname, $c_src_dir, $descfile, $srclistfile) = splice(@ARGV, 0, 4);
+
+# Encoding filter (undef when absent) and the "_<enc>" suffix derived from it.
+my $enc_only = shift(@ARGV);
+my $extn = defined $enc_only ? '_'.$enc_only : '';
+
+# Alias name => algorithm name.
+my %aliases = ();
+# Algorithm names seen in the description file (used as a set).
+my %algorithms = ();
+# Algorithm name => hash reference keyed by that algorithm's encodings.
+my %algorithm_encs = ();
+# Every encoding recorded for any algorithm (used as a set).
+my %encs = ();
+
+sub addalgenc($$) {
+  # Record that algorithm $alg is available in encoding $enc.
+  #
+  # When the script was given an encoding filter ($enc_only), any other
+  # encoding is ignored and nothing is recorded.
+  my ($alg, $enc) = @_;
+
+  if (defined $enc_only) {
+      # Compare in normalised form: lower case, underscores removed.
+      (my $norm_enc = lc $enc) =~ s/_//g;
+      return if $norm_enc ne $enc_only;
+  }
+
+  # Add the encoding to this algorithm's set, creating the set on first use.
+  unless (defined $algorithm_encs{$alg}) {
+      $algorithm_encs{$alg} = {};
+  }
+  $algorithm_encs{$alg}{$enc} = 1;
+
+  # Also remember the encoding globally.
+  $encs{$enc} = 1;
+}
+
+sub readinput()
+{
+    # Stop with a diagnostic if the description file cannot be read, instead
+    # of silently producing empty module tables.
+    open(DESCFILE, '<', $descfile) or die "Can't open input file `$descfile': $!\n";
+    while (my $line = <DESCFILE>)
+    {
+        next if $line =~ m/^\s*#/;   # comment line
+        next if $line =~ m/^\s*$/;   # blank line
+        # Columns: algorithm, comma separated encodings, comma separated
+        # aliases.  split ' ' also tolerates leading whitespace.
+        my ($alg,$encstr,$aliases) = split(' ', $line);
+        $algorithms{$alg} = 1;
+        foreach my $alias (split(/,/, $aliases)) {
+            foreach my $enc (split(/,/, $encstr)) {
+                $aliases{$alias} = $alg;
+                addalgenc($alg, $enc);
+            }
+        }
+    }
+    close DESCFILE;
+}
+
+sub printoutput()   # writes the generated C header (module tables) to $outname
+{
+    open (OUT, '>', $outname) or die "Can't open output file `$outname': $!\n";
+    # Three-argument open: $outname is always a plain path, never a mode.
+    print OUT <<EOS;
+/* $outname: List of stemming modules.
+ *
+ * This file is generated by mkmodules.pl from a list of module names.
+ * Do not edit manually.
+ *
+EOS
+    # Finish the header comment with a wrapped, comma separated name list.
+    my $line = " * Modules included by this file are: ";
+    print OUT $line;
+    my $linelen = length($line);
+
+    my $need_sep = 0;
+    my $lang;
+    my $enc;
+    my @algorithms = sort keys(%algorithms);
+    foreach $lang (@algorithms) {
+        if ($need_sep) {
+            if (($linelen + 2 + length($lang)) > 77) {   # wrap before exceeding 77 columns
+                print OUT ",\n * ";
+                $linelen = 3;
+            } else {
+                print OUT ', ';
+                $linelen += 2;
+            }
+        }
+        print OUT $lang;
+        $linelen += length($lang);
+        $need_sep = 1;
+    }
+    print OUT "\n */\n\n";
+    # One #include line per (algorithm, encoding) pair.
+    foreach $lang (@algorithms) {
+        my $hashref = $algorithm_encs{$lang};
+        foreach $enc (sort keys (%$hashref)) {
+            print OUT "#include \"../$c_src_dir/stem_${enc}_$lang.h\"\n";
+        }
+    }
+    # Encoding enum and name table: one entry per encoding in use.
+    print OUT <<EOS;
+
+typedef enum {
+  ENC_UNKNOWN=0,
+EOS
+    my $neednl = 0;
+    for $enc (sort keys %encs) {
+        print OUT ",\n" if $neednl;
+        print OUT "  ENC_${enc}";
+        $neednl = 1;
+    }
+    print OUT <<EOS;
+
+} stemmer_encoding_t;
+
+struct stemmer_encoding {
+  const char * name;
+  stemmer_encoding_t enc;
+};
+static const struct stemmer_encoding encodings[] = {
+EOS
+    for $enc (sort keys %encs) {
+        print OUT "  {\"${enc}\", ENC_${enc}},\n";
+    }
+    print OUT <<EOS;
+  {0,ENC_UNKNOWN}
+};
+
+struct stemmer_modules {
+  const char * name;
+  stemmer_encoding_t enc; 
+  struct SN_env * (*create)(void);
+  void (*close)(struct SN_env *);
+  int (*stem)(struct SN_env *);
+};
+static const struct stemmer_modules modules[] = {
+EOS
+    # One row per (alias, encoding); the C symbols use the algorithm name.
+    for $lang (sort keys %aliases) {
+        my $l = $aliases{$lang};
+        my $hashref = $algorithm_encs{$l};
+        my $enc;
+        foreach $enc (sort keys (%$hashref)) {
+            my $p = "${l}_${enc}";
+            print OUT "  {\"$lang\", ENC_$enc, ${p}_create_env, ${p}_close_env, ${p}_stem},\n";
+        }
+    }
+
+    print OUT <<EOS;
+  {0,ENC_UNKNOWN,0,0,0}
+};
+EOS
+    # 0-terminated list of algorithm names (aliases are not listed).
+    print OUT <<EOS;
+static const char * algorithm_names[] = {
+EOS
+
+    for $lang (@algorithms) {
+        print OUT "  \"$lang\", \n";
+    }
+
+    print OUT <<EOS;
+  0
+};
+EOS
+    close OUT or die "Can't close ${outname}: $!\n";
+}
+
+sub printsrclist()
+{
+    # Writes the make-style snowball_sources / snowball_headers lists.
+    open (OUT, '>', $srclistfile) or die "Can't open output file `$srclistfile': $!\n";
+    print OUT <<EOS;
+# $srclistfile: List of stemming module source files
+#
+# This file is generated by mkmodules.pl from a list of module names.
+# Do not edit manually.
+#
+EOS
+
+    my $line = "# Modules included by this file are: ";
+    print OUT $line;
+    my $linelen = length($line);
+
+    my $need_sep = 0;
+    my $lang;
+    my $srcfile;
+    # Sorted algorithm names: used for the header comment and both lists.
+    my @algorithms = sort keys(%algorithms);
+    foreach $lang (@algorithms) {
+        if ($need_sep) {
+            if (($linelen + 2 + length($lang)) > 77) {
+                print OUT ",\n# ";
+                $linelen = 3;
+            } else {
+                print OUT ', ';
+                $linelen += 2;
+            }
+        }
+        print OUT $lang;
+        $linelen += length($lang);
+        $need_sep = 1;
+    }
+    # Iterate over algorithm names, not aliases: %algorithm_encs is keyed by algorithm.
+    print OUT "\n\nsnowball_sources= \\\n";
+    for $lang (@algorithms) {
+        my $hashref = $algorithm_encs{$lang};
+        my $enc;
+        foreach $enc (sort keys (%$hashref)) {
+            print OUT "  src_c/stem_${enc}_${lang}.c \\\n";
+        }
+    }
+
+    $need_sep = 0;
+    for $srcfile ('runtime/api.c',
+                  'runtime/utilities.c',
+                  "libstemmer/libstemmer${extn}.c") {
+        print OUT " \\\n" if $need_sep;
+        print OUT "  $srcfile";
+        $need_sep = 1;
+    }
+
+    print OUT "\n\nsnowball_headers= \\\n";
+    for $lang (@algorithms) {
+        my $hashref = $algorithm_encs{$lang};
+        my $enc;
+        foreach $enc (sort keys (%$hashref)) {
+            # Header path mirrors the .c path written to snowball_sources.
+            print OUT "  src_c/stem_${enc}_${lang}.h \\\n";
+        }
+    }
+
+    $need_sep = 0;
+    for $srcfile ('include/libstemmer.h',
+                  "libstemmer/modules${extn}.h",
+                  'runtime/api.h',
+                  'runtime/header.h') {
+        print OUT " \\\n" if $need_sep;
+        print OUT "  $srcfile";
+        $need_sep = 1;
+    }
+
+    print OUT "\n\n";
+    close OUT or die "Can't close ${srclistfile}: $!\n";
+}
+
+readinput();      # parse $descfile into %aliases, %algorithms, %algorithm_encs and %encs
+printoutput();    # write the generated C module tables to $outname
+printsrclist();   # write the source/header file lists to $srclistfile
diff --git a/contrib/snowball/libstemmer/modules.txt b/contrib/snowball/libstemmer/modules.txt
new file mode 100644
index 0000000..f6dcc7e
--- /dev/null
+++ b/contrib/snowball/libstemmer/modules.txt
@@ -0,0 +1,58 @@
+# This file contains a list of stemmers to include in the distribution.
+# The format is a set of space separated lines - on each line:
+#  First item is name of stemmer.
+#  Second item is comma separated list of character sets.
+#  Third item is comma separated list of names to refer to the stemmer by.
+#
+# Lines starting with a #, or blank lines, are ignored.
+
+# List all the main algorithms for each language, in UTF-8, and also with
+# the most commonly used encoding.
+
+arabic          UTF_8                   arabic,ar,ara
+danish          UTF_8        danish,da,dan
+dutch           UTF_8        dutch,nl,dut,nld
+english         UTF_8        english,en,eng
+finnish         UTF_8        finnish,fi,fin
+french          UTF_8        french,fr,fre,fra
+german          UTF_8        german,de,ger,deu
+greek           UTF_8                   greek,el,gre,ell
+hindi           UTF_8                   hindi,hi,hin
+hungarian       UTF_8        hungarian,hu,hun
+indonesian      UTF_8        indonesian,id,ind
+italian         UTF_8        italian,it,ita
+lithuanian      UTF_8                   lithuanian,lt,lit
+nepali          UTF_8                   nepali,ne,nep
+norwegian       UTF_8        norwegian,no,nor
+portuguese      UTF_8        portuguese,pt,por
+romanian        UTF_8        romanian,ro,rum,ron
+russian         UTF_8            russian,ru,rus
+serbian         UTF_8                   serbian,sr,srp
+spanish         UTF_8        spanish,es,esl,spa
+swedish         UTF_8        swedish,sv,swe
+tamil           UTF_8                   tamil,ta,tam
+turkish         UTF_8                   turkish,tr,tur
+
+# Also include the traditional porter algorithm for english.
+# The porter algorithm is included in the libstemmer distribution to assist
+# with backwards compatibility, but for new systems the english algorithm
+# should be used in preference.
+porter          UTF_8        porter			english
+
+# Some other stemmers in the snowball project are not included in the standard
+# distribution. To compile a libstemmer with them in, add them to this list,
+# and regenerate the distribution. (You will need a full source checkout for
+# this.) They are included in the snowball website as curiosities, but are not
+# intended for general use, and use of them is not fully supported.  These
+# algorithms are:
+#
+# german2          - This is a slight modification of the german stemmer.
+#german2          UTF_8,ISO_8859_1        german2		german
+#
+# kraaij_pohlmann  - This is a different dutch stemmer.
+#kraaij_pohlmann  UTF_8,ISO_8859_1        kraaij_pohlmann	dutch
+#
+# lovins           - This is an english stemmer, but fairly outdated, and
+#                    only really applicable to a restricted type of input text
+#                    (keywords in academic publications).
+#lovins           UTF_8,ISO_8859_1        lovins		english
diff --git a/contrib/snowball/libstemmer/modules_utf8.txt b/contrib/snowball/libstemmer/modules_utf8.txt
new file mode 100644
index 0000000..60a0e1d
--- /dev/null
+++ b/contrib/snowball/libstemmer/modules_utf8.txt
@@ -0,0 +1,49 @@
+# This file contains a list of stemmers to include in the distribution.
+# The format is a set of space separated lines - on each line:
+#  First item is name of stemmer.
+#  Second item is comma separated list of character sets.
+#  Third item is comma separated list of names to refer to the stemmer by.
+#
+# Lines starting with a #, or blank lines, are ignored.
+
+# List all the main algorithms for each language, in UTF-8.
+
+danish          UTF_8                   danish,da,dan
+dutch           UTF_8                   dutch,nl,dut,nld
+english         UTF_8                   english,en,eng
+finnish         UTF_8                   finnish,fi,fin
+french          UTF_8                   french,fr,fre,fra
+german          UTF_8                   german,de,ger,deu
+hungarian       UTF_8                   hungarian,hu,hun
+italian         UTF_8                   italian,it,ita
+norwegian       UTF_8                   norwegian,no,nor
+portuguese      UTF_8                   portuguese,pt,por
+romanian        UTF_8                   romanian,ro,rum,ron
+russian         UTF_8                   russian,ru,rus
+spanish         UTF_8                   spanish,es,esl,spa
+swedish         UTF_8                   swedish,sv,swe
+turkish         UTF_8                   turkish,tr,tur
+
+# Also include the traditional porter algorithm for english.
+# The porter algorithm is included in the libstemmer distribution to assist
+# with backwards compatibility, but for new systems the english algorithm
+# should be used in preference.
+porter          UTF_8                   porter
+
+# Some other stemmers in the snowball project are not included in the standard
+# distribution. To compile a libstemmer with them in, add them to this list,
+# and regenerate the distribution. (You will need a full source checkout for
+# this.) They are included in the snowball website as curiosities, but are not
+# intended for general use, and use of them is not fully supported.  These
+# algorithms are:
+#
+# german2          - This is a slight modification of the german stemmer.
+#german2          UTF_8                   german2
+#
+# kraaij_pohlmann  - This is a different dutch stemmer.
+#kraaij_pohlmann  UTF_8                   kraaij_pohlmann
+#
+# lovins           - This is an english stemmer, but fairly outdated, and
+#                    only really applicable to a restricted type of input text
+#                    (keywords in academic publications).
+#lovins           UTF_8                   lovins
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-10 21:30:40 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-10 21:30:40 +0000
commit	133a45c109da5310add55824db21af5239951f93 (patch)
tree	ba6ac4c0a950a0dda56451944315d66409923918 /contrib/snowball/libstemmer
parent	Initial commit. (diff)
download	rspamd-upstream.tar.xz rspamd-upstream.zip