diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 21:30:40 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 21:30:40 +0000 |
commit | 133a45c109da5310add55824db21af5239951f93 (patch) | |
tree | ba6ac4c0a950a0dda56451944315d66409923918 /contrib/snowball/libstemmer | |
parent | Initial commit. (diff) | |
download | rspamd-upstream.tar.xz rspamd-upstream.zip |
Adding upstream version 3.8.1. (refs: upstream/3.8.1, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'contrib/snowball/libstemmer')
-rw-r--r-- | contrib/snowball/libstemmer/libstemmer_c.in | 96 | ||||
-rwxr-xr-x | contrib/snowball/libstemmer/mkmodules.pl | 267 | ||||
-rw-r--r-- | contrib/snowball/libstemmer/modules.txt | 58 | ||||
-rw-r--r-- | contrib/snowball/libstemmer/modules_utf8.txt | 49 |
4 files changed, 470 insertions, 0 deletions
diff --git a/contrib/snowball/libstemmer/libstemmer_c.in b/contrib/snowball/libstemmer/libstemmer_c.in new file mode 100644 index 0000000..2aa918d --- /dev/null +++ b/contrib/snowball/libstemmer/libstemmer_c.in @@ -0,0 +1,96 @@ + +#include <stdlib.h> +#include <string.h> +#include "../include/libstemmer.h" +#include "../runtime/api.h" +#include "@MODULES_H@" + /* Stemmer handle: the create, close and stem callbacks copied from the selected modules[] entry, plus the SN_env obtained from create(). */ +struct sb_stemmer { + struct SN_env * (*create)(void); + void (*close)(struct SN_env *); + int (*stem)(struct SN_env *); + + struct SN_env * env; +}; + /* Returns the algorithm_names array. NOTE(review): presumably supplied by the generated header pulled in through the @MODULES_H@ include - confirm. */ +extern const char ** +sb_stemmer_list(void) +{ + return algorithm_names; +} + /* Maps an encoding name to a stemmer_encoding_t. A NULL name selects ENC_UTF_8; otherwise encodings[] is scanned with strcmp up to its null-name sentinel, and ENC_UNKNOWN is returned when no entry matches. */ +static stemmer_encoding_t +sb_getenc(const char * charenc) +{ + const struct stemmer_encoding * encoding; + if (charenc == NULL) return ENC_UTF_8; + for (encoding = encodings; encoding->name != 0; encoding++) { + if (strcmp(encoding->name, charenc) == 0) break; + } + if (encoding->name == NULL) return ENC_UNKNOWN; + return encoding->enc; +} + /* Builds a stemmer for the given algorithm name and encoding. Returns NULL if the encoding is unknown, if no modules[] entry matches both name and encoding, if malloc fails, or if create() yields no environment (the handle is then released through sb_stemmer_delete). NOTE(review): algorithm is handed to strcmp without a NULL check. */ +extern struct sb_stemmer * +sb_stemmer_new(const char * algorithm, const char * charenc) +{ + stemmer_encoding_t enc; + const struct stemmer_modules * module; + struct sb_stemmer * stemmer; + + enc = sb_getenc(charenc); + if (enc == ENC_UNKNOWN) return NULL; + + for (module = modules; module->name != 0; module++) { + if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break; + } + if (module->name == NULL) return NULL; + + stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer)); + if (stemmer == NULL) return NULL; + + stemmer->create = module->create; + stemmer->close = module->close; + stemmer->stem = module->stem; + + stemmer->env = stemmer->create(); + if (stemmer->env == NULL) + { + sb_stemmer_delete(stemmer); + return NULL; + } + + return stemmer; +} + /* Destroys a stemmer. A null handle is ignored; otherwise close(env) is called when the close callback is set, and the handle is freed. */ +void +sb_stemmer_delete(struct sb_stemmer * stemmer) +{ + if (stemmer == 0) return; + if (stemmer->close) { + stemmer->close(stemmer->env); + stemmer->close = 0; + } + free(stemmer); +} + /* Hands word and size to SN_set_current, then runs the stem callback. Returns NULL when SN_set_current reports failure (env->l is reset to 0 first) or when the stem callback returns a negative value; otherwise writes a 0 terminator at env->p[env->l] and returns env->p. */ +const sb_symbol * +sb_stemmer_stem(struct sb_stemmer * stemmer, const 
sb_symbol * word, int size) +{ + int ret; + if (SN_set_current(stemmer->env, size, (const symbol *)(word))) + { + stemmer->env->l = 0; + return NULL; + } + ret = stemmer->stem(stemmer->env); + if (ret < 0) return NULL; + stemmer->env->p[stemmer->env->l] = 0; + return (const sb_symbol *)(stemmer->env->p); +} + /* Returns env->l, the index at which sb_stemmer_stem writes its 0 terminator. */ +int +sb_stemmer_length(struct sb_stemmer * stemmer) +{ + return stemmer->env->l; +} diff --git a/contrib/snowball/libstemmer/mkmodules.pl b/contrib/snowball/libstemmer/mkmodules.pl new file mode 100755 index 0000000..dd66787 --- /dev/null +++ b/contrib/snowball/libstemmer/mkmodules.pl @@ -0,0 +1,267 @@ +#!/usr/bin/env perl +use strict; +use 5.006; +use warnings; + +my $progname = $0; + +if (scalar @ARGV < 4 || scalar @ARGV > 5) { + print "Usage: $progname <outfile> <C source directory> <modules description file> <source list file> [<enc>]\n"; + exit 1; +} + +my $outname = shift(@ARGV); +my $c_src_dir = shift(@ARGV); +my $descfile = shift(@ARGV); +my $srclistfile = shift(@ARGV); +my $enc_only; +my $extn = ''; +if (@ARGV) { + $enc_only = shift(@ARGV); + $extn = '_'.$enc_only; +} + +my %aliases = (); +my %algorithms = (); +my %algorithm_encs = (); + +my %encs = (); + +sub addalgenc($$) { + my $alg = shift(); + my $enc = shift(); + + if (defined $enc_only) { + my $norm_enc = lc $enc; + $norm_enc =~ s/_//g; + if ($norm_enc ne $enc_only) { + return; + } + } + + if (defined $algorithm_encs{$alg}) { + my $hashref = $algorithm_encs{$alg}; + $$hashref{$enc}=1; + } else { + my %newhash = ($enc => 1); + $algorithm_encs{$alg}=\%newhash; + } + + $encs{$enc} = 1; +} + +# readinput: parses $descfile. Comment and blank lines are skipped; every other line is split on whitespace into algorithm, encoding list and alias list (fields beyond the third are discarded). The algorithm is recorded, each alias is mapped to it, and each encoding is registered through addalgenc. NOTE(review): the open call is not checked, so a missing description file is not reported here. +sub readinput() +{ + open DESCFILE, $descfile; + my $line; + while ($line = <DESCFILE>) + { + next if $line =~ m/^\s*#/; + next if $line =~ m/^\s*$/; + my ($alg,$encstr,$aliases) = split(/\s+/, $line); + my $enc; + my $alias; + + $algorithms{$alg} = 1; + foreach $alias (split(/,/, $aliases)) { + foreach $enc (split(/,/, $encstr)) { + # print "$alias, $enc\n"; + $aliases{$alias} = $alg; + addalgenc($alg, 
$enc); + } + } + } +} + +# printoutput: writes the generated C header $outname. It emits a banner comment listing the algorithms (wrapped once a line would pass 77 columns), one include line per algorithm and encoding, the stemmer_encoding_t enum, the encodings[] table, the modules[] table (one row per alias and encoding, resolved through %aliases) and the algorithm_names[] list; each table ends in a zero sentinel. Dies if the file cannot be opened or closed. +sub printoutput() +{ + open (OUT, ">$outname") or die "Can't open output file `$outname': $!\n"; + + print OUT <<EOS; +/* $outname: List of stemming modules. + * + * This file is generated by mkmodules.pl from a list of module names. + * Do not edit manually. + * +EOS + + my $line = " * Modules included by this file are: "; + print OUT $line; + my $linelen = length($line); + + my $need_sep = 0; + my $lang; + my $enc; + my @algorithms = sort keys(%algorithms); + foreach $lang (@algorithms) { + if ($need_sep) { + if (($linelen + 2 + length($lang)) > 77) { + print OUT ",\n * "; + $linelen = 3; + } else { + print OUT ', '; + $linelen += 2; + } + } + print OUT $lang; + $linelen += length($lang); + $need_sep = 1; + } + print OUT "\n */\n\n"; + + foreach $lang (@algorithms) { + my $hashref = $algorithm_encs{$lang}; + foreach $enc (sort keys (%$hashref)) { + print OUT "#include \"../$c_src_dir/stem_${enc}_$lang.h\"\n"; + } + } + + print OUT <<EOS; + +typedef enum { + ENC_UNKNOWN=0, +EOS + my $neednl = 0; + for $enc (sort keys %encs) { + print OUT ",\n" if $neednl; + print OUT " ENC_${enc}"; + $neednl = 1; + } + print OUT <<EOS; + +} stemmer_encoding_t; + +struct stemmer_encoding { + const char * name; + stemmer_encoding_t enc; +}; +static const struct stemmer_encoding encodings[] = { +EOS + for $enc (sort keys %encs) { + print OUT " {\"${enc}\", ENC_${enc}},\n"; + } + print OUT <<EOS; + {0,ENC_UNKNOWN} +}; + +struct stemmer_modules { + const char * name; + stemmer_encoding_t enc; + struct SN_env * (*create)(void); + void (*close)(struct SN_env *); + int (*stem)(struct SN_env *); +}; +static const struct stemmer_modules modules[] = { +EOS + + for $lang (sort keys %aliases) { + my $l = $aliases{$lang}; + my $hashref = $algorithm_encs{$l}; + my $enc; + foreach $enc (sort keys (%$hashref)) { + my $p = "${l}_${enc}"; + print OUT " {\"$lang\", ENC_$enc, ${p}_create_env, ${p}_close_env, ${p}_stem},\n"; + } + } + + print OUT <<EOS; + {0,ENC_UNKNOWN,0,0,0} 
+}; +EOS + + print OUT <<EOS; +static const char * algorithm_names[] = { +EOS + + for $lang (@algorithms) { + print OUT " \"$lang\", \n"; + } + + print OUT <<EOS; + 0 +}; +EOS + close OUT or die "Can't close ${outname}: $!\n"; +} + +# printsrclist: writes $srclistfile as backslash-continued variable assignments (presumably for inclusion by a makefile - confirm). It emits a banner naming the algorithms, then snowball_sources (generated stem_*.c files followed by runtime/api.c, runtime/utilities.c and libstemmer/libstemmer${extn}.c) and snowball_headers (generated stem_*.h files followed by the fixed headers). Dies if the file cannot be opened or closed. NOTE(review): both generated-file loops walk the keys of %aliases but look the name up directly in %algorithm_encs, which addalgenc keys by algorithm name, whereas printoutput first maps the alias through %aliases; aliases that are not also algorithm names appear to contribute nothing - confirm this is intended. +sub printsrclist() +{ + open (OUT, ">$srclistfile") or die "Can't open output file `$srclistfile': $!\n"; + + print OUT <<EOS; +# $srclistfile: List of stemming module source files +# +# This file is generated by mkmodules.pl from a list of module names. +# Do not edit manually. +# +EOS + + my $line = "# Modules included by this file are: "; + print OUT $line; + my $linelen = length($line); + + my $need_sep = 0; + my $lang; + my $srcfile; + my $enc; + my @algorithms = sort keys(%algorithms); + foreach $lang (@algorithms) { + if ($need_sep) { + if (($linelen + 2 + length($lang)) > 77) { + print OUT ",\n# "; + $linelen = 3; + } else { + print OUT ', '; + $linelen += 2; + } + } + print OUT $lang; + $linelen += length($lang); + $need_sep = 1; + } + + print OUT "\n\nsnowball_sources= \\\n"; + for $lang (sort keys %aliases) { + my $hashref = $algorithm_encs{$lang}; + my $enc; + foreach $enc (sort keys (%$hashref)) { + print OUT " src_c/stem_${enc}_${lang}.c \\\n"; + } + } + + $need_sep = 0; + for $srcfile ('runtime/api.c', + 'runtime/utilities.c', + "libstemmer/libstemmer${extn}.c") { + print OUT " \\\n" if $need_sep; + print OUT " $srcfile"; + $need_sep = 1; + } + + print OUT "\n\nsnowball_headers= \\\n"; + for $lang (sort keys %aliases) { + my $hashref = $algorithm_encs{$lang}; + my $enc; + foreach $enc (sort keys (%$hashref)) { + my $p = "${lang}_${enc}"; + print OUT " src_c/stem_${enc}_${lang}.h \\\n"; + } + } + + $need_sep = 0; + for $srcfile ('include/libstemmer.h', + "libstemmer/modules${extn}.h", + 'runtime/api.h', + 'runtime/header.h') { + print OUT " \\\n" if $need_sep; + print OUT " $srcfile"; + $need_sep = 1; + } + + print OUT "\n\n"; + close OUT or die "Can't close ${srclistfile}: $!\n"; +} + +readinput(); +printoutput(); 
+printsrclist(); diff --git a/contrib/snowball/libstemmer/modules.txt b/contrib/snowball/libstemmer/modules.txt new file mode 100644 index 0000000..f6dcc7e --- /dev/null +++ b/contrib/snowball/libstemmer/modules.txt @@ -0,0 +1,58 @@ +# This file contains a list of stemmers to include in the distribution. +# The format is a set of space separated lines - on each line: +# First item is name of stemmer. +# Second item is comma separated list of character sets. +# Third item is comma separated list of names to refer to the stemmer by. +# +# Lines starting with a #, or blank lines, are ignored. + +# List all the main algorithms for each language, in UTF-8, and also with +# the most commonly used encoding. + +arabic UTF_8 arabic,ar,ara +danish UTF_8 danish,da,dan +dutch UTF_8 dutch,nl,dut,nld +english UTF_8 english,en,eng +finnish UTF_8 finnish,fi,fin +french UTF_8 french,fr,fre,fra +german UTF_8 german,de,ger,deu +greek UTF_8 greek,el,gre,ell +hindi UTF_8 hindi,hi,hin +hungarian UTF_8 hungarian,hu,hun +indonesian UTF_8 indonesian,id,ind +italian UTF_8 italian,it,ita +lithuanian UTF_8 lithuanian,lt,lit +nepali UTF_8 nepali,ne,nep +norwegian UTF_8 norwegian,no,nor +portuguese UTF_8 portuguese,pt,por +romanian UTF_8 romanian,ro,rum,ron +russian UTF_8 russian,ru,rus +serbian UTF_8 serbian,sr,srp +spanish UTF_8 spanish,es,esl,spa +swedish UTF_8 swedish,sv,swe +tamil UTF_8 tamil,ta,tam +turkish UTF_8 turkish,tr,tur + +# Also include the traditional porter algorithm for english. +# The porter algorithm is included in the libstemmer distribution to assist +# with backwards compatibility, but for new systems the english algorithm +# should be used in preference. +porter UTF_8 porter english + +# Some other stemmers in the snowball project are not included in the standard +# distribution. To compile a libstemmer with them in, add them to this list, +# and regenerate the distribution. (You will need a full source checkout for +# this.) 
They are included in the snowball website as curiosities, but are not +# intended for general use, and use of them is not fully supported. These +# algorithms are: +# +# german2 - This is a slight modification of the german stemmer. +#german2 UTF_8,ISO_8859_1 german2 german +# +# kraaij_pohlmann - This is a different dutch stemmer. +#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch +# +# lovins - This is an english stemmer, but fairly outdated, and +# only really applicable to a restricted type of input text +# (keywords in academic publications). +#lovins UTF_8,ISO_8859_1 lovins english diff --git a/contrib/snowball/libstemmer/modules_utf8.txt b/contrib/snowball/libstemmer/modules_utf8.txt new file mode 100644 index 0000000..60a0e1d --- /dev/null +++ b/contrib/snowball/libstemmer/modules_utf8.txt @@ -0,0 +1,49 @@ +# This file contains a list of stemmers to include in the distribution. +# The format is a set of space separated lines - on each line: +# First item is name of stemmer. +# Second item is comma separated list of character sets. +# Third item is comma separated list of names to refer to the stemmer by. +# +# Lines starting with a #, or blank lines, are ignored. + +# List all the main algorithms for each language, in UTF-8. + +danish UTF_8 danish,da,dan +dutch UTF_8 dutch,nl,dut,nld +english UTF_8 english,en,eng +finnish UTF_8 finnish,fi,fin +french UTF_8 french,fr,fre,fra +german UTF_8 german,de,ger,deu +hungarian UTF_8 hungarian,hu,hun +italian UTF_8 italian,it,ita +norwegian UTF_8 norwegian,no,nor +portuguese UTF_8 portuguese,pt,por +romanian UTF_8 romanian,ro,rum,ron +russian UTF_8 russian,ru,rus +spanish UTF_8 spanish,es,esl,spa +swedish UTF_8 swedish,sv,swe +turkish UTF_8 turkish,tr,tur + +# Also include the traditional porter algorithm for english. +# The porter algorithm is included in the libstemmer distribution to assist +# with backwards compatibility, but for new systems the english algorithm +# should be used in preference. 
+porter UTF_8 porter + +# Some other stemmers in the snowball project are not included in the standard +# distribution. To compile a libstemmer with them in, add them to this list, +# and regenerate the distribution. (You will need a full source checkout for +# this.) They are included in the snowball website as curiosities, but are not +# intended for general use, and use of them is not fully supported. These +# algorithms are: +# +# german2 - This is a slight modification of the german stemmer. +#german2 UTF_8 german2 +# +# kraaij_pohlmann - This is a different dutch stemmer. +#kraaij_pohlmann UTF_8 kraaij_pohlmann +# +# lovins - This is an english stemmer, but fairly outdated, and +# only really applicable to a restricted type of input text +# (keywords in academic publications). +#lovins UTF_8 lovins |