Adding upstream version 3.8.1.upstream/3.8.1 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 21:30:40 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 21:30:40 +0000
commit: 133a45c109da5310add55824db21af5239951f93 (patch)
tree: ba6ac4c0a950a0dda56451944315d66409923918 /contrib/snowball/algorithms/hindi.sbl
parent: Initial commit. (diff)
download: rspamd-133a45c109da5310add55824db21af5239951f93.tar.xz
rspamd-133a45c109da5310add55824db21af5239951f93.zip
1 files changed, 323 insertions, 0 deletions
diff --git a/contrib/snowball/algorithms/hindi.sbl b/contrib/snowball/algorithms/hindi.sbl
new file mode 100644
index 0000000..bfdfac0
--- /dev/null
+++ b/contrib/snowball/algorithms/hindi.sbl
@@ -0,0 +1,323 @@
+// An implementation of "A Lightweight Stemmer for Hindi":
+// http://www.kbcs.in/downloads/papers/StmmerHindi.pdf
+
+// Expose `stem` as the routine that can be called from outside this script.
+externals ( stem )
+
+// Use { and } as the escape delimiters inside string literals, so that
+// sequences such as '{U+0905}' and '{name}' (a stringdef name) are expanded
+// rather than taken literally.
+stringescapes {}
+
+// The transliteration scheme used for our stringdefs matches that used in the
+// paper, as documented in the appendix.  It appears to match the WX notation
+// (https://en.wikipedia.org/wiki/WX_notation) except that WX apparently
+// uses 'z' for Anunasika whereas the paper uses Mh.
+//
+// We discriminate dependent vowels by adding a leading "_" to their stringdef
+// names (mnemonic: the _ signifies removing the implicit a from the preceding
+// character).
+
+// Each stringdef below binds a short transliterated name to a single
+// Devanagari code point.  The name can then be written as {name} inside any
+// string literal in the rest of this file (groupings and suffix lists).
+
+// Vowels and sonorants:
+stringdef a  '{U+0905}'
+stringdef A  '{U+0906}'
+stringdef i  '{U+0907}'
+stringdef I  '{U+0908}'
+stringdef u  '{U+0909}'
+stringdef U  '{U+090A}'
+stringdef q  '{U+090B}'
+stringdef e  '{U+090F}'
+stringdef E  '{U+0910}'
+stringdef o  '{U+0913}'
+stringdef O  '{U+0914}'
+
+// Vowel signs:
+stringdef _A '{U+093E}'
+stringdef _i '{U+093F}'
+stringdef _I '{U+0940}'
+stringdef _u '{U+0941}'
+stringdef _U '{U+0942}'
+stringdef _q '{U+0943}'
+stringdef _e '{U+0947}'
+stringdef _E '{U+0948}'
+stringdef _o '{U+094B}'
+stringdef _O '{U+094C}'
+
+// Diacritics:
+stringdef M  '{U+0902}'
+stringdef H  '{U+0903}'
+stringdef Mh '{U+0901}'
+stringdef Z  '{U+093C}' // Nukta
+stringdef virama '{U+094D}'
+
+// Velar consonants:
+stringdef k  '{U+0915}'
+stringdef K  '{U+0916}'
+stringdef g  '{U+0917}'
+stringdef G  '{U+0918}'
+stringdef f  '{U+0919}'
+
+// Palatal consonants:
+stringdef c  '{U+091A}'
+stringdef C  '{U+091B}'
+stringdef j  '{U+091C}'
+stringdef J  '{U+091D}'
+stringdef F  '{U+091E}'
+
+// Retroflex consonants:
+stringdef t  '{U+091F}'
+stringdef T  '{U+0920}'
+stringdef d  '{U+0921}'
+stringdef D  '{U+0922}'
+stringdef N  '{U+0923}'
+
+// Dental consonants:
+stringdef w  '{U+0924}'
+stringdef W  '{U+0925}'
+stringdef x  '{U+0926}'
+stringdef X  '{U+0927}'
+stringdef n  '{U+0928}'
+
+// Labial consonants:
+stringdef p  '{U+092A}'
+stringdef P  '{U+092B}'
+stringdef b  '{U+092C}'
+stringdef B  '{U+092D}'
+stringdef m  '{U+092E}'
+
+// Semi-vowels:
+stringdef y  '{U+092F}'
+stringdef r  '{U+0930}'
+stringdef l  '{U+0932}'
+stringdef v  '{U+0935}'
+
+// Fricatives:
+stringdef S  '{U+0936}'
+stringdef R  '{U+0937}'
+stringdef s  '{U+0938}'
+stringdef h  '{U+0939}'
+
+stringdef lY '{U+0933}'
+
+// Precomposed characters - letters + nukta:
+stringdef nZ '{U+0929}' // ≡ {n}{Z}
+stringdef rZ '{U+0931}' // ≡ {r}{Z}
+stringdef lYZ '{U+0934}' // ≡ {lY}{Z}
+stringdef kZ '{U+0958}' // ≡ {k}{Z}
+stringdef KZ '{U+0959}' // ≡ {K}{Z}
+stringdef gZ '{U+095A}' // ≡ {g}{Z}
+stringdef jZ '{U+095B}' // ≡ {j}{Z}
+stringdef dZ '{U+095C}' // ≡ {d}{Z}
+stringdef DZ '{U+095D}' // ≡ {D}{Z}
+stringdef PZ '{U+095E}' // ≡ {P}{Z}
+stringdef yZ '{U+095F}' // ≡ {y}{Z}
+
+// p holds the position just after the first character of the word; it is set
+// at the start of stem and used there as the limit for suffix matching.
+integers ( p )
+
+// The set of characters treated as consonants (defined below).
+groupings ( consonant )
+
+// Backward-mode consonant check attached to some among entries in stem
+// (defined below).
+routines ( CONSONANT )
+
+// Characters which count as a consonant for the CONSONANT routine: every
+// consonant letter declared by the stringdefs above, the combining nukta
+// sign (so that a letter followed by a separate nukta still qualifies), and
+// the precomposed letter+nukta code points.
+define consonant '{k}{K}{g}{G}{f}' +
+                 '{c}{C}{j}{J}{F}' +
+                 '{t}{T}{d}{D}{N}' +
+                 '{w}{W}{x}{X}{n}' +
+                 '{p}{P}{b}{B}{m}' +
+                 '{y}{r}{l}{v}' +
+                 '{S}{R}{s}{h}' +
+                 '{lY}' +
+                 '{Z}' + // Nukta
+                 // Precomposed characters - letter and nukta:
+                 '{nZ}{rZ}{lYZ}{kZ}{KZ}{gZ}{jZ}{dZ}{DZ}{PZ}{yZ}'
+
+// Backward-mode routine: succeeds when the character immediately before the
+// cursor is a member of the consonant grouping.  It is attached to the among
+// entries in stem whose suffix starts with an implicit "a".
+backwardmode ( define CONSONANT as ( consonant ) )
+
+// Entry point: delete the longest suffix from the list below found at the
+// end of the word, provided that at least the first character of the word is
+// left in place.
+define stem as (
+    // Record in p the position just after the first character.  Wrapping this
+    // in test restores the cursor afterwards; if the word is empty, next
+    // fails and so does stem.
+    test ( next setmark p )
+    backwards (
+        // We assume in this implementation that the whole word doesn't count
+        // as a valid suffix to remove, so we remove the longest suffix from
+        // the list which leaves at least one character.  This change affects
+        // 47 words out of the 65,140 in the sample vocabulary from Hindi
+        // wikipedia.
+        //
+        // The limit at p stops matching from reaching the first character;
+        // [ and ] bracket the matched suffix for the delete below.
+        setlimit tomark p for ([substring])
+        among (
+            // The list below is derived from figure 3 in the paper.
+            //
+            // We perform the stemming on the Devanagari characters rather than
+            // transliterating to Latin, so we have adapted the list below to
+            // reflect this by converting suffixes back to Devanagari as
+            // follows:
+            //
+            // * within the suffixes, "a" after a consonant is dropped since
+            //   consonants have an implicit "a".
+            //
+            // * within the suffixes, a vowel other than "a" after a consonant
+            //   is a dependent vowel (vowel sign); a vowel (including "a")
+            //   after a non-consonant is an independent vowel.
+            //
+            // * to allow the vowel at the start of each suffix being dependent
+            //   or independent, we include each suffix twice.  For the
+            //   dependent version, a leading "a" is dropped and we check that
+            //   the suffix is preceded by a consonant (which will have an
+            //   implicit "a").
+            //
+            // * we add '{a}', which is needed for the example given right at
+            //   the end of section 5 to work (conflating BarawIya and
+            //   BarawIyawA), and which 3.1 a.v strongly suggests should be in
+            //   the list:
+            //
+            //     Thus, the following suffix deletions (longest possible
+            //     match) are required to reduce inflected forms of masculine
+            //     nouns to a common stem:
+            //     a A i [...]
+            //
+            //   Adding '{a}' only affects 2 words out of the 65,140 in the
+            //   sample vocabulary.
+            //
+            // * The transliterations of our stems would end with "a" when our
+            //   stems end in a consonant, so we also include {virama} in the
+            //   list of suffixes to remove (this affects 222 words from the
+            //   sample vocabulary).
+            //
+            // We've also assumed that Mh in the suffix list always means {Mh}
+            // and never {M}{h}{virama}.  Only one of the 65,140 words in the
+            // sample vocabulary stems differently due to this (and that word
+            // seems to be a typo).
+
+            '{virama}'
+
+            // Suffixes starting with an independent vowel:
+            '{a}'
+            '{A}'
+            '{i}'
+            '{I}'
+            '{u}'
+            '{U}'
+            '{e}'
+            '{o}'
+            '{e}{M}'
+            '{o}{M}'
+            '{A}{M}'
+            '{u}{A}{M}'
+            '{u}{e}{M}'
+            '{u}{o}{M}'
+            '{A}{e}{M}'
+            '{A}{o}{M}'
+            '{i}{y}{_A}{M}'
+            '{i}{y}{_o}{M}'
+            '{A}{i}{y}{_A}{M}'
+            '{A}{i}{y}{_o}{M}'
+            '{A}{Mh}'
+            '{i}{y}{_A}{Mh}'
+            '{A}{i}{y}{_A}{Mh}'
+            '{a}{w}{_A}{e}{M}'
+            '{a}{w}{_A}{o}{M}'
+            '{a}{n}{_A}{e}{M}'
+            '{a}{n}{_A}{o}{M}'
+            '{a}{w}{_A}'
+            '{a}{w}{_I}'
+            '{I}{M}'
+            '{a}{w}{_I}{M}'
+            '{a}{w}{_e}'
+            '{A}{w}{_A}'
+            '{A}{w}{_I}'
+            '{A}{w}{_I}{M}'
+            '{A}{w}{_e}'
+            '{a}{n}{_A}'
+            '{a}{n}{_I}'
+            '{a}{n}{_e}'
+            '{A}{n}{_A}'
+            '{A}{n}{_e}'
+            '{U}{M}{g}{_A}'
+            '{U}{M}{g}{_I}'
+            '{A}{U}{M}{g}{_A}'
+            '{A}{U}{M}{g}{_I}'
+            '{e}{M}{g}{_e}'
+            '{e}{M}{g}{_I}'
+            '{A}{e}{M}{g}{_e}'
+            '{A}{e}{M}{g}{_I}'
+            '{o}{g}{_e}'
+            '{o}{g}{_I}'
+            '{A}{o}{g}{_e}'
+            '{A}{o}{g}{_I}'
+            '{e}{g}{_A}'
+            '{e}{g}{_I}'
+            '{A}{e}{g}{_A}'
+            '{A}{e}{g}{_I}'
+            '{A}{y}{_A}'
+            '{A}{e}'
+            '{A}{I}'
+            '{A}{I}{M}'
+            '{i}{e}'
+            '{A}{o}'
+            '{A}{i}{e}'
+            '{a}{k}{r}'
+            '{A}{k}{r}'
+
+            // Suffixes starting with a dependent vowel (vowel sign):
+            '{_A}'
+            '{_i}'
+            '{_I}'
+            '{_u}'
+            '{_U}'
+            '{_e}'
+            '{_o}'
+            '{_e}{M}'
+            '{_o}{M}'
+            '{_A}{M}'
+            '{_u}{A}{M}'
+            '{_u}{e}{M}'
+            '{_u}{o}{M}'
+            '{_A}{e}{M}'
+            '{_A}{o}{M}'
+            '{_i}{y}{_A}{M}'
+            '{_i}{y}{_o}{M}'
+            '{_A}{i}{y}{_A}{M}'
+            '{_A}{i}{y}{_o}{M}'
+            '{_A}{Mh}'
+            '{_i}{y}{_A}{Mh}'
+            '{_A}{i}{y}{_A}{Mh}'
+            '{_I}{M}'
+            '{_A}{w}{_A}'
+            '{_A}{w}{_I}'
+            '{_A}{w}{_I}{M}'
+            '{_A}{w}{_e}'
+            '{_A}{n}{_A}'
+            '{_A}{n}{_e}'
+            '{_U}{M}{g}{_A}'
+            '{_U}{M}{g}{_I}'
+            '{_A}{U}{M}{g}{_A}'
+            '{_A}{U}{M}{g}{_I}'
+            '{_e}{M}{g}{_e}'
+            '{_e}{M}{g}{_I}'
+            '{_A}{e}{M}{g}{_e}'
+            '{_A}{e}{M}{g}{_I}'
+            '{_o}{g}{_e}'
+            '{_o}{g}{_I}'
+            '{_A}{o}{g}{_e}'
+            '{_A}{o}{g}{_I}'
+            '{_e}{g}{_A}'
+            '{_e}{g}{_I}'
+            '{_A}{e}{g}{_A}'
+            '{_A}{e}{g}{_I}'
+            '{_A}{y}{_A}'
+            '{_A}{e}'
+            '{_A}{I}'
+            '{_A}{I}{M}'
+            '{_i}{e}'
+            '{_A}{o}'
+            '{_A}{i}{e}'
+            '{_A}{k}{r}'
+
+            /* Suffixes with a leading implicit a: */
+            // Each of these only counts as a match if CONSONANT also
+            // succeeds, i.e. the suffix is preceded by a consonant.
+            '{w}{_A}{e}{M}' CONSONANT
+            '{w}{_A}{o}{M}' CONSONANT
+            '{n}{_A}{e}{M}' CONSONANT
+            '{n}{_A}{o}{M}' CONSONANT
+            '{w}{_A}' CONSONANT
+            '{w}{_I}' CONSONANT
+            '{w}{_I}{M}' CONSONANT
+            '{w}{_e}' CONSONANT
+            '{n}{_A}' CONSONANT
+            '{n}{_I}' CONSONANT
+            '{n}{_e}' CONSONANT
+            '{k}{r}' CONSONANT
+        )
+        // Remove the bracketed suffix found above.
+        delete
+    )
+)
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-10 21:30:40 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-10 21:30:40 +0000
commit	133a45c109da5310add55824db21af5239951f93 (patch)
tree	ba6ac4c0a950a0dda56451944315d66409923918 /contrib/snowball/algorithms/hindi.sbl
parent	Initial commit. (diff)
download	rspamd-133a45c109da5310add55824db21af5239951f93.tar.xz rspamd-133a45c109da5310add55824db21af5239951f93.zip