diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 21:30:40 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 21:30:40 +0000 |
commit | 133a45c109da5310add55824db21af5239951f93 (patch) | |
tree | ba6ac4c0a950a0dda56451944315d66409923918 /contrib/snowball/algorithms/russian.sbl | |
parent | Initial commit. (diff) | |
download | rspamd-upstream.tar.xz rspamd-upstream.zip |
Adding upstream version 3.8.1. upstream/3.8.1 upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'contrib/snowball/algorithms/russian.sbl')
-rw-r--r-- | contrib/snowball/algorithms/russian.sbl | 221 |
1 files changed, 221 insertions, 0 deletions
diff --git a/contrib/snowball/algorithms/russian.sbl b/contrib/snowball/algorithms/russian.sbl
new file mode 100644
index 0000000..20de639
--- /dev/null
+++ b/contrib/snowball/algorithms/russian.sbl
@@ -0,0 +1,221 @@
+stringescapes {}  // '{' and '}' delimit the named escapes used in the literals below
+
+/* the 33 Cyrillic letters represented in ASCII characters following the
+ * conventions of the standard Library of Congress transliteration: */
+
+stringdef a    '{U+0430}'
+stringdef b    '{U+0431}'
+stringdef v    '{U+0432}'
+stringdef g    '{U+0433}'
+stringdef d    '{U+0434}'
+stringdef e    '{U+0435}'
+stringdef e"   '{U+0451}'
+stringdef zh   '{U+0436}'
+stringdef z    '{U+0437}'
+stringdef i    '{U+0438}'
+stringdef i`   '{U+0439}'
+stringdef k    '{U+043A}'
+stringdef l    '{U+043B}'
+stringdef m    '{U+043C}'
+stringdef n    '{U+043D}'
+stringdef o    '{U+043E}'
+stringdef p    '{U+043F}'
+stringdef r    '{U+0440}'
+stringdef s    '{U+0441}'
+stringdef t    '{U+0442}'
+stringdef u    '{U+0443}'
+stringdef f    '{U+0444}'
+stringdef kh   '{U+0445}'
+stringdef ts   '{U+0446}'
+stringdef ch   '{U+0447}'
+stringdef sh   '{U+0448}'
+stringdef shch '{U+0449}'
+stringdef "    '{U+044A}'
+stringdef y    '{U+044B}'
+stringdef '    '{U+044C}'
+stringdef e`   '{U+044D}'
+stringdef iu   '{U+044E}'
+stringdef ia   '{U+044F}'
+
+routines ( mark_regions R2
+           perfective_gerund
+           adjective
+           adjectival
+           reflexive
+           verb
+           noun
+           derivational
+           tidy_up
+)
+
+externals ( stem )  // stem is the only routine declared external
+
+integers ( pV p2 )  // region marks, assigned in mark_regions
+
+groupings ( v )
+
+define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}'  // the vowel grouping
+
+define mark_regions as (
+
+    $pV = limit  // default kept if the do-block below fails before setmark pV
+    $p2 = limit  // default kept if the do-block below fails before setmark p2
+    do (
+        gopast v  setmark pV  gopast non-v     // pV: just past the first vowel
+        gopast v  gopast non-v  setmark p2     // p2: after a further vowel and then a non-vowel
+       )
+)
+
+backwardmode (  // the routines in this section are defined in backward mode
+
+    define R2 as $p2 <= cursor  // succeeds when the cursor is at or after p2
+
+    define perfective_gerund as (
+        [substring] among (
+            '{v}'
+            '{v}{sh}{i}'
+            '{v}{sh}{i}{s}{'}'
+                ('{a}' or '{ia}' delete)  // delete only if preceded by {a} or {ia}
+            '{i}{v}'
+            '{i}{v}{sh}{i}'
+            '{i}{v}{sh}{i}{s}{'}'
+            '{y}{v}'
+            '{y}{v}{sh}{i}'
+            '{y}{v}{sh}{i}{s}{'}'
+                (delete)  // delete unconditionally
+        )
+    )
+
+    define adjective as (
+        [substring] among (
+            '{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}'
+            '{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}'
+            '{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}'
+            '{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}'
+            '{ia}{ia}'
+                        // and -
+            '{o}{iu}'   // - which is somewhat archaic
+            '{e}{iu}'   // - soft form of {o}{iu}
+                (delete)
+        )
+    )
+
+    define adjectival as (
+        adjective
+
+        /* of the participle forms, em, vsh, ivsh, yvsh are readily removable.
+           nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of
+           errors. Removing im, uem, enn creates too many errors.
+        */
+
+        try (
+            [substring] among (
+                '{e}{m}'                  // present passive participle
+                '{n}{n}'                  // adjective from past passive participle
+                '{v}{sh}'                 // past active participle
+                '{iu}{shch}' '{shch}'     // present active participle
+                    ('{a}' or '{ia}' delete)  // delete only if preceded by {a} or {ia}
+
+     //but not  '{i}{m}' '{u}{e}{m}'      // present passive participle
+     //or       '{e}{n}{n}'               // adjective from past passive participle
+
+                '{i}{v}{sh}' '{y}{v}{sh}'// past active participle
+                '{u}{iu}{shch}'           // present active participle
+                    (delete)
+            )
+        )
+
+    )
+
+    define reflexive as (
+        [substring] among (
+            '{s}{ia}'
+            '{s}{'}'
+                (delete)
+        )
+    )
+
+    define verb as (
+        [substring] among (
+            '{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}'
+            '{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}'
+            '{n}{y}' '{t}{'}' '{e}{sh}{'}'
+
+            '{n}{n}{o}'
+                ('{a}' or '{ia}' delete)  // delete only if preceded by {a} or {ia}
+
+            '{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}'
+            '{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}'
+            '{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}'
+            '{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}'
+            '{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}'
+            '{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}'
+                (delete)
+            /* note the short passive participle tests:
+            '{n}{a}' '{n}' '{n}{o}' '{n}{y}'
+            '{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}'
+            */
+        )
+    )
+
+    define noun as (
+        [substring] among (
+            '{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}'
+            '{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}'
+            '{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}'
+            '{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}'
+            '{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}'
+            '{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}'
+                (delete)
+            /* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}'
+               '{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{kh}'
+               omitted - they only occur on 12 words.
+            */
+        )
+    )
+
+    define derivational as (
+        [substring] R2 among (  // the matched ending must start at or after p2
+            '{o}{s}{t}'
+            '{o}{s}{t}{'}'
+                (delete)
+        )
+    )
+
+    define tidy_up as (
+        [substring] among (
+
+            '{e}{i`}{sh}'
+            '{e}{i`}{sh}{e}'  // superlative forms
+               (delete
+                ['{n}'] '{n}' delete  // then drop one {n} of a remaining final {n}{n}
+               )
+            '{n}'
+               ('{n}' delete) // e.g. -nno endings
+            '{'}'
+               (delete)  // with some slight false conflations
+        )
+    )
+)
+
+define stem as (
+
+    // Normalise {e"} to {e}.  The documentation has long suggested the user
+    // should do this before calling the stemmer - we now do it for them.
+    do repeat ( goto (['{e"}']) <- '{e}' )
+
+    do mark_regions
+    backwards setlimit tomark pV for (  // work from the end of the word, limited at pV
+        do (
+             perfective_gerund or
+             ( try reflexive
+               adjectival or verb or noun
+             )
+        )
+        try([ '{i}' ] delete)
+        // because noun ending -i{iu} is being treated as verb ending -{iu}
+
+        do derivational
+        do tidy_up
+    )
+)
|