diff options
Diffstat (limited to 'contrib/snowball')
59 files changed, 16546 insertions, 0 deletions
diff --git a/contrib/snowball/.gitignore b/contrib/snowball/.gitignore new file mode 100644 index 0000000..2147da8 --- /dev/null +++ b/contrib/snowball/.gitignore @@ -0,0 +1,5 @@ +*.o +/libstemmer +/snowball +/src_c +/stemwords diff --git a/contrib/snowball/.travis.yml b/contrib/snowball/.travis.yml new file mode 100644 index 0000000..e576233 --- /dev/null +++ b/contrib/snowball/.travis.yml @@ -0,0 +1,4 @@ +language: c +compiler: gcc +before_script: git clone https://github.com/snowballstem/snowball-data ../data +script: make check diff --git a/contrib/snowball/AUTHORS b/contrib/snowball/AUTHORS new file mode 100644 index 0000000..60eae6f --- /dev/null +++ b/contrib/snowball/AUTHORS @@ -0,0 +1,27 @@ +Authors +======= + +Martin Porter +------------- + + - Designed the snowball language. + - Implemented the snowball to C compiler. + - Implemented the stemming algorithms in C. + - Wrote the documentation. + +Richard Boulton +--------------- + + - Implemented Java backend of the snowball compiler. + - Developed build system. + - Assisted with website maintenance. + + +Assistance from +--------------- + +Olivier Bornet - fixes to java packaging and build system. +Andreas Jung - useful bug reports on the libstemmer library. +Olly Betts - several patches, bug reports, and performance improvements. +Sebastiano Vigna and Oerd Cukalla - patches for the Java stemming algorithms. +Ralf Junker - fix a potential memory leak in sb_stemmer_new(). 
diff --git a/contrib/snowball/CMakeLists.txt b/contrib/snowball/CMakeLists.txt new file mode 100644 index 0000000..7ee961e --- /dev/null +++ b/contrib/snowball/CMakeLists.txt @@ -0,0 +1,70 @@ +# End of configuration +SET(LIBSTEM_ALGORITHMS arabic danish dutch english finnish french german greek hindi hungarian + indonesian italian lithuanian nepali norwegian porter portuguese romanian + russian serbian spanish swedish tamil turkish) +SET(ALL_ALGORITHMS ${LIBSTEM_ALGORITHMS}) + +SET(COMPILER_SOURCES compiler/space.c + compiler/tokeniser.c + compiler/analyser.c + compiler/generator.c + compiler/driver.c) + +SET(SNOWBALL_RUNTIME runtime/api.c + runtime/utilities.c) +SET(LIBSTEMMER_SOURCES libstemmer/libstemmer.c) +SET(LIBSTEMMER_UTF8_SOURCES libstemmer/libstemmer_utf8.c) +#LIBSTEMMER_UTF8_SOURCES = libstemmer/libstemmer_utf8.c +#LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h +#LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer/libstemmer_c.in + +SET(MODULES_H "modules.h") +CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/libstemmer_c.in ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/libstemmer.c @ONLY) +ADD_DEFINITIONS("-DDISABLE_JS") +ADD_DEFINITIONS("-DDISABLE_GO") +ADD_DEFINITIONS("-DDISABLE_JAVA") +ADD_DEFINITIONS("-DDISABLE_PYTHON") +ADD_DEFINITIONS("-DDISABLE_CSHARP") +ADD_DEFINITIONS("-DDISABLE_PASCAL") +ADD_DEFINITIONS("-DDISABLE_RUST") + +MACRO(gen_stem IN ENCODING) + FOREACH(_it ${IN}) + SET(_base "${CMAKE_CURRENT_BINARY_DIR}/libstemmer/stem_${ENCODING}_${_it}") + SET(_header "${_base}.h") + SET(_source "${_base}.c") + STRING(REPLACE "UTF_8" "Unicode" _in_enc "${ENCODING}") + SET(_input "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}.sbl") + IF(${_in_enc} STREQUAL "Unicode" AND NOT EXISTS ${_input}) + ADD_CUSTOM_COMMAND(OUTPUT ${_source} + COMMAND env "ASAN_OPTIONS=detect_leaks=0" ${CMAKE_CURRENT_BINARY_DIR}/snowball "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}/stem_ISO_8859_1.sbl" -o 
${_base} -eprefix ${_it}_${ENCODING}_ -r ../runtime -u + DEPENDS snowball) + LIST(APPEND STEMMER_SOURCES ${_source}) + + ELSE() + IF(EXISTS "${_input}") + ADD_CUSTOM_COMMAND(OUTPUT ${_source} + COMMAND env "ASAN_OPTIONS=detect_leaks=0" ${CMAKE_CURRENT_BINARY_DIR}/snowball ${_input} -o ${_base} -eprefix ${_it}_${ENCODING}_ -r ../runtime -u + DEPENDS snowball) + LIST(APPEND STEMMER_SOURCES ${_source}) + ENDIF() + ENDIF() + ENDFOREACH() +ENDMACRO() + +INCLUDE_DIRECTORIES("include") + +ADD_EXECUTABLE(snowball ${COMPILER_SOURCES}) + +ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h + COMMAND ${PERL_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/mkmodules.pl ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h libstemmer ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/modules.txt ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/mkinc.mak) +ADD_CUSTOM_TARGET(modules DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h") + +SET(STEMMER_SOURCES "${CMAKE_CURRENT_BINARY_DIR}/libstemmer/libstemmer.c") +ADD_CUSTOM_TARGET(stemmer_deps ALL) +ADD_DEPENDENCIES(stemmer_deps modules) + +gen_stem("${LIBSTEM_ALGORITHMS}" "UTF_8") + +ADD_LIBRARY(stemmer ${LINK_TYPE} ${SNOWBALL_RUNTIME} ${STEMMER_SOURCES}) +ADD_DEPENDENCIES(stemmer stemmer_deps) diff --git a/contrib/snowball/NEWS b/contrib/snowball/NEWS new file mode 100644 index 0000000..c71c12d --- /dev/null +++ b/contrib/snowball/NEWS @@ -0,0 +1,407 @@ +Snowball 2.0.0 (2019-10-02) +=========================== + +C/C++ +----- + +* Fully handle 4-byte UTF-8 sequences. Previously `hop` and `next` handled + sequences of any length, but commands which look at the character value only + handled sequences up to length 3. Fixes #89. + +* Fix handling of a 3-byte UTF-8 sequence in a grouping in `backwardmode`. + +Java +---- + +* TestApp.java: + + - Always use UTF-8 for I/O. Patch from David Corbett (#80). + + - Allow reading input from stdin. + + - Remove rather pointless "stem n times" feature. 
+ + - Only lower case ASCII to match stemwords.c. + + - Stem empty lines too to match stemwords.c. + +Code Quality Improvements +------------------------- + +* Fix various warnings from newer compilers. + +* Improve use of `const`. + +* Share common functions between compiler backends rather than having multiple + copies of the same code. + +* Assorted code clean-up. + +* Initialise line_labelled member of struct generator to 0. Previously we were + invoking undefined behaviour, though in practice it'll be zero initialised on + most platforms. + +New Code Generators +------------------- + +* Add Python generator (#24). Originally written by Yoshiki Shibukawa, with + additional updates by Dmitry Shachnev. + +* Add Javascript generator. Based on JSX generator (#26) written by Yoshiki + Shibukawa. + +* Add Rust generator from Jakob Demler (#51). + +* Add Go generator from Marty Schoch (#57). + +* Add C# generator. Based on patch from Cesar Souza (#16, #17). + +* Add Pascal generator. Based on Delphi backend from stemming.zip file on old + website (#75). + +New Language Features +--------------------- + +* Add `len` and `lenof` to measure Unicode length. These are similar to `size` + and `sizeof` (respectively), but `size` and `sizeof` return the length in + bytes under `-utf8`, whereas these new commands give the same result whether + using `-utf8`, `-widechars` or neither (but under `-utf8` they are O(n) in + the length of the string). For compatibility with existing code which might + use these as variable or function names, they stop being treated as tokens if + declared to be a variable or function. + +* New `{U+1234}` stringdef notation for Unicode codepoints. + +* More versatile integer tests. 
Now you can compare any two arithmetic + expressions with a relational operator in parentheses after the `$`, so for + example `$(len > 3)` can now be used when previously a temporary variable was + required: `$tmp = len $tmp > 3` + +Code generation improvements +---------------------------- + +* General: + + + Avoid unnecessarily saving and restoring of the cursor for more commands - + `atlimit`, `do`, `set` and `unset` all leave the cursor alone or always + restore its value, and for C `booltest` (which other languages already + handled). + + + Special case handling for `setlimit tomark AE`. All uses of setlimit in + the current stemmers we ship follow this pattern, and by special-casing we + can avoid having to save and restore the cursor (#74). + + + Merge duplicate actions in the same `among`. This reduces the size of the + switch/if-chain in the generated code which dispatch the among for many of + the stemmers. + + + Generate simpler code for `among`. We always check for a zero return value + when we call the among, so there's no point also checking for that in the + switch/if-chain. We can also avoid the switch/if-chain entirely when + there's only one possible outcome (besides the zero return). + + + Optimise code generated for `do <function call>`. This speeds up "make + check_python" by about 2%, and should speed up other interpreted languages + too (#110). + + + Generate more and better comments referencing snowball source. + + + Add homepage URL and compiler version as comments in generated files. + +* C/C++: + + + Fix `size` and `sizeof` to not report one too high (reported by Assem + Chelli in #32). + + + If signal `f` from a function call would lead to return from the current + function then handle this and bailing out on an error together with a + simple `if (ret <= 0) return ret;` + + + Inline testing for a single character literals. 
+ + + Avoid generating `|| 0` in a corner case - this can result in a compiler + warning when building the generated code. + + + Implement `insert_v()` in terms of `insert_s()`. + + + Add conditional `extern "C"` so `runtime/api.h` can be included from C++ + code. Closes #90, reported by vvarma. + +* Java: + + + Fix functions in `among` to work in Java. We seem to need to make the + methods called from among `public` instead of `private`, and to call them + on `this` instead of the `methodObject` (which is cleaner anyway). No + revision in version control seems to generate working code for this case, + but Richard says it definitely used to work - possibly older JVMs failed to + correctly enforce the access controls when methods were invoked by + reflection. + + + Code after handling `f` by returning from the current function is + unreachable too. + + + Previously we incorrectly decided that code after an `or` was + unreachable in certain cases. None of the current stemmers in the + distribution triggered this, but Martin Porter's snowball version + of the Schinke Latin stemmer does. Fixes #58, reported by Alexander + Myltsev. + + + The reachability logic was failing to consider reachability from + the final command in an `or`. Fixes #82, reported by David Corbett. + + + Fix `maxint` and `minint`. Patch from David Corbett in #31. + + + Fix `$` on strings. The previous generated code was just wrong. This + doesn't affect any of the included algorithms, but for example breaks + Martin Porter's snowball implementation of Schinke's Latin Stemmer. + Issue noted by Jakob Demler while working on the Rust backend in #51, + and reported in the Schinke's Latin Stemmer by Alexander Myltsev + in #58. + + + Make SnowballProgram objects serializable. Patch from Oleg Smirnov in #43. + + + Eliminate range-check implementation for groupings. 
This was removed from + the C generator 10 years earlier, isn't used for any of the existing + algorithms, and it doesn't seem likely it would be - the grouping would + have to consist entirely of a contiguous block of Unicode code-points. + + + Simplify code generated for `repeat` and `atleast`. + + + Eliminate unused return values and variables from runtime functions. + + + Only import the `among` and `SnowballProgram` classes if they're actually + used. + + + Only generate `copy_from()` method if it's used. + + + Merge runtime functions `eq_s` and `eq_v`. + + + Java arrays know their own length so stop storing it separately. + + + Escape char 127 (DEL) in generated Java code. It's unlikely that this + character would actually be used in a real stemmer, so this was more of a + theoretical bug. + + + Drop unused import of InvocationTargetException from SnowballStemmer. + Reported by GerritDeMeulder in #72. + + + Fix lint check issues in generated Java code. The stemmer classes are only + referenced in the example app via reflection, so add + @SuppressWarnings("unused") for them. The stemmer classes override + equals() and hashCode() methods from the standard java Object class, so + mark these with @Override. Both suggested by GerritDeMeulder in #72. + + + Declare Java variables at point of use in generated code. Putting all + declarations at the top of the function was adding unnecessary complexity + to the Java generator code for no benefit. + + + Improve formatting of generated code. + +New stemming algorithms +----------------------- + +* Add Tamil stemmer from Damodharan Rajalingam (#2, #3). + +* Add Arabic stemmer from Assem Chelli (#32, #50). + +* Add Irish stemmer from Jim O'Regan (#48). + +* Add Nepali stemmer from Arthur Zakirov (#70). + +* Add Indonesian stemmer from Olly Betts (#71). + +* Add Hindi stemmer from Olly Betts (#73). Thanks to David Corbett for review. + +* Add Lithuanian stemmer from Dainius Jocas (#22, #76). 
+ +* Add Greek stemmer from Oleg Smirnov (#44). + +* Add Catalan and Basque stemmers from Israel Olalla (#104). + +Behavioural changes to existing algorithms +------------------------------------------ + +* Portuguese: + + + Replace incorrect Spanish suffixes by Portuguese suffixes (#1). + +* French: + + + The MSDOS CP850 version of the French algorithm was missing changes present + in the ISO8859-1 and Unicode versions. There's now a single version of + each algorithm which was based on the Unicode version. + + + Recognize French suffixes even when they begin with diaereses. Patch from + David Corbett in #78. + +* Russian: + + + We now normalise 'ё' to 'е' before stemming. The documentation has long + said "we assume ['ё'] is mapped into ['е']" but it's more convenient for + the stemmer to actually perform this normalisation. This change has no + effect if the caller is already normalising as we recommend. It's a change + in behaviour if they aren't, but 'ё' occurs rarely (there are currently no + instances in our test vocabulary) and this improves behaviour when it does + occur. Patch from Eugene Mirotin (#65, #68). + +* Finnish: + + + Adjust the Finnish algorithm not to mangle numbers. This change also + means it tends to leave foreign words alone. Fixes #66. + +* Danish: + + + Adjust Danish algorithm not to mangle alphanumeric codes. In particular + alphanumeric codes ending in a double digit (e.g. 0x0e00, hal9000, + space1999) are no longer mangled. See #81. + +Optimisations to existing algorithms +------------------------------------ + +* Turkish: + + + Simplify uses of `test` in stemmer code. + + + Check for 'ad' or 'soyad' more efficiently, and without needing the + strlen variable. This speeds up "make check_utf8_turkish" by 11% + on x86 Linux. + +* Kraaij-Pohlmann: + + + Eliminate variable x - `$p1 <= cursor` is simpler and a little more efficient + than `setmark x $x >= p1`. 
+ +Code clarity improvements to existing algorithms +------------------------------------------------ + +* Turkish: + + + Use , for cedilla to match the conventions used in other stemmers. + +* Kraaij-Pohlmann: + + + Avoid cryptic `[among ( (])` ... `)` construct - instead use the same + `[substring] among (` ... `)` construct we do in other stemmers. + +Compiler +-------- + +* Support conventional --help and --version options. + +* Warn if -r or -ep used with backend other than C/C++. + +* Warn if encoding command line options are specified when generating code in a + language with a fixed encoding. + +* The default classname is now set based on the output filename, so `-n` is now + often no longer needed. Fixes #64. + +* Avoid potential one byte buffer over-read when parsing snowball code. + +* Avoid comparing with uninitialised array element during compilation. + +* Improve `-syntax` output for `setlimit L for C`. + +* Optimise away double negation so generators don't have to worry about + generating `--` (decrement operator in many languages). Fixes #52, reported + by David Corbett. + +* Improved compiler error and warning messages: + + - We now report FILE:LINE: before each diagnostic message. + + - Improve warnings for unused declarations/definitions. + + - Warn for variables which are used, but either never initialised + or never read. + + - Flag non-ASCII literal strings. This is an error for wide Unicode, but + only a warning for single-byte and UTF-8 which work so long as the source + encoding matches the encoding used in the generated stemmer code. + + - Improve error recovery after an undeclared `define`. We now sniff the + token after the identifier and if it is `as` we parse as a routine, + otherwise we parse as a grouping. Previously we always just assumed it was + a routine, which gave a confusing second error if it was a grouping. + + - Improve error recovery after an unexpected token in `among`. 
Previously + we acted as if the unexpected token closed the `among` (this probably + wasn't intended but just a missing `break;` in a switch statement). Now we + issue an error and try the next token. + +* Report error instead of silently truncating character values (e.g. `hex 123` + previously silently became byte 0x23 which is `#` rather than a + g-with-cedilla). + +* Enlarge the initial input buffer size to 8192 bytes and double each time we + hit the end. Snowball programs are typically a few KB in size (with the + current largest we ship being the Greek stemmer at 27KB) so the previous + approach of starting with a 10 byte input buffer and increasing its size by + 50% plus 40 bytes each time it filled was inefficient, needing up to 15 + reallocations to load greek.sbl. + +* Identify variables only used by one `routine`/`external`. This information + isn't yet used, but such variables which are also always written to before + being read can be emitted as local variables in most target languages. + +* We now allow multiple source files on the command line, and allow them to be + after (or even interspersed with) options to better match modern Unix + conventions. Support for multiple source files allows specifying a single + byte character set mapping via a source file of `stringdef`. + +* Avoid infinite recursion in compiler when optimising a recursive snowball + function. Recursive functions aren't typical in snowball programs, but + the compiler shouldn't crash for any input, especially not a valid one. + We now simply limit how deep the compiler will recurse and make the + pessimistic assumption in the unlikely event we hit this limit. + +Build system: + +* `make clean` in C libstemmer_c distribution now removes `examples/*.o`. + (#59) + +* Fix all the places which previously had to have a list of stemmers to work + dynamically or be generated, so now only modules.txt needs updating to add + a new stemmer. 
+ +* Add check_java make target which runs tests for java. + +* Support gzipped test data (the uncompressed arabic test data is too big for + github). + +* GNUmakefile: Drop useless `-eprefix` and `-r` options from snowball + invocations for Java - these are only meaningful when generating C code. + +* Pass CFLAGS when linking which matches convention (e.g. automake does it) and + facilitates use of tools such as ASan. Fixes #84, reported by Thomas + Pointhuber. + +* Add CI builds with -std=c90 to check compiler and generated code are C90 + (#54) + +libstemmer stuff: + +* Split out CPPFLAGS from CFLAGS and use CFLAGS when linking stemwords. + +* Add -O2 to CFLAGS. + +* Make generated tables of encodings and modules const. + +* Fix clang static analyzer memory leak warning (in practice this code path + can never actually be taken). Patch from Patrick O. Perry (#56) + +documentation + +* Added copyright and licensing details (#10). + +* Document that libstemmer supports ISO_8859_2 encoding. Currently hungarian + and romanian are available in ISO_8859_2. + +* Remove documentation falsely claiming that libstemmer supports CP850 + encoding. + +* CONTRIBUTING.rst: Add guidance for contributing new stemming algorithms and + new language backends. + +* Overhaul libstemmer_python_README. Most notably, replace the benchmark data + which was very out of date. diff --git a/contrib/snowball/README b/contrib/snowball/README new file mode 100644 index 0000000..afb51b3 --- /dev/null +++ b/contrib/snowball/README @@ -0,0 +1,5 @@ +This contains the source code for the snowball compiler and the stemming +algorithms on the website. + +See http://snowball.tartarus.org/ for more details. 
+ diff --git a/contrib/snowball/algorithms/arabic.sbl b/contrib/snowball/algorithms/arabic.sbl new file mode 100644 index 0000000..d827ee7 --- /dev/null +++ b/contrib/snowball/algorithms/arabic.sbl @@ -0,0 +1,561 @@ +/* + * Authors: + * - Assem Chelli, < assem [dot] ch [at] gmail > + * - Abdelkrim Aries <ab [underscore] aries [at] esi [dot] dz> + * +*/ + +stringescapes { } + +/* the Arabic letters in Unicode */ +// Hamza +stringdef o '{U+0621}' // Hamza +stringdef ao '{U+0623}' // Hamza above Alef +stringdef ao_ '{U+0625}' // Hamza below Alef +stringdef a~ '{U+0622}' // Alef madda +stringdef wo '{U+0624}' // Hamza above waw +stringdef yo '{U+0626}' // Hamza above yeh + +// Letters +stringdef a '{U+0627}' // Alef +stringdef a_ '{U+0649}' // Alef Maksura +stringdef b '{U+0628}' // Beh +stringdef t_ '{U+0629}' // Teh_Marbuta +stringdef t '{U+062A}' // Teh +stringdef th '{U+062B}' // Theh +stringdef j '{U+062C}' // Jeem +stringdef h '{U+062D}' // Hah +stringdef x '{U+062E}' // Khah +stringdef d '{U+062F}' // Dal +stringdef dz '{U+0630}' // Thal +stringdef r '{U+0631}' // Reh +stringdef z '{U+0632}' // Zain +stringdef s '{U+0633}' // Seen +stringdef sh '{U+0634}' // Sheen +stringdef c '{U+0635}' // Sad +stringdef dh '{U+0636}' // Dad +stringdef tt '{U+0637}' // Tah +stringdef zh '{U+0638}' // Zah +stringdef i '{U+0639}' // Ain +stringdef gh '{U+063A}' // Ghain +stringdef f '{U+0641}' // Feh +stringdef q '{U+0642}' // Qaf +stringdef k '{U+0643}' // Kaf +stringdef l '{U+0644}' // Lam +stringdef m '{U+0645}' // Meem +stringdef n '{U+0646}' // Noon +stringdef e '{U+0647}' // Heh +stringdef w '{U+0648}' // Waw +stringdef y '{U+064A}' // Yeh + +// Diacritics +stringdef aan '{U+064B}' // FatHatan +stringdef uun '{U+064C}' // Dammatan +stringdef iin '{U+064D}' // Kasratan +stringdef aa '{U+064E}' // FatHa +stringdef uu '{U+064F}' // Damma +stringdef ii '{U+0650}' // Kasra +stringdef oo '{U+0652}' // Sukun +stringdef ~ '{U+0651}' // Shadda + +// Hindu–Arabic numerals +stringdef 
0 '{U+0660}' +stringdef 1 '{U+0661}' +stringdef 2 '{U+0662}' +stringdef 3 '{U+0663}' +stringdef 4 '{U+0664}' +stringdef 5 '{U+0665}' +stringdef 6 '{U+0666}' +stringdef 7 '{U+0667}' +stringdef 8 '{U+0668}' +stringdef 9 '{U+0669}' + + +// Kasheeda +stringdef _ '{U+0640}' // Kasheeda, Tatweel + +// Shaped forms +stringdef o1 '{U+FE80}' // HAMZA +stringdef ao1 '{U+FE83}' // ALEF_HAMZA_ABOVE +stringdef ao2 '{U+FE84}' // ALEF_HAMZA_ABOVE +stringdef ao_1 '{U+FE87}' // ALEF_HAMZA_BELOW +stringdef ao_2 '{U+FE88}' // ALEF_HAMZA_BELOW +stringdef yo1 '{U+FE8B}' // YEH_HAMZA +stringdef yo2 '{U+FE8C}' // YEH_HAMZA +stringdef yo3 '{U+FE89}' // YEH_HAMZA +stringdef yo4 '{U+FE8A}' // YEH_HAMZA +stringdef a~1 '{U+FE81}' // ALEF_MADDA +stringdef a~2 '{U+FE82}' // ALEF_MADDA +stringdef wo1 '{U+FE85}' // WAW_HAMZA +stringdef wo2 '{U+FE86}' // WAW_HAMZA +stringdef a1 '{U+FE8D}' // ALEF +stringdef a2 '{U+FE8E}' // ALEF +stringdef b1 '{U+FE8F}' // BEH +stringdef b2 '{U+FE90}' // BEH +stringdef b3 '{U+FE91}' // BEH +stringdef b4 '{U+FE92}' // BEH +stringdef t_1 '{U+FE93}' // TEH_MARBUTA +stringdef t_2 '{U+FE94}' // TEH_MARBUTA +stringdef t1 '{U+FE97}' // TEH +stringdef t2 '{U+FE98}' // TEH +stringdef t3 '{U+FE95}' // TEH +stringdef t4 '{U+FE96}' // TEH +stringdef th1 '{U+FE9B}' // THEH +stringdef th2 '{U+FE9C}' // THEH +stringdef th3 '{U+FE9A}' // THEH +stringdef th4 '{U+FE99}' // THEH +stringdef j1 '{U+FE9F}' // JEEM +stringdef j2 '{U+FEA0}' // JEEM +stringdef j3 '{U+FE9D}' // JEEM +stringdef j4 '{U+FE9E}' // JEEM +stringdef h1 '{U+FEA3}' // HAH +stringdef h2 '{U+FEA4}' // HAH +stringdef h3 '{U+FEA1}' // HAH +stringdef h4 '{U+FEA2}' // HAH +stringdef x1 '{U+FEA7}' // KHAH +stringdef x2 '{U+FEA8}' // KHAH +stringdef x3 '{U+FEA5}' // KHAH +stringdef x4 '{U+FEA6}' // KHAH +stringdef d1 '{U+FEA9}' // DAL +stringdef d2 '{U+FEAA}' // DAL +stringdef dz1 '{U+FEAB}' // THAL +stringdef dz2 '{U+FEAC}' // THAL +stringdef r1 '{U+FEAD}' // REH +stringdef r2 '{U+FEAE}' // REH +stringdef z1 '{U+FEAF}' // 
ZAIN +stringdef z2 '{U+FEB0}' // ZAIN +stringdef s1 '{U+FEB3}' // SEEN +stringdef s2 '{U+FEB4}' // SEEN +stringdef s3 '{U+FEB1}' // SEEN +stringdef s4 '{U+FEB2}' // SEEN +stringdef sh1 '{U+FEB7}' // SHEEN +stringdef sh2 '{U+FEB8}' // SHEEN +stringdef sh3 '{U+FEB5}' // SHEEN +stringdef sh4 '{U+FEB6}' // SHEEN +stringdef c1 '{U+FEBB}' // SAD +stringdef c2 '{U+FEBC}' // SAD +stringdef c3 '{U+FEB9}' // SAD +stringdef c4 '{U+FEBA}' // SAD +stringdef dh1 '{U+FEBF}' // DAD +stringdef dh2 '{U+FEC0}' // DAD +stringdef dh3 '{U+FEBD}' // DAD +stringdef dh4 '{U+FEBE}' // DAD +stringdef tt1 '{U+FEC3}' // TAH +stringdef tt2 '{U+FEC4}' // TAH +stringdef tt3 '{U+FEC1}' // TAH +stringdef tt4 '{U+FEC2}' // TAH +stringdef zh1 '{U+FEC7}' // ZAH +stringdef zh2 '{U+FEC8}' // ZAH +stringdef zh3 '{U+FEC5}' // ZAH +stringdef zh4 '{U+FEC6}' // ZAH +stringdef i1 '{U+FECB}' // AIN +stringdef i2 '{U+FECC}' // AIN +stringdef i3 '{U+FEC9}' // AIN +stringdef i4 '{U+FECA}' // AIN +stringdef gh1 '{U+FECF}' // GHAIN +stringdef gh2 '{U+FED0}' // GHAIN +stringdef gh3 '{U+FECD}' // GHAIN +stringdef gh4 '{U+FECE}' // GHAIN +stringdef f1 '{U+FED3}' // FEH +stringdef f2 '{U+FED4}' // FEH +stringdef f3 '{U+FED1}' // FEH +stringdef f4 '{U+FED2}' // FEH +stringdef q1 '{U+FED7}' // QAF +stringdef q2 '{U+FED8}' // QAF +stringdef q3 '{U+FED5}' // QAF +stringdef q4 '{U+FED6}' // QAF +stringdef k1 '{U+FEDB}' // KAF +stringdef k2 '{U+FEDC}' // KAF +stringdef k3 '{U+FED9}' // KAF +stringdef k4 '{U+FEDA}' // KAF +stringdef l1 '{U+FEDF}' // LAM +stringdef l2 '{U+FEE0}' // LAM +stringdef l3 '{U+FEDD}' // LAM +stringdef l4 '{U+FEDE}' // LAM +stringdef m1 '{U+FEE3}' // MEEM +stringdef m2 '{U+FEE4}' // MEEM +stringdef m3 '{U+FEE1}' // MEEM +stringdef m4 '{U+FEE2}' // MEEM +stringdef n1 '{U+FEE7}' // NOON +stringdef n2 '{U+FEE8}' // NOON +stringdef n3 '{U+FEE5}' // NOON +stringdef n4 '{U+FEE6}' // NOON +stringdef e1 '{U+FEEB}' // HEH +stringdef e2 '{U+FEEC}' // HEH +stringdef e3 '{U+FEE9}' // HEH +stringdef e4 '{U+FEEA}' 
// HEH +stringdef w1 '{U+FEED}' // WAW +stringdef w2 '{U+FEEE}' // WAW +stringdef a_1 '{U+FEEF}' // ALEF_MAKSURA +stringdef a_2 '{U+FEF0}' // ALEF_MAKSURA +stringdef y1 '{U+FEF3}' // YEH +stringdef y2 '{U+FEF4}' // YEH +stringdef y3 '{U+FEF1}' // YEH +stringdef y4 '{U+FEF2}' // YEH + +// Ligatures Lam-Alef +stringdef la '{U+FEFB}' // LAM_ALEF +stringdef la2 '{U+FEFC}' // LAM_ALEF +stringdef lao '{U+FEF7}' // LAM_ALEF_HAMZA_ABOVE +stringdef lao2 '{U+FEF8}' // LAM_ALEF_HAMZA_ABOVE +stringdef lao_ '{U+FEF9}' // LAM_ALEF_HAMZA_BELOW +stringdef lao_2 '{U+FEFA}' // LAM_ALEF_HAMZA_BELOW +stringdef la~ '{U+FEF5}' // LAM_ALEF_MADDA_ABOVE +stringdef la~2 '{U+FEF6}' // LAM_ALEF_MADDA_ABOVE + + +booleans ( + is_noun + is_verb + is_defined + ) + +routines ( + Prefix_Step1 + Prefix_Step2 + Prefix_Step3a_Noun + Prefix_Step3b_Noun + Prefix_Step3_Verb + Prefix_Step4_Verb + + Suffix_All_alef_maqsura + Suffix_Noun_Step1a + Suffix_Noun_Step1b + Suffix_Noun_Step2a + Suffix_Noun_Step2b + Suffix_Noun_Step2c1 + Suffix_Noun_Step2c2 + Suffix_Noun_Step3 + Suffix_Verb_Step1 + Suffix_Verb_Step2a + Suffix_Verb_Step2b + Suffix_Verb_Step2c + + Normalize_post + Normalize_pre + + Checks1 +) + +externals ( stem ) + +groupings ( ) + + +// Normalizations +define Normalize_pre as ( + do repeat ( + ( + [substring] among ( + '{aan}' '{uun}' '{iin}' '{aa}' '{uu}' '{ii}' '{oo}' '{~}'( delete ) // strip vocalization + '{_}' ( delete ) // strip kasheeda + + // Hindu–Arabic numerals + '{0}' ( <- '0') + '{1}' ( <- '1') + '{2}' ( <- '2') + '{3}' ( <- '3') + '{4}' ( <- '4') + '{5}' ( <- '5') + '{6}' ( <- '6') + '{7}' ( <- '7') + '{8}' ( <- '8') + '{9}' ( <- '9') + + // Shaped forms + '{o1}' ( <- '{o}' ) // HAMZA + '{ao1}' '{ao2}' ( <- '{ao}' ) // ALEF_HAMZA_ABOVE + '{ao_1}' '{ao_2}' ( <- '{ao_}' ) // ALEF_HAMZA_BELOW + '{yo1}' '{yo2}' '{yo3}' '{yo4}' ( <- '{yo}' ) // YEH_HAMZA + '{a~1}' '{a~2}'( <- '{a~}' ) // ALEF_MADDA + '{wo1}' '{wo2}'( <- '{wo}' ) // WAW_HAMZA + '{a1}' '{a2}' ( <- '{a}' ) // ALEF + '{b1}' 
'{b2}' '{b3}' '{b4}' ( <- '{b}' ) // BEH + '{t_1}' '{t_2}' ( <- '{t_}' ) // TEH_MARBUTA + '{t1}' '{t2}' '{t3}' '{t4}' ( <- '{t}' ) // TEH + '{th1}' '{th2}' '{th3}' '{th4}' ( <- '{th}' ) // THEH + '{j1}' '{j2}' '{j3}' '{j4}'( <- '{j}' ) // JEEM + '{h1}' '{h2}' '{h3}' '{h4}' ( <- '{h}' ) // HAH + '{x1}' '{x2}' '{x3}' '{x4}'( <- '{x}' ) // KHAH + '{d1}' '{d2}' ( <- '{d}' ) // DAL + '{dz1}''{dz2}' ( <- '{dz}' ) // THAL + '{r1}' '{r2}'( <- '{r}' ) // REH + '{z1}' '{z2}' ( <- '{z}' ) // ZAIN + '{s1}' '{s2}' '{s3}' '{s4}'( <- '{s}' ) // SEEN + '{sh1}' '{sh2}' '{sh3}' '{sh4}' ( <- '{sh}' ) // SHEEN + '{c1}' '{c2}' '{c3}' '{c4}'( <- '{c}' ) // SAD + '{dh1}' '{dh2}' '{dh3}' '{dh4}'( <- '{dh}' ) // DAD + '{tt1}' '{tt2}' '{tt3}' '{tt4}' ( <- '{tt}' ) // TAH + '{zh1}' '{zh2}' '{zh3}' '{zh4}'( <- '{zh}' ) // ZAH + '{i1}' '{i2}' '{i3}' '{i4}'( <- '{i}' ) // AIN + '{gh1}' '{gh2}' '{gh3}' '{gh4}'( <- '{gh}' ) // GHAIN + '{f1}' '{f2}' '{f3}' '{f4}' ( <- '{f}' ) // FEH + '{q1}' '{q2}' '{q3}' '{q4}' ( <- '{q}' ) // QAF + '{k1}' '{k2}' '{k3}' '{k4}'( <- '{k}' ) // KAF + '{l1}' '{l2}' '{l3}' '{l4}'( <- '{l}' ) // LAM + '{m1}' '{m2}' '{m3}' '{m4}' ( <- '{m}' ) // MEEM + '{n1}' '{n2}' '{n3}' '{n4}'( <- '{n}' ) // NOON + '{e1}' '{e2}' '{e3}' '{e4}' ( <- '{e}' ) // HEH + '{w1}' '{w2}' ( <- '{w}' ) // WAW + '{a_1}' '{a_2}' ( <- '{a_}' ) // ALEF_MAKSURA + '{y1}' '{y2}' '{y3}' '{y4}' ( <- '{y}' ) // YEH + + // Ligatures Lam-Alef + '{la}' '{la2}' (<- '{l}{a}') + '{lao}' '{lao2}' (<- '{l}{ao}') + '{lao_}' '{lao_2}' (<- '{l}{ao_}') + '{la~}' '{la~2}' (<- '{l}{a~}') + + ) + ) + or + next + ) +) + +define Normalize_post as ( + + do ( + // normalize last hamza + backwards ( + [substring] among ( + '{ao}''{ao_}' '{a~}' ( <- '{o}') + '{wo}' ( <- '{o}') + '{yo}' ( <- '{o}') + ) + ) + ) + + do repeat ( + ( + // normalize other hamza's + [substring] among ( + '{ao}''{ao_}' '{a~}' ( <- '{a}') + '{wo}' ( <- '{w}') + '{yo}' ( <- '{y}') + ) + ) + or + next + ) +) + +// Checks +define Checks1 as ( + 
[substring] among ( + '{b}{a}{l}' '{k}{a}{l}' ($(len > 4) set is_noun unset is_verb set is_defined) + '{l}{l}' '{a}{l}' ($(len > 3) set is_noun unset is_verb set is_defined) + ) +) + + +//prefixes +define Prefix_Step1 as ( + [substring] among ( + '{ao}{ao}' ($(len > 3) <- '{ao}' ) + '{ao}{a~}' ($(len > 3) <- '{a~}' ) + '{ao}{wo}' ($(len > 3) <- '{ao}' ) + '{ao}{a}' ($(len > 3) <- '{a}' ) + '{ao}{ao_}' ($(len > 3) <- '{ao_}' ) + // '{ao}' ($(len > 3) delete) //rare case + ) +) + +define Prefix_Step2 as ( + not '{f}{a}' + not '{w}{a}' + [substring] among ( + '{f}' ($(len > 3) delete) + '{w}' ($(len > 3) delete) + ) +) + +define Prefix_Step3a_Noun as ( // it is noun and defined + [substring] among ( + '{b}{a}{l}' '{k}{a}{l}' ($(len > 5) delete) + '{l}{l}' '{a}{l}' ($(len > 4) delete) + ) +) + +define Prefix_Step3b_Noun as ( // probably noun and defined + not '{b}{a}' // exception + [substring] among ( + '{b}' ($(len > 3) delete) + // '{k}' '{l}' ($(len > 3) delete) // BUG: cause confusion + '{b}{b}' ($(len > 3) <- '{b}' ) + '{k}{k}' ($(len > 3) <- '{k}' ) + ) + +) + +define Prefix_Step3_Verb as ( + [substring] among ( + //'{s}' ($(len > 4) delete)// BUG: cause confusion + '{s}{y}' ($(len > 4) <- '{y}' ) + '{s}{t}' ($(len > 4) <- '{t}') + '{s}{n}' ($(len > 4) <- '{n}') + '{s}{ao}' ($(len > 4) <- '{ao}') + ) +) + +define Prefix_Step4_Verb as ( + [substring] among ( + '{y}{s}{t}' '{n}{s}{t}' '{t}{s}{t}' ($(len > 4) set is_verb unset is_noun <- '{a}{s}{t}' ) + ) +) + +// suffixes +backwardmode ( + + define Suffix_Noun_Step1a as ( + [substring] among ( + '{y}' '{k}' '{e}' ($(len >= 4) delete) + '{n}{a}' '{k}{m}' '{e}{a}' '{e}{n}' '{e}{m}' ($(len >= 5) delete) + '{k}{m}{a}' '{e}{m}{a}' ($(len >= 6) delete) + ) + ) + define Suffix_Noun_Step1b as ( + [substring] among ( + '{n}' ($(len > 5) delete) + ) + ) + + define Suffix_Noun_Step2a as ( + [substring] among ( + '{a}' '{y}' '{w}' ($(len > 4) delete) + ) + ) + + define Suffix_Noun_Step2b as ( + [substring] among ( + '{a}{t}' 
($(len >= 5) delete) + ) + ) + + define Suffix_Noun_Step2c1 as ( + [substring] among ( + '{t}' ($(len >= 4) delete) + ) + ) + define Suffix_Noun_Step2c2 as ( // feminine t_ + [substring] among ( + '{t_}' ($(len >= 4) delete) + ) + ) + define Suffix_Noun_Step3 as ( // ya' nisbiya + [substring] among ( + '{y}' ($(len >= 3) delete) + ) + ) + + define Suffix_Verb_Step1 as ( + [substring] among ( + '{e}' '{k}' ($(len >= 4) delete) + '{n}{y}' '{n}{a}' '{e}{a}' '{e}{m}' '{e}{n}' '{k}{m}' '{k}{n}' ($(len >= 5) delete) + '{e}{m}{a}' '{k}{m}{a}' '{k}{m}{w}'($(len >= 6) delete) + ) + ) + define Suffix_Verb_Step2a as ( + [substring] among ( + '{t}' ($(len >= 4) delete) + '{a}' '{n}' '{y}' ($(len >= 4) delete) + '{n}{a}' '{t}{a}' '{t}{n}' ($(len >= 5) delete)// past + '{a}{n}' '{w}{n}' '{y}{n}' ($(len > 5) delete) // present + '{t}{m}{a}' ($(len >= 6) delete) + ) + ) + + define Suffix_Verb_Step2b as ( + [substring] among ( + '{w}{a}' '{t}{m}' ($(len >= 5) delete) + ) + ) + + + define Suffix_Verb_Step2c as ( + [substring] among ( + '{w}' ($(len >= 4) delete) + '{t}{m}{w}' ($(len >= 6) delete) + ) + ) + + define Suffix_All_alef_maqsura as ( + [substring] among ( + '{a_}' ( <- '{y}' ) // spell error + // '{a_}' ( delete ) // if noun > 3 + // '{a_}' ( <- '{a}') // if verb + ) + ) +) + +define stem as ( + // set initial values + set is_noun + set is_verb + unset is_defined + + // guess type and properties + do Checks1 + + // normalization pre-stemming + do Normalize_pre + + + backwards ( + + do ( + //Suffixes for verbs + ( + is_verb + ( + ( + (atleast 1 Suffix_Verb_Step1) + ( Suffix_Verb_Step2a or Suffix_Verb_Step2c or next) + ) + or Suffix_Verb_Step2b + or Suffix_Verb_Step2a + ) + ) + //Suffixes for nouns + or ( + is_noun + ( + + try ( + Suffix_Noun_Step2c2 + or (not is_defined Suffix_Noun_Step1a ( + Suffix_Noun_Step2a + or Suffix_Noun_Step2b + or Suffix_Noun_Step2c1 + or next)) + or (Suffix_Noun_Step1b ( + Suffix_Noun_Step2a + or Suffix_Noun_Step2b + or Suffix_Noun_Step2c1)) + or 
(not is_defined Suffix_Noun_Step2a) + or (Suffix_Noun_Step2b) + ) + Suffix_Noun_Step3 + ) + + ) + + // Suffixes for alef maqsura + or Suffix_All_alef_maqsura + ) + ) + + //Prefixes + do ( + try Prefix_Step1 + try Prefix_Step2 + ( Prefix_Step3a_Noun + or (is_noun Prefix_Step3b_Noun) + or (is_verb try Prefix_Step3_Verb Prefix_Step4_Verb) + ) + ) + + // normalization post-stemming + do Normalize_post + +) diff --git a/contrib/snowball/algorithms/basque.sbl b/contrib/snowball/algorithms/basque.sbl new file mode 100644 index 0000000..267abc7 --- /dev/null +++ b/contrib/snowball/algorithms/basque.sbl @@ -0,0 +1,149 @@ +routines ( + aditzak + izenak + adjetiboak + mark_regions + RV R2 R1 +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v ) + +stringescapes {} + +/* special characters */ + +stringdef n~ '{U+00F1}' + +define v 'aeiou' + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +backwardmode ( + + define RV as $pV <= cursor + define R2 as $p2 <= cursor + define R1 as $p1 <= cursor + + define aditzak as ( + [substring] among( + 'le' 'la' 'tzaile' 'aldatu' 'atu' 'tzailea' 'taile' 'tailea' 'pera' 'gale' 'galea' + 'gura' 'kura' 'kor' 'korra' 'or' 'orra' 'tun' 'tuna' 'gaitz' 'gaitza' + 'kaitz' 'kaitza' 'ezin' 'ezina' 'tezin' 'tezina' 'errez' 'erreza' + 'karri' 'karria' 'tzaga' 'tzaka' 'tzake' 'tzeke' 'ez' 'eza' 'tzez' + 'keta' 'eta' 'etan' 'pen' 'pena' 'tze' 'atze' 'kuntza' 'kunde' 'kundea' + 'kune' 'kunea' 'kuna' 'kera' 'era' 'kizun' 'kizuna' 'dura' 'tura' 'men' 'mena' + 'go' 'ago' 'tio' 'taldi' 'taldia' 'aldi' 'aldia' 'gune' 'gunea' 'bide' 'bidea' + 'pide' 'pidea' 'gai' 'gaia' 'ki' 'kin' 'rekin' 'kina' 'kari' 'karia' 'ari' 'tari' 'etari' + 'gailu' 'gailua' 'kide' 'kidea' 'ide' 'idea' 'du' 'ka' 'kan' 'an' 'ean' 'tu' 'lari' 
'tatu' + 'rean' 'tarazi' 'arazi' 'tzat' 'bera' 'dako' + ( RV delete ) + 'garri' 'garria' 'tza' + (R2 delete) + 'atseden' + (<- 'atseden') + 'arabera' + (<- 'arabera') + 'baditu' + (<- 'baditu') + + ) + ) + + define izenak as ( + [substring] among( + 'ari' 'aria' 'bizia' 'kari' 'karia' 'lari' 'laria' 'tari' 'taria' 'zain' 'zaina' + 'tzain' 'tzaina' 'zale' 'zalea' 'tzale' 'tzalea' 'aizun' 'orde' 'ordea' + 'burua' 'ohi' 'ohia' 'kintza' 'gintzo' 'gintzu' 'tzu' 'tzua' + 'tzo' 'tzoa' 'kuntza' 'talde' 'taldea' 'eria' 'keria' 'teria' 'di' + 'za' 'ada' 'tara' 'etara' 'tra' 'ta' 'tegi' 'tegia' 'keta' 'z' 'zko' 'zkoa' + 'ti' 'tia' 'tsu' 'tsua' 'zu' 'zua' 'bera' 'pera' 'zto' 'ztoa' 'asi' 'asia' + 'gile' 'gilea' 'estu' 'estua' 'larri' 'larria' 'nahi' 'nahia' + 'koi' 'koia' 'oi' 'oia' 'goi' 'min' 'mina' 'dun' 'duna' 'duru' 'durua' + 'duri' 'duria' 'os' 'osa' 'oso' 'osoa' 'ar' 'ara' 'tar' 'dar' 'dara' + 'tiar' 'tiara' 'liar' 'liara' 'gabe' 'gabea' 'kabe' 'kabea' 'ga' 'ge' + 'kada' 'tasun' 'tasuna' 'asun' 'asuna' 'go' 'mendu' 'mendua' 'mentu' 'mentua' + 'mendi' 'mendia' 'zio' 'zioa' 'zino' 'zinoa' 'zione' 'zionea' 'ezia' + 'degi' 'degia' 'egi' 'egia' 'toki' 'tokia' 'leku' 'lekua' 'gintza' 'alde' + 'aldea' 'kalde' 'kaldea' 'gune' 'gunea' 'une' 'unea' 'una' 'pe' 'pea' + 'gibel' 'gibela' 'ondo' 'ondoa' 'arte' 'artea' 'aurre' 'aurrea' + 'etxe' 'etxea' 'ola' 'ontzi' 'ontzia' 'gela' 'denda' 'taldi' 'taldia' + 'aldi' 'aldia' 'te' 'tea' 'zaro' 'zaroa' 'taro' 'taroa' 'oro' 'oroa' + 'aro' 'aroa' 'ero' 'eroa' 'eroz' 'eroza' 'ka' 'kan' 'kana' 'tako' 'etako' 'takoa' + 'kote' 'kotea' 'tzar' 'tzarra' 'handi' 'handia' 'kondo' 'kondoa' 'skila' + 'no' 'noa' '{n~}o' '{n~}oa' 'ska' 'xka' 'zka' 'tila' 'to' 'toa' 'tto' 'ttoa' + 'txo' 'txoa' 'txu' 'txua' 'anda' 'anga' 'urren' 'urrena' 'gai' 'gaia' + 'gei' 'geia' 'eme' 'emea' 'kume' 'kumea' 'sa' 'ko' 'eko' 'koa' 'ena' + 'enea' 'ne' 'nea' 'kor' 'korra' 'ez' 'eza' 'eta' 'etan' + 'ki' 'kia' 'kin' 'kina' 'tu' 'tua' 'du' 'dua' 'ek' + 'tarik' 'tariko' 'tan' 
'ordu' 'ordua' 'oste' 'ostea' 'tzara' + 'ra' 'antza' 'behar' 'ro' 'giro' 'ak' 'zp' 'ket' + 'kail' 'kaila' 'ail' 'kirri' 'kirria' 'ngo' 'ngoa' '{n~}i' 'sko' + 'sta' 'koitz' 'koitza' 'na' 'garren' 'garrena' 'kera' + 'gerren' 'gerrena' 'garna' 'kide' 'tz' 'tuko' + ( RV delete ) + 'ora' 'garri' 'garria' 'or' 'buru' 'ren' 'tza' + ( R2 delete ) + 'joka' + (<- 'jok') + 'tzen' 'ten' 'en' 'tatu' + (R1 delete) + 'trako' + (<- 'tra') + 'minutuko' + (<- 'minutu') + 'zehar' + (<- 'zehar') + 'geldi' + (<- 'geldi') + 'igaro' + (<- 'igaro') + 'aurka' + (<- 'aurka') + ) + ) + + define adjetiboak as ( + [substring] among( + 'era' 'ero' 'go' 'tate' 'tade' 'date' 'dade' 'keria' + 'ki' 'to' 'ro' 'la' 'gi' 'larik' 'lanik' 'ik' 'ztik' 'rik' + ( RV delete ) + 'zlea' + (<- 'z') + ) + ) + +) + +define stem as ( + do mark_regions + backwards ( + repeat aditzak + repeat izenak + do adjetiboak + ) + +) + +/* + Note 1: additions of 21 Jul 2010 +*/ diff --git a/contrib/snowball/algorithms/catalan.sbl b/contrib/snowball/algorithms/catalan.sbl new file mode 100644 index 0000000..0a1e3a5 --- /dev/null +++ b/contrib/snowball/algorithms/catalan.sbl @@ -0,0 +1,202 @@ +routines ( + cleaning mark_regions + R1 R2 + attached_pronoun + standard_suffix + verb_suffix + residual_suffix +) + +externals ( stem ) + +integers ( p1 p2 ) + +groupings ( v ) + +stringescapes {} + +/* special characters */ + +stringdef a' '{U+00E1}' // a-acute +stringdef a` '{U+00E0}' // a-grave +stringdef c, '{U+00E7}' // c-cedilla +stringdef e' '{U+00E9}' // e-acute +stringdef e` '{U+00E8}' // e-grave +stringdef i' '{U+00ED}' // i-acute +stringdef i` '{U+00EC}' // i-grave +stringdef i" '{U+00EF}' // i-diaeresis +stringdef o' '{U+00F3}' // o-acute +stringdef o` '{U+00F2}' // o-grave +stringdef u' '{U+00FA}' // u-acute +stringdef u" '{U+00FC}' // u-diaeresis +stringdef . 
'{U+00B7}' // - per l aggeminades + +define v 'aeiou{a'}{a`}{e'}{e`}{i'}{i"}{o'}{o`}{u'}{u"}' + +define mark_regions as ( + + $p1 = limit + $p2 = limit // defaults + + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define cleaning as repeat ( + [substring] among( + '{a'}' (<- 'a') + '{a`}' (<- 'a') + '{e'}' (<- 'e') + '{e`}' (<- 'e') + '{i'}' (<- 'i') + '{i`}' (<- 'i') + '{o'}' (<- 'o') + '{o`}' (<- 'o') + '{u'}' (<- 'u') + '{u"}' (<- 'u') + '{i"}' (<- 'i') + '{.}' (<- '.') + '' (next) + ) +) + +backwardmode ( + + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define attached_pronoun as ( + [substring] among ( + '{'}s' '{'}hi' '{'}ho' '{'}l' '{'}ls' + '-ls' '-la' '-les' '-li' + 'vos' 'se' 'nos' '-nos' '-us' 'us' + '{'}n' '{'}ns' '-n' '-ns' + '{'}m' '-me' '-m' + '-te' '{'}t' + 'li' 'lo' 'los' + 'me' 'sela' 'selo' 'selas' 'selos' 'le' + 'la' 'las' 'les' 'ens' 'ho' 'hi' + (R1 delete) + ) + ) + + define standard_suffix as ( + [substring] among( + 'ar' 'atge' 'formes' 'icte' 'ictes' + 'ell' 'ells' 'ella' '{e'}s' '{e`}s' 'esc' 'essa' 'et' 'ets' 'eta' + 'eres' 'eries' 'ers' 'ina' 'ines' 'able' 'ls' + 'i{o'}' 'itat' 'itats' 'itzar' 'iva' 'ives' 'ivisme' 'ius' + 'fer' 'ment' 'amen' 'ament' 'aments' 'ments' 'ot' 'sfera' 'al' 'als' 'era' 'ana' 'iste' + 'aire' 'eria' 'esa' 'eses' 'esos' 'or' '{i'}cia' '{i'}cies' 'icis' 'ici' '{i'}ci' '{i'}cis' + '{a`}ria' '{a`}ries' 'alla' 'ci{o'}' 'cions' 'n{c,}a' 'nces' '{o'}' 'dor' 'all' + 'il' '{i'}stic' 'enc' 'enca' '{i'}s' 'issa' 'issos' '{i'}ssem' '{i'}ssiu' 'issem' 'isseu' '{i'}sseu' + '{o'}s' 'osa' 'dora' 'dores' 'dors' 'adura' 'ble' 'bles' '{i'}vol' '{i'}vola' 'd{i'}s' 'egar' 'ejar' 'ificar' + 'itar' 'ables' 'adors' 'idores' 'idors' + 'adora' 'aci{o'}' 'doras' 'dur' 'dures' 'alleng{u"}es' + 'ant' 'ants' 'ancia' 'ancies' 'at{o`}ria' 'at{o`}ries' 'tori' 'toris' + 'ats' 'ions' 'ota' 'isam' 'ors' 'ora' 'ores' 'isament' + 'bilitat' 'bilitats' 'ivitat' 'ivitats' 'ari' 'aris' 'ionisme' 
'ionista' 'ionistes' + 'ialista' 'ialistes' 'ialisme' 'ialismes' 'ud' 'uts' 'uds' 'encia' 'encies' '{e`}ncia' '{e`}ncies' + '{i"}tat' '{i"}tats' 'atiu' 'atius' 'atives' 'ativa' 'ativitat' 'ativitats' 'ible' 'ibles' + 'assa' 'asses' 'assos' + 'ent' 'ents' + '{i'}ssim' '{i'}ssima' '{i'}ssims' '{i'}ssimes' '{i`}ssem' '{i`}sseu' '{i`}ssin' + 'ims' 'ima' 'imes' + 'isme' 'ista' 'ismes' 'istes' + 'inia' 'inies' '{i'}inia' '{i'}nies' 'ita' 'ites' 'triu' 'trius' + 'oses' 'osos' 'ient' 'otes' 'ots' + (R1 delete) + 'acions' 'ada' 'ades' + (R2 delete) + 'log{i'}a' 'log{i'}es''logia' 'logies' 'logi' 'logis' 'l{o'}gica' 'l{o'}gics' 'l{o'}giques' + (R2 <- 'log') + 'ic' 'ica' 'ics' 'iques' + (R2 <- 'ic') + 'qu{i'}ssim' 'qu{i'}ssims' 'qu{i'}ssimes' 'qu{i'}ssima' + (R1 <- 'c') + ) + ) + + define verb_suffix as ( + [substring] among( + 'ador' 'adora' 'adors' 'adores' 're' 'ie' + 'ent' 'ents' 'udes' 'ar{a`}' 'eren' + 'ar{a'}' 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais' + 'aria' 'arian' 'arien' 'aries' 'ar{a`}s' + 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ara' + 'ar{e'}' 'ar{e'}s' + 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais' + 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}' + 'er{e'}' 'er' 'erau' 'erass' + 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais' + 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}' + 'ir{e'}' '{i'}rem' '{i'}reu' '{i'}eu' + 'ia' 'ies' '{i'}em' '{i`}eu' 'ien' + 'at' 'ut' 'uda' 'ava' 'aves' 'avem' '{a'}vem' '{a`}vem' '{a`}veu' '{a'}veu' 'aven' 'au' 'ats' + 'asseu' 'esseu' 'eresseu' '{a`}sseu' '{a`}ssem' '{a`}ssim' '{a`}ssiu' + 'essen' 'esses' 'assen' 'asses' 'assim' 'assiu' + '{e'}ssen' '{e'}sseu' '{e'}ssim' '{e'}ssiu' '{e'}ssem' + '{i'}' 'ares' '{a`}rem' '{a`}reu' '{a`}ren' + 'ar{i'}em' 'ar{i'}eu' + 'areu' 'aren' 'ant' '{i"}m' '{i"}u' + '{e'}s' '{i"}en' 'en' 'es' 'em' 'am' 'ams' '{i"}a' '{i"}es' + 'dre' 'eix' 'eixer' 'tzar' 'eixes' 'ides' '{i"}des' 'it' '{i"}t' '{i"}da' + 'aba' 'ada' 'ades' 'ida' '{i'}a' 'iera' 'ad' 'ed' 'its' + 
'id' 'ids' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an' + 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado' + 'ido' 'iendo' 'i{o'}' 'ar' 'ir' 'as' + 'ieu' 'ii' 'io' 'i{a`}' + 'ess' 'essin' 'essis' 'ass' 'assin' 'assis' 'essim' '{e`}ssim' '{e`}ssiu' + 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases' + 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais' + 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados' + 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' 'ques' + '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos' + 'ira' 'iran' 'irem' 'iren' 'ires' 'ireu' 'iria' 'irien' + 'iries' 'ir{a`}' 'ir{a`}s' 'ir{e`}' 'ir{i`}em' 'ir{i`}eu' + 'isquen' 'iguem' 'igueu' 'esqui' 'esquin' 'esquis' 'eixi' 'eixin' 'eixis' + 'eixen' 'eixo' 'isin' 'isis' 'esques' 'sis' 'sin' + 'int' 'ir{i'}em' 'ir{i'}eu' 'isc' 'atges' 'esca' 'esquen' + 'issen' 'isses' 'issin' 'issis' 'isca' 'issiu' 'issim' + '{i"}sc' '{i"}sca' '{i"}ssin' '{i'}ssiu' '{i'}ssim' '{i"}ssis' '{i"}guem' '{i"}gueu' + '{i"}ra' '{i"}ren' '{i"}res' + '{i"}squen' '{i"}sques' '{i"}ssen' '{i"}sses' '{i"}xo' '{i"}xen' '{i"}xes' '{i"}x' + 'ixo' 'ixen' 'ixes' 'ix' 'ixa' 'inin' 'inis' 'ini' 'ineu' 'itza' 'itzi' 'itzeu' 'itzis' + 'itzo' 'itz' 'itz{a`}' 'arem' 'in' '{a`}s' 'i{i"}' 'i{i"}n' 'i{i"}s' + (R1 delete) + 'ando' + (R2 delete) + ) + ) + + define residual_suffix as ( + [substring] among( + 'os' 'a' 'o' '{a'}' '{a`}' '{i'}' '{o'}' 'e' '{e'}' 'eu' 'iu' + 'is' 'i' 'ir' 's' '{i`}' 'itz' '{i"}' '{i"}n' '{i"}s' 'it' + (R1 delete) + 'iqu' + (R1 <- 'ic') + ) + ) +) + +define stem as ( + do mark_regions + backwards ( + do attached_pronoun + do ( standard_suffix or + verb_suffix + ) + do residual_suffix + ) + do cleaning +) + +/* + First works 2010/07/19 + First Grammatical Reviews: https://ca.wikipedia.org/wiki/Gram%C3%A0tica_del_catal%C3%A0 + Suffix list: https://ca.wikipedia.org/wiki/Llista_de_sufixos + Irregular Verbs: https://ca.wikipedia.org/wiki/Flexi%C3%B3_verbal_del_catal%C3%A0 +*/ diff --git a/contrib/snowball/algorithms/danish.sbl 
b/contrib/snowball/algorithms/danish.sbl new file mode 100644 index 0000000..761270f --- /dev/null +++ b/contrib/snowball/algorithms/danish.sbl @@ -0,0 +1,93 @@ +routines ( + mark_regions + main_suffix + consonant_pair + other_suffix + undouble +) + +externals ( stem ) + +strings ( ch ) + +integers ( p1 x ) + +groupings ( c v s_ending ) + +stringescapes {} + +/* special characters */ + +stringdef ae '{U+00E6}' +stringdef ao '{U+00E5}' +stringdef o/ '{U+00F8}' + +define c 'bcdfghjklmnpqrstvwxz' + +define v 'aeiouy{ae}{ao}{o/}' + +define s_ending 'abcdfghjklmnoprtvyz{ao}' + +define mark_regions as ( + + $p1 = limit + + test ( hop 3 setmark x ) + goto v gopast non-v setmark p1 + try ( $p1 < x $p1 = x ) +) + +backwardmode ( + + define main_suffix as ( + setlimit tomark p1 for ([substring]) + among( + + 'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere' + 'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes' + 'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets' + 'erets' 'et' 'eret' + (delete) + 's' + (s_ending delete) + ) + ) + + define consonant_pair as ( + test ( + setlimit tomark p1 for ([substring]) + among( + 'gd' // significant in the call from other_suffix + 'dt' 'gt' 'kt' + ) + ) + next] delete + ) + + define other_suffix as ( + do ( ['st'] 'ig' delete ) + setlimit tomark p1 for ([substring]) + among( + 'ig' 'lig' 'elig' 'els' + (delete do consonant_pair) + 'l{o/}st' + (<-'l{o/}s') + ) + ) + define undouble as ( + setlimit tomark p1 for ([c] ->ch) + ch + delete + ) +) + +define stem as ( + + do mark_regions + backwards ( + do main_suffix + do consonant_pair + do other_suffix + do undouble + ) +) diff --git a/contrib/snowball/algorithms/dutch.sbl b/contrib/snowball/algorithms/dutch.sbl new file mode 100644 index 0000000..f24c82d --- /dev/null +++ b/contrib/snowball/algorithms/dutch.sbl @@ -0,0 +1,164 @@ +routines ( + prelude postlude + e_ending + en_ending + mark_regions + R1 R2 + undouble + standard_suffix +) + +externals ( 
stem ) + +booleans ( e_found ) + +integers ( p1 p2 ) + +groupings ( v v_I v_j ) + +stringescapes {} + +/* special characters */ + +stringdef a" '{U+00E4}' +stringdef e" '{U+00EB}' +stringdef i" '{U+00EF}' +stringdef o" '{U+00F6}' +stringdef u" '{U+00FC}' + +stringdef a' '{U+00E1}' +stringdef e' '{U+00E9}' +stringdef i' '{U+00ED}' +stringdef o' '{U+00F3}' +stringdef u' '{U+00FA}' + +stringdef e` '{U+00E8}' + +define v 'aeiouy{e`}' +define v_I v + 'I' +define v_j v + 'j' + +define prelude as ( + test repeat ( + [substring] among( + '{a"}' '{a'}' + (<- 'a') + '{e"}' '{e'}' + (<- 'e') + '{i"}' '{i'}' + (<- 'i') + '{o"}' '{o'}' + (<- 'o') + '{u"}' '{u'}' + (<- 'u') + '' (next) + ) //or next + ) + try(['y'] <- 'Y') + repeat goto ( + v [('i'] v <- 'I') or + ('y'] <- 'Y') + ) +) + +define mark_regions as ( + + $p1 = limit + $p2 = limit + + gopast v gopast non-v setmark p1 + try($p1 < 3 $p1 = 3) // at least 3 + gopast v gopast non-v setmark p2 + +) + +define postlude as repeat ( + + [substring] among( + 'Y' (<- 'y') + 'I' (<- 'i') + '' (next) + ) //or next + +) + +backwardmode ( + + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define undouble as ( + test among('kk' 'dd' 'tt') [next] delete + ) + + define e_ending as ( + unset e_found + ['e'] R1 test non-v delete + set e_found + undouble + ) + + define en_ending as ( + R1 non-v and not 'gem' delete + undouble + ) + + define standard_suffix as ( + do ( + [substring] among( + 'heden' + ( R1 <- 'heid' + ) + 'en' 'ene' + ( en_ending + ) + 's' 'se' + ( R1 non-v_j delete + ) + ) + ) + do e_ending + + do ( ['heid'] R2 not 'c' delete + ['en'] en_ending + ) + + do ( + [substring] among( + 'end' 'ing' + ( R2 delete + (['ig'] R2 not 'e' delete) or undouble + ) + 'ig' + ( R2 not 'e' delete + ) + 'lijk' + ( R2 delete e_ending + ) + 'baar' + ( R2 delete + ) + 'bar' + ( R2 e_found delete + ) + ) + ) + do ( + non-v_I + test ( + among ('aa' 'ee' 'oo' 'uu') + non-v + ) + [next] delete + ) + ) +) + +define stem as ( + + do 
prelude + do mark_regions + backwards + do standard_suffix + do postlude +) diff --git a/contrib/snowball/algorithms/english.sbl b/contrib/snowball/algorithms/english.sbl new file mode 100644 index 0000000..fe18d7a --- /dev/null +++ b/contrib/snowball/algorithms/english.sbl @@ -0,0 +1,229 @@ +integers ( p1 p2 ) +booleans ( Y_found ) + +routines ( + prelude postlude + mark_regions + shortv + R1 R2 + Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5 + exception1 + exception2 +) + +externals ( stem ) + +groupings ( v v_WXY valid_LI ) + +stringescapes {} + +define v 'aeiouy' +define v_WXY v + 'wxY' + +define valid_LI 'cdeghkmnrt' + +define prelude as ( + unset Y_found + do ( ['{'}'] delete) + do ( ['y'] <-'Y' set Y_found) + do repeat(goto (v ['y']) <-'Y' set Y_found) +) + +define mark_regions as ( + $p1 = limit + $p2 = limit + do( + among ( + 'gener' + 'commun' // added May 2005 + 'arsen' // added Nov 2006 (arsenic/arsenal) + // ... extensions possible here ... + ) or (gopast v gopast non-v) + setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +backwardmode ( + + define shortv as ( + ( non-v_WXY v non-v ) + or + ( non-v v atlimit ) + ) + + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define Step_1a as ( + try ( + [substring] among ( + '{'}' '{'}s' '{'}s{'}' + (delete) + ) + ) + [substring] among ( + 'sses' (<-'ss') + 'ied' 'ies' + ((hop 2 <-'i') or <-'ie') + 's' (next gopast v delete) + 'us' 'ss' + ) + ) + + define Step_1b as ( + [substring] among ( + 'eed' 'eedly' + (R1 <-'ee') + 'ed' 'edly' 'ing' 'ingly' + ( + test gopast v delete + test substring among( + 'at' 'bl' 'iz' + (<+ 'e') + 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt' + // ignoring double c, h, j, k, q, v, w, and x + ([next] delete) + '' (atmark p1 test shortv <+ 'e') + ) + ) + ) + ) + + define Step_1c as ( + ['y' or 'Y'] + non-v not atlimit + <-'i' + ) + + define Step_2 as ( + [substring] R1 among ( + 'tional' (<-'tion') + 'enci' (<-'ence') + 'anci' (<-'ance') + 'abli' (<-'able') + 
'entli' (<-'ent') + 'izer' 'ization' + (<-'ize') + 'ational' 'ation' 'ator' + (<-'ate') + 'alism' 'aliti' 'alli' + (<-'al') + 'fulness' (<-'ful') + 'ousli' 'ousness' + (<-'ous') + 'iveness' 'iviti' + (<-'ive') + 'biliti' 'bli' + (<-'ble') + 'ogi' ('l' <-'og') + 'fulli' (<-'ful') + 'lessli' (<-'less') + 'li' (valid_LI delete) + ) + ) + + define Step_3 as ( + [substring] R1 among ( + 'tional' (<- 'tion') + 'ational' (<- 'ate') + 'alize' (<-'al') + 'icate' 'iciti' 'ical' + (<-'ic') + 'ful' 'ness' + (delete) + 'ative' + (R2 delete) // 'R2' added Dec 2001 + ) + ) + + define Step_4 as ( + [substring] R2 among ( + 'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement' + 'ment' 'ent' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize' + (delete) + 'ion' ('s' or 't' delete) + ) + ) + + define Step_5 as ( + [substring] among ( + 'e' (R2 or (R1 not shortv) delete) + 'l' (R2 'l' delete) + ) + ) + + define exception2 as ( + + [substring] atlimit among( + 'inning' 'outing' 'canning' 'herring' 'earring' + 'proceed' 'exceed' 'succeed' + + // ... extensions possible here ... + + ) + ) +) + +define exception1 as ( + + [substring] atlimit among( + + /* special changes: */ + + 'skis' (<-'ski') + 'skies' (<-'sky') + 'dying' (<-'die') + 'lying' (<-'lie') + 'tying' (<-'tie') + + /* special -LY cases */ + + 'idly' (<-'idl') + 'gently' (<-'gentl') + 'ugly' (<-'ugli') + 'early' (<-'earli') + 'only' (<-'onli') + 'singly' (<-'singl') + + // ... extensions possible here ... + + /* invariant forms: */ + + 'sky' + 'news' + 'howe' + + 'atlas' 'cosmos' 'bias' 'andes' // not plural forms + + // ... extensions possible here ... 
+ ) +) + +define postlude as (Y_found repeat(goto (['Y']) <-'y')) + +define stem as ( + + exception1 or + not hop 3 or ( + do prelude + do mark_regions + backwards ( + + do Step_1a + + exception2 or ( + + do Step_1b + do Step_1c + + do Step_2 + do Step_3 + do Step_4 + + do Step_5 + ) + ) + do postlude + ) +) diff --git a/contrib/snowball/algorithms/finnish.sbl b/contrib/snowball/algorithms/finnish.sbl new file mode 100644 index 0000000..3891d22 --- /dev/null +++ b/contrib/snowball/algorithms/finnish.sbl @@ -0,0 +1,197 @@ + +/* Finnish stemmer. + + Numbers in square brackets refer to the sections in + Fred Karlsson, Finnish: An Essential Grammar. Routledge, 1999 + ISBN 0-415-20705-3 + +*/ + +routines ( + mark_regions + R2 + particle_etc possessive + LONG VI + case_ending + i_plural + t_plural + other_endings + tidy +) + +externals ( stem ) + +integers ( p1 p2 ) +strings ( x ) +booleans ( ending_removed ) +groupings ( AEI C V1 V2 particle_end ) + +stringescapes {} + +/* special characters */ + +stringdef a" '{U+00E4}' +stringdef o" '{U+00F6}' + +define AEI 'a{a"}ei' +define C 'bcdfghjklmnpqrstvwxz' +define V1 'aeiouy{a"}{o"}' +define V2 'aeiou{a"}{o"}' +define particle_end V1 + 'nt' + +define mark_regions as ( + + $p1 = limit + $p2 = limit + + goto V1 gopast non-V1 setmark p1 + goto V1 gopast non-V1 setmark p2 +) + +backwardmode ( + + define R2 as $p2 <= cursor + + define particle_etc as ( + setlimit tomark p1 for ([substring]) + among( + 'kin' + 'kaan' 'k{a"}{a"}n' + 'ko' 'k{o"}' + 'han' 'h{a"}n' + 'pa' 'p{a"}' // Particles [91] + (particle_end) + 'sti' // Adverb [87] + (R2) + ) + delete + ) + define possessive as ( // [36] + setlimit tomark p1 for ([substring]) + among( + 'si' + (not 'k' delete) // take 'ksi' as the Comitative case + 'ni' + (delete ['kse'] <- 'ksi') // kseni = ksi + ni + 'nsa' 'ns{a"}' + 'mme' + 'nne' + (delete) + /* Now for Vn possessives after case endings: [36] */ + 'an' + (among('ta' 'ssa' 'sta' 'lla' 'lta' 'na') delete) + '{a"}n' + 
(among('t{a"}' 'ss{a"}' 'st{a"}' + 'll{a"}' 'lt{a"}' 'n{a"}') delete) + 'en' + (among('lle' 'ine') delete) + ) + ) + + define LONG as + among('aa' 'ee' 'ii' 'oo' 'uu' '{a"}{a"}' '{o"}{o"}') + + define VI as ('i' V2) + + define case_ending as ( + setlimit tomark p1 for ([substring]) + among( + 'han' ('a') //-. + 'hen' ('e') // | + 'hin' ('i') // | + 'hon' ('o') // | + 'h{a"}n' ('{a"}') // Illative [43] + 'h{o"}n' ('{o"}') // | + 'siin' VI // | + 'seen' LONG //-' + + 'den' VI + 'tten' VI // Genitive plurals [34] + () + 'n' // Genitive or Illative + ( try ( LONG // Illative + or 'ie' // Genitive + and next ] + ) + /* otherwise Genitive */ + ) + + 'a' '{a"}' //-. + (V1 C) // | + 'tta' 'tt{a"}' // Partitive [32] + ('e') // | + 'ta' 't{a"}' //-' + + 'ssa' 'ss{a"}' // Inessive [41] + 'sta' 'st{a"}' // Elative [42] + + 'lla' 'll{a"}' // Adessive [44] + 'lta' 'lt{a"}' // Ablative [51] + 'lle' // Allative [46] + 'na' 'n{a"}' // Essive [49] + 'ksi' // Translative[50] + 'ine' // Comitative [51] + + /* Abessive and Instructive are too rare for + inclusion [51] */ + + ) + delete + set ending_removed + ) + define other_endings as ( + setlimit tomark p2 for ([substring]) + among( + 'mpi' 'mpa' 'mp{a"}' + 'mmi' 'mma' 'mm{a"}' // Comparative forms [85] + (not 'po') //-improves things + 'impi' 'impa' 'imp{a"}' + 'immi' 'imma' 'imm{a"}' // Superlative forms [86] + 'eja' 'ej{a"}' // indicates agent [93.1B] + ) + delete + ) + define i_plural as ( // [26] + setlimit tomark p1 for ([substring]) + among( + 'i' 'j' + ) + delete + ) + define t_plural as ( // [26] + setlimit tomark p1 for ( + ['t'] test V1 + delete + ) + setlimit tomark p2 for ([substring]) + among( + 'mma' (not 'po') //-mmat endings + 'imma' //-immat endings + ) + delete + ) + define tidy as ( + setlimit tomark p1 for ( + do ( LONG and ([next] delete ) ) // undouble vowel + do ( [AEI] C delete ) // remove trailing a, a", e, i + do ( ['j'] 'o' or 'u' delete ) + do ( ['o'] 'j' delete ) + ) + goto non-V1 [C] -> x x delete // 
undouble consonant + ) +) + +define stem as ( + + do mark_regions + unset ending_removed + backwards ( + do particle_etc + do possessive + do case_ending + do other_endings + (ending_removed do i_plural) or do t_plural + do tidy + ) +) + diff --git a/contrib/snowball/algorithms/french.sbl b/contrib/snowball/algorithms/french.sbl new file mode 100644 index 0000000..5c4f32d --- /dev/null +++ b/contrib/snowball/algorithms/french.sbl @@ -0,0 +1,254 @@ +routines ( + prelude postlude mark_regions + RV R1 R2 + standard_suffix + i_verb_suffix + verb_suffix + residual_suffix + un_double + un_accent +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v keep_with_s ) + +stringescapes {} + +/* special characters */ + +stringdef a^ '{U+00E2}' // a-circumflex +stringdef a` '{U+00E0}' // a-grave +stringdef c, '{U+00E7}' // c-cedilla + +stringdef e" '{U+00EB}' // e-diaeresis (rare) +stringdef e' '{U+00E9}' // e-acute +stringdef e^ '{U+00EA}' // e-circumflex +stringdef e` '{U+00E8}' // e-grave +stringdef i" '{U+00EF}' // i-diaeresis +stringdef i^ '{U+00EE}' // i-circumflex +stringdef o^ '{U+00F4}' // o-circumflex +stringdef u^ '{U+00FB}' // u-circumflex +stringdef u` '{U+00F9}' // u-grave + +define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}' + +define prelude as repeat goto ( + + ( v [ ('u' ] v <- 'U') or + ('i' ] v <- 'I') or + ('y' ] <- 'Y') + ) + or + ( [ '{e"}' ] <- 'He' ) + or + ( [ '{i"}' ] <- 'Hi' ) + or + ( ['y'] v <- 'Y' ) + or + ( 'q' ['u'] <- 'U' ) +) + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v v next ) + or + among ( // this exception list begun Nov 2006 + 'par' // paris, parie, pari + 'col' // colis + 'tap' // tapis + // extensions possible here + ) + or + ( next gopast v ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + + [substring] among( + 'I' (<- 'i') + 'U' (<- 'u') + 'Y' (<- 'y') + 'He' (<- '{e"}') + 
'Hi' (<- '{i"}') + 'H' (delete) + '' (next) + ) +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define standard_suffix as ( + [substring] among( + + 'ance' 'iqUe' 'isme' 'able' 'iste' 'eux' + 'ances' 'iqUes' 'ismes' 'ables' 'istes' + ( R2 delete ) + 'atrice' 'ateur' 'ation' + 'atrices' 'ateurs' 'ations' + ( R2 delete + try ( ['ic'] (R2 delete) or <-'iqU' ) + ) + 'logie' + 'logies' + ( R2 <- 'log' ) + 'usion' 'ution' + 'usions' 'utions' + ( R2 <- 'u' ) + 'ence' + 'ences' + ( R2 <- 'ent' ) + 'ement' + 'ements' + ( + RV delete + try ( + [substring] among( + 'iv' (R2 delete ['at'] R2 delete) + 'eus' ((R2 delete) or (R1<-'eux')) + 'abl' 'iqU' + (R2 delete) + 'i{e`}r' 'I{e`}r' //) + (RV <-'i') //)--new 2 Sept 02 + ) + ) + ) + 'it{e'}' + 'it{e'}s' + ( + R2 delete + try ( + [substring] among( + 'abil' ((R2 delete) or <-'abl') + 'ic' ((R2 delete) or <-'iqU') + 'iv' (R2 delete) + ) + ) + ) + 'if' 'ive' + 'ifs' 'ives' + ( + R2 delete + try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' ) + ) + 'eaux' (<- 'eau') + 'aux' (R1 <- 'al') + 'euse' + 'euses'((R2 delete) or (R1<-'eux')) + + 'issement' + 'issements'(R1 non-v delete) // verbal + + // fail(...) below forces entry to verb_suffix. -ment typically + // follows the p.p., e.g 'confus{e'}ment'. 
+ + 'amment' (RV fail(<- 'ant')) + 'emment' (RV fail(<- 'ent')) + 'ment' + 'ments' (test(v RV) fail(delete)) + // v is e,i,u,{e'},I or U + ) + ) + + define i_verb_suffix as setlimit tomark pV for ( + [substring] among ( + '{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai' + 'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez' + 'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait' + 'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses' + 'issez' 'issiez' 'issions' 'issons' 'it' + (not 'H' non-v delete) + ) + ) + + define verb_suffix as setlimit tomark pV for ( + [substring] among ( + 'ions' + (R2 delete) + + '{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai' + 'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions' + 'erons' 'eront' 'ez' 'iez' + + // 'ons' //-best omitted + + (delete) + + '{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant' + 'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez' + 'assions' + (delete + try(['e'] delete) + ) + ) + ) + + define keep_with_s 'aiou{e`}s' + + define residual_suffix as ( + try(['s'] test ('Hi' or non-keep_with_s) delete) + setlimit tomark pV for ( + [substring] among( + 'ion' (R2 's' or 't' delete) + 'ier' 'i{e`}re' + 'Ier' 'I{e`}re' (<-'i') + 'e' (delete) + ) + ) + ) + + define un_double as ( + test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete + ) + + define un_accent as ( + atleast 1 non-v + [ '{e'}' or '{e`}' ] <-'e' + ) +) + +define stem as ( + + do prelude + do mark_regions + backwards ( + + do ( + ( + ( standard_suffix or + i_verb_suffix or + verb_suffix + ) + and + try( [ ('Y' ] <- 'i' ) or + ('{c,}'] <- 'c' ) + ) + ) or + residual_suffix + ) + + // try(['ent'] RV delete) // is best omitted + + do un_double + do un_accent + ) + do postlude +) + diff --git a/contrib/snowball/algorithms/german.sbl b/contrib/snowball/algorithms/german.sbl new file mode 100644 index 0000000..61f24ef --- /dev/null +++ b/contrib/snowball/algorithms/german.sbl @@ -0,0 +1,139 @@ 
+ +/* + Extra rule for -nisse ending added 11 Dec 2009 +*/ + +routines ( + prelude postlude + mark_regions + R1 R2 + standard_suffix +) + +externals ( stem ) + +integers ( p1 p2 x ) + +groupings ( v s_ending st_ending ) + +stringescapes {} + +/* special characters */ + +stringdef a" '{U+00E4}' +stringdef o" '{U+00F6}' +stringdef u" '{U+00FC}' +stringdef ss '{U+00DF}' + +define v 'aeiouy{a"}{o"}{u"}' + +define s_ending 'bdfghklmnrt' +define st_ending s_ending - 'r' + +define prelude as ( + + test repeat ( + ( + ['{ss}'] <- 'ss' + ) or next + ) + + repeat goto ( + v [('u'] v <- 'U') or + ('y'] v <- 'Y') + ) +) + +define mark_regions as ( + + $p1 = limit + $p2 = limit + + test(hop 3 setmark x) + + gopast v gopast non-v setmark p1 + try($p1 < x $p1 = x) // at least 3 + gopast v gopast non-v setmark p2 + +) + +define postlude as repeat ( + + [substring] among( + 'Y' (<- 'y') + 'U' (<- 'u') + '{a"}' (<- 'a') + '{o"}' (<- 'o') + '{u"}' (<- 'u') + '' (next) + ) + +) + +backwardmode ( + + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define standard_suffix as ( + do ( + [substring] R1 among( + 'em' 'ern' 'er' + ( delete + ) + 'e' 'en' 'es' + ( delete + try (['s'] 'nis' delete) + ) + 's' + ( s_ending delete + ) + ) + ) + do ( + [substring] R1 among( + 'en' 'er' 'est' + ( delete + ) + 'st' + ( st_ending hop 3 delete + ) + ) + ) + do ( + [substring] R2 among( + 'end' 'ung' + ( delete + try (['ig'] not 'e' R2 delete) + ) + 'ig' 'ik' 'isch' + ( not 'e' delete + ) + 'lich' 'heit' + ( delete + try ( + ['er' or 'en'] R1 delete + ) + ) + 'keit' + ( delete + try ( + [substring] R2 among( + 'lich' 'ig' + ( delete + ) + ) + ) + ) + ) + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards + do standard_suffix + do postlude +) diff --git a/contrib/snowball/algorithms/german2.sbl b/contrib/snowball/algorithms/german2.sbl new file mode 100644 index 0000000..47ff61e --- /dev/null +++ b/contrib/snowball/algorithms/german2.sbl @@ -0,0 +1,145 @@ + +/* + Extra 
rule for -nisse ending added 11 Dec 2009 +*/ + +routines ( + prelude postlude + mark_regions + R1 R2 + standard_suffix +) + +externals ( stem ) + +integers ( p1 p2 x ) + +groupings ( v s_ending st_ending ) + +stringescapes {} + +/* special characters */ + +stringdef a" '{U+00E4}' +stringdef o" '{U+00F6}' +stringdef u" '{U+00FC}' +stringdef ss '{U+00DF}' + +define v 'aeiouy{a"}{o"}{u"}' + +define s_ending 'bdfghklmnrt' +define st_ending s_ending - 'r' + +define prelude as ( + + test repeat goto ( + v [('u'] v <- 'U') or + ('y'] v <- 'Y') + ) + + repeat ( + [substring] among( + '{ss}' (<- 'ss') + 'ae' (<- '{a"}') + 'oe' (<- '{o"}') + 'ue' (<- '{u"}') + 'qu' (hop 2) + '' (next) + ) + ) + +) + +define mark_regions as ( + + $p1 = limit + $p2 = limit + + test(hop 3 setmark x) + + gopast v gopast non-v setmark p1 + try($p1 < x $p1 = x) // at least 3 + gopast v gopast non-v setmark p2 + +) + +define postlude as repeat ( + + [substring] among( + 'Y' (<- 'y') + 'U' (<- 'u') + '{a"}' (<- 'a') + '{o"}' (<- 'o') + '{u"}' (<- 'u') + '' (next) + ) + +) + +backwardmode ( + + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define standard_suffix as ( + do ( + [substring] R1 among( + 'em' 'ern' 'er' + ( delete + ) + 'e' 'en' 'es' + ( delete + try (['s'] 'nis' delete) + ) + 's' + ( s_ending delete + ) + ) + ) + do ( + [substring] R1 among( + 'en' 'er' 'est' + ( delete + ) + 'st' + ( st_ending hop 3 delete + ) + ) + ) + do ( + [substring] R2 among( + 'end' 'ung' + ( delete + try (['ig'] not 'e' R2 delete) + ) + 'ig' 'ik' 'isch' + ( not 'e' delete + ) + 'lich' 'heit' + ( delete + try ( + ['er' or 'en'] R1 delete + ) + ) + 'keit' + ( delete + try ( + [substring] R2 among( + 'lich' 'ig' + ( delete + ) + ) + ) + ) + ) + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards + do standard_suffix + do postlude +) diff --git a/contrib/snowball/algorithms/greek.sbl b/contrib/snowball/algorithms/greek.sbl new file mode 100644 index 0000000..02df6c3 --- /dev/null 
+++ b/contrib/snowball/algorithms/greek.sbl @@ -0,0 +1,706 @@ +// A stemmer for Modern Greek language, based on: +// +// Ntais, Georgios. Development of a Stemmer for the Greek +// Language. Diss. Royal Institute of Technology, 2006. +// https://sais.se/mthprize/2007/ntais2007.pdf +// +// Saroukos, Spyridon. Enhancing a Greek language stemmer. +// University of Tampere, 2008. +// https://tampub.uta.fi/bitstream/handle/10024/80480/gradu03463.pdf + +stringescapes {} + +stringdef a '{U+03B1}' // alpha +stringdef v '{U+03B2}' // beta +stringdef g '{U+03B3}' // gamma +stringdef d '{U+03B4}' // delta +stringdef e '{U+03B5}' // epsilon +stringdef z '{U+03B6}' // zeta +stringdef i '{U+03B7}' // eta +stringdef th '{U+03B8}' // theta +stringdef y '{U+03B9}' // iota +stringdef k '{U+03BA}' // kappa +stringdef l '{U+03BB}' // lamda +stringdef m '{U+03BC}' // mu +stringdef n '{U+03BD}' // nu +stringdef x '{U+03BE}' // xi +stringdef o '{U+03BF}' // omicron +stringdef p '{U+03C0}' // pi +stringdef r '{U+03C1}' // rho +stringdef ss '{U+03C2}' // sigma final +stringdef s '{U+03C3}' // sigma +stringdef t '{U+03C4}' // tau +stringdef u '{U+03C5}' // upsilon +stringdef f '{U+03C6}' // phi +stringdef ch '{U+03C7}' // chi +stringdef ps '{U+03C8}' // psi +stringdef oo '{U+03C9}' // omega + +stringdef A '{U+0391}' // Alpha +stringdef V '{U+0392}' // Beta +stringdef G '{U+0393}' // Gamma +stringdef D '{U+0394}' // Delta +stringdef E '{U+0395}' // Epsilon +stringdef Z '{U+0396}' // Zeta +stringdef I '{U+0397}' // Eta +stringdef Th '{U+0398}' // Theta +stringdef Y '{U+0399}' // Iota +stringdef K '{U+039A}' // Kappa +stringdef L '{U+039B}' // Lamda +stringdef M '{U+039C}' // Mu +stringdef N '{U+039D}' // Nu +stringdef X '{U+039E}' // Xi +stringdef O '{U+039F}' // Omicron +stringdef P '{U+03A0}' // Pi +stringdef R '{U+03A1}' // Rho +stringdef S '{U+03A3}' // Sigma +stringdef T '{U+03A4}' // Tau +stringdef U '{U+03A5}' // Upsilon +stringdef F '{U+03A6}' // Phi +stringdef Ch '{U+03A7}' // Chi 
+stringdef Ps '{U+03A8}' // Psi +stringdef Oo '{U+03A9}' // Omega + +stringdef Y: '{U+03AA}' // Iota with dialytika +stringdef U: '{U+03AB}' // Upsilon with dialytika + +stringdef a' '{U+03AC}' // alpha with tonos +stringdef e' '{U+03AD}' // epsilon with tonos +stringdef i' '{U+03AE}' // eta with tonos +stringdef y' '{U+03AF}' // iota with tonos +stringdef o' '{U+03CC}' // omicron with tonos +stringdef u' '{U+03CD}' // upsilon with tonos +stringdef oo' '{U+03CE}' // omega with tonos + +stringdef i:' '{U+0390}' // iota with dialytika and tonos +stringdef u:' '{U+03B0}' // upsilon with dialytika and tonos + +stringdef i: '{U+03CA}' // iota with dialytika +stringdef u: '{U+03CB}' // upsilon with dialytika + +stringdef A' '{U+0386}' // Alpha with tonos +stringdef E' '{U+0388}' // Epsilon with tonos +stringdef I' '{U+0389}' // Eta with tonos +stringdef Y' '{U+038A}' // Iota with tonos +stringdef O' '{U+038C}' // Omicron with tonos +stringdef U' '{U+038E}' // Upsilon with tonos +stringdef OO' '{U+038F}' // Omega with tonos + +externals ( stem ) + +booleans ( test1 ) + +groupings ( v v2 ) + +routines ( tolower has_min_length + steps1 steps2 steps3 steps4 steps5 steps6 steps7 + steps8 steps9 steps10 + step1 step2a step2b step2c step2d step3 step4 + step5a step5b step5c step5d step5e step5f + step5g step5h step5i + step5j step5k step5l step5m + step6 step7 ) + +define v '{a}{e}{i}{y}{o}{u}{oo}' +define v2 '{a}{e}{i}{y}{o}{oo}' + +backwardmode ( + define has_min_length as ( + $(len >= 3) + ) + + define tolower as ( + repeat ( + [substring] among ( + '{A}' (<- '{a}') + '{V}' (<- '{v}') + '{G}' (<- '{g}') + '{D}' (<- '{d}') + '{E}' (<- '{e}') + '{Z}' (<- '{z}') + '{I}' (<- '{i}') + '{Th}' (<- '{th}') + '{Y}' (<- '{y}') + '{K}' (<- '{k}') + '{L}' (<- '{l}') + '{M}' (<- '{m}') + '{N}' (<- '{n}') + '{X}' (<- '{x}') + '{O}' (<- '{o}') + '{P}' (<- '{p}') + '{R}' (<- '{r}') + '{S}' (<- '{s}') + '{T}' (<- '{t}') + '{U}' (<- '{u}') + '{F}' (<- '{f}') + '{Ch}' (<- '{ch}') + '{Ps}' (<- 
'{ps}') + '{Oo}' (<- '{oo}') + '{Y:}' (<- '{y}') + '{U:}' (<- '{u}') + '{a'}' (<- '{a}') + '{e'}' (<- '{e}') + '{i'}' (<- '{i}') + '{y'}' (<- '{y}') + '{o'}' (<- '{o}') + '{u'}' (<- '{u}') + '{oo'}' (<- '{oo}') + '{i:'}' (<- '{i}') + '{u:'}' (<- '{u}') + '{i:}' (<- '{i}') + '{u:}' (<- '{u}') + '{A'}' (<- '{a}') + '{E'}' (<- '{e}') + '{I'}' (<- '{i}') + '{Y'}' (<- '{y}') + '{O'}' (<- '{o}') + '{U'}' (<- '{u}') + '{OO'}' (<- '{oo}') + '{ss}' (<- '{s}') + '' (next) + ) + ) + ) + + define step1 as ( + [substring] among ( + '{f}{a}{g}{y}{a}' '{f}{a}{g}{y}{o}{u}' '{f}{a}{g}{y}{oo}{n}' (<- '{f}{a}') + '{s}{k}{a}{g}{y}{a}' '{s}{k}{a}{g}{y}{o}{u}' '{s}{k}{a}{g}{y}{oo}{n}' (<- '{s}{k}{a}') + '{o}{l}{o}{g}{y}{o}{u}' '{o}{l}{o}{g}{y}{a}' '{o}{l}{o}{g}{y}{oo}{n}' (<- '{o}{l}{o}') + '{s}{o}{g}{y}{o}{u}' '{s}{o}{g}{y}{a}' '{s}{o}{g}{y}{oo}{n}' (<- '{s}{o}') + '{t}{a}{t}{o}{g}{y}{a}' '{t}{a}{t}{o}{g}{y}{o}{u}' '{t}{a}{t}{o}{g}{y}{oo}{n}' (<- '{t}{a}{t}{o}') + '{k}{r}{e}{a}{s}' '{k}{r}{e}{a}{t}{o}{s}' '{k}{r}{e}{a}{t}{a}' '{k}{r}{e}{a}{t}{oo}{n}' (<- '{k}{r}{e}') + '{p}{e}{r}{a}{s}' '{p}{e}{r}{a}{t}{o}{s}' '{p}{e}{r}{a}{t}{i}' '{p}{e}{r}{a}{t}{a}' '{p}{e}{r}{a}{t}{oo}{n}' (<- '{p}{e}{r}') + '{t}{e}{r}{a}{s}' '{t}{e}{r}{a}{t}{o}{s}' '{t}{e}{r}{a}{t}{a}' '{t}{e}{r}{a}{t}{oo}{n}' (<- '{t}{e}{r}') + '{f}{oo}{s}' '{f}{oo}{t}{o}{s}' '{f}{oo}{t}{a}' '{f}{oo}{t}{oo}{n}' (<- '{f}{oo}') + '{k}{a}{th}{e}{s}{t}{oo}{s}' '{k}{a}{th}{e}{s}{t}{oo}{t}{o}{s}' '{k}{a}{th}{e}{s}{t}{oo}{t}{a}' '{k}{a}{th}{e}{s}{t}{oo}{t}{oo}{n}' (<- '{k}{a}{th}{e}{s}{t}') + '{g}{e}{g}{o}{n}{o}{s}' '{g}{e}{g}{o}{n}{o}{t}{o}{s}' '{g}{e}{g}{o}{n}{o}{t}{a}' '{g}{e}{g}{o}{n}{o}{t}{oo}{n}' (<- '{g}{e}{g}{o}{n}') + ) + unset test1 + ) + + define steps1 as ( + [substring] among ( + '{y}{z}{a}' '{y}{z}{e}{s}' '{y}{z}{e}' '{y}{z}{a}{m}{e}' '{y}{z}{a}{t}{e}' '{y}{z}{a}{n}' '{y}{z}{a}{n}{e}' '{y}{z}{oo}' '{y}{z}{e}{y}{s}' '{y}{z}{e}{y}' + '{y}{z}{o}{u}{m}{e}' '{y}{z}{e}{t}{e}' '{y}{z}{o}{u}{n}' '{y}{z}{o}{u}{n}{e}' ( + delete + 
unset test1 + ([] substring atlimit among ( + '{a}{n}{a}{m}{p}{a}' '{e}{m}{p}{a}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{p}{a}' '{p}{e}{r}{y}{p}{a}' '{a}{th}{r}{o}' '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' + (<- '{y}') + )) or + ([] substring atlimit among ( + '{m}{a}{r}{k}' '{k}{o}{r}{n}' '{a}{m}{p}{a}{r}' '{a}{r}{r}' '{v}{a}{th}{u}{r}{y}' '{v}{a}{r}{k}' '{v}' '{v}{o}{l}{v}{o}{r}' '{g}{k}{r}' + '{g}{l}{u}{k}{o}{r}' '{g}{l}{u}{k}{u}{r}' '{y}{m}{p}' '{l}' '{l}{o}{u}' '{m}{a}{r}' '{m}' '{p}{r}' '{m}{p}{r}' '{p}{o}{l}{u}{r}' '{p}' + '{r}' '{p}{y}{p}{e}{r}{o}{r}' + (<- '{y}{z}') + )) + ) + ) + ) + + define steps2 as ( + [substring] among ( + '{oo}{th}{i}{k}{a}' '{oo}{th}{i}{k}{e}{s}' '{oo}{th}{i}{k}{e}' '{oo}{th}{i}{k}{a}{m}{e}' '{oo}{th}{i}{k}{a}{t}{e}' '{oo}{th}{i}{k}{a}{n}' '{oo}{th}{i}{k}{a}{n}{e}' ( + delete + unset test1 + [] substring atlimit among ( + '{a}{l}' '{v}{y}' '{e}{n}' '{u}{ps}' '{l}{y}' '{z}{oo}' '{s}' '{ch}' (<- '{oo}{n}') + ) + ) + ) + ) + + define steps3 as ( + [substring] among ( + '{y}{s}{a}' '{y}{s}{e}{s}' '{y}{s}{e}' '{y}{s}{a}{m}{e}' '{y}{s}{a}{t}{e}' '{y}{s}{a}{n}' '{y}{s}{a}{n}{e}' ( + delete + unset test1 + ('{y}{s}{a}' atlimit <- '{y}{s}') or + ([] substring atlimit among ( + '{a}{n}{a}{m}{p}{a}' '{a}{th}{r}{o}' '{e}{m}{p}{a}' '{e}{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{e}{p}{e}' '{p}{e}{r}{y}{p}{a}' + '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' '{k}{l}{e}' '{ch}{a}{r}{t}{o}{p}{a}' '{e}{x}{a}{r}{ch}{a}' '{m}{e}{t}{e}{p}{e}' '{a}{p}{o}{k}{l}{e}' + '{a}{p}{e}{k}{l}{e}' '{e}{k}{l}{e}' '{p}{e}' + (<- '{y}') + )) or + ([] substring atlimit among ( + '{a}{n}' '{a}{f}' '{g}{e}' '{g}{y}{g}{a}{n}{t}{o}{a}{f}' '{g}{k}{e}' '{d}{i}{m}{o}{k}{r}{a}{t}' '{k}{o}{m}' '{g}{k}' '{m}' '{p}' + '{p}{o}{u}{k}{a}{m}' '{o}{l}{o}' '{l}{a}{r}' + (<- '{y}{s}') + )) + ) + ) + ) + + define steps4 as ( + [substring] among ( + '{y}{s}{oo}' '{y}{s}{e}{y}{s}' '{y}{s}{e}{y}' '{y}{s}{o}{u}{m}{e}' '{y}{s}{e}{t}{e}' '{y}{s}{o}{u}{n}' '{y}{s}{o}{u}{n}{e}' ( + 
delete + unset test1 + [] substring atlimit among ( + '{a}{n}{a}{m}{p}{a}' '{e}{m}{p}{a}' '{e}{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{e}{p}{e}' '{p}{e}{r}{y}{p}{a}' '{a}{th}{r}{o}' + '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' '{k}{l}{e}' '{ch}{a}{r}{t}{o}{p}{a}' '{e}{x}{a}{r}{ch}{a}' '{m}{e}{t}{e}{p}{e}' '{a}{p}{o}{k}{l}{e}' '{a}{p}{e}{k}{l}{e}' + '{e}{k}{l}{e}' '{p}{e}' + (<- '{y}') + ) + ) + ) + ) + + define steps5 as ( + [substring] among ( + '{y}{s}{t}{o}{s}' '{y}{s}{t}{o}{u}' '{y}{s}{t}{o}' '{y}{s}{t}{e}' '{y}{s}{t}{o}{y}' '{y}{s}{t}{oo}{n}' '{y}{s}{t}{o}{u}{s}' '{y}{s}{t}{i}' '{y}{s}{t}{i}{s}' + '{y}{s}{t}{a}' '{y}{s}{t}{e}{s}' ( + delete + unset test1 + ([] substring atlimit among ( + '{d}{a}{n}{e}' '{s}{u}{n}{a}{th}{r}{o}' '{k}{l}{e}' '{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{a}{s}{e}' '{p}{l}{e}' + (<- '{y}') + )) or + ([] substring atlimit among ( + '{m}' '{p}' '{a}{p}' '{a}{r}' '{i}{d}' '{k}{t}' '{s}{k}' '{s}{ch}' '{u}{ps}' '{f}{a}' '{ch}{r}' '{ch}{t}' '{a}{k}{t}' + '{a}{o}{r}' '{a}{s}{ch}' '{a}{t}{a}' '{a}{ch}{n}' '{a}{ch}{t}' '{g}{e}{m}' '{g}{u}{r}' '{e}{m}{p}' '{e}{u}{p}' '{e}{ch}{th}' '{i}{f}{a}' + '{k}{a}{th}' '{k}{a}{k}' '{k}{u}{l}' '{l}{u}{g}' '{m}{a}{k}' '{m}{e}{g}' '{t}{a}{ch}' '{f}{y}{l}' '{ch}{oo}{r}' + (<- '{y}{s}{t}') + )) + ) + ) + ) + + define steps6 as ( + [substring] among ( + '{y}{s}{m}{o}' '{y}{s}{m}{o}{y}' '{y}{s}{m}{o}{s}' '{y}{s}{m}{o}{u}' '{y}{s}{m}{o}{u}{s}' '{y}{s}{m}{oo}{n}' ( + delete + unset test1 + ([] substring atlimit among ( + '{s}{e}' '{m}{e}{t}{a}{s}{e}' '{m}{y}{k}{r}{o}{s}{e}' '{e}{g}{k}{l}{e}' '{a}{p}{o}{k}{l}{e}' + (<- '{y}{s}{m}') + )) or + ([] substring atlimit among ( + '{d}{a}{n}{e}' '{a}{n}{t}{y}{d}{a}{n}{e}' + (<- '{y}') + )) or + ([substring] among ( + '{a}{g}{n}{oo}{s}{t}{y}{k}' (<- '{a}{g}{n}{oo}{s}{t}') + '{a}{t}{o}{m}{y}{k}' (<- '{a}{t}{o}{m}') + '{g}{n}{oo}{s}{t}{y}{k}' (<- '{g}{n}{oo}{s}{t}') + '{e}{th}{n}{y}{k}' (<- '{e}{th}{n}') + '{e}{k}{l}{e}{k}{t}{y}{k}' (<- '{e}{k}{l}{e}{k}{t}') + 
'{s}{k}{e}{p}{t}{y}{k}' (<- '{s}{k}{e}{p}{t}') + '{t}{o}{p}{y}{k}' (<- '{t}{o}{p}') + '{a}{l}{e}{x}{a}{n}{d}{r}{y}{n}' (<- '{a}{l}{e}{x}{a}{n}{d}{r}') + '{v}{u}{z}{a}{n}{t}{y}{n}' (<- '{v}{u}{z}{a}{n}{t}') + '{th}{e}{a}{t}{r}{y}{n}' (<- '{th}{e}{a}{t}{r}') + )) + ) + ) + ) + + define steps7 as ( + [substring] among ( + '{a}{r}{a}{k}{y}' '{a}{r}{a}{k}{y}{a}' '{o}{u}{d}{a}{k}{y}' '{o}{u}{d}{a}{k}{y}{a}' ( + delete + unset test1 + [] substring atlimit among ( + '{s}' '{ch}' + (<- '{a}{r}{a}{k}') + ) + ) + ) + ) + + define steps8 as ( + [substring] among ( + '{a}{k}{y}' '{a}{k}{y}{a}' '{y}{t}{s}{a}' '{y}{t}{s}{a}{s}' '{y}{t}{s}{e}{s}' '{y}{t}{s}{oo}{n}' '{a}{r}{a}{k}{y}' '{a}{r}{a}{k}{y}{a}' ( + delete + unset test1 + ([] substring atlimit among ( + '{v}{a}{m}{v}' '{v}{r}' '{k}{a}{y}{m}' '{k}{o}{n}' '{k}{o}{r}' '{l}{a}{v}{r}' '{l}{o}{u}{l}' '{m}{e}{r}' '{m}{o}{u}{s}{t}' + '{n}{a}{g}{k}{a}{s}' '{p}{l}' '{r}' '{r}{u}' '{s}' '{s}{k}' '{s}{o}{k}' '{s}{p}{a}{n}' '{t}{z}' '{f}{a}{r}{m}' '{ch}' '{k}{a}{p}{a}{k}' + '{a}{l}{y}{s}{f}' '{a}{m}{v}{r}' '{a}{n}{th}{r}' '{k}' '{f}{u}{l}' '{k}{a}{t}{r}{a}{p}' '{k}{l}{y}{m}' '{m}{a}{l}' '{s}{l}{o}{v}' '{f}' + '{s}{f}' '{t}{s}{e}{ch}{o}{s}{l}{o}{v}' + (<- '{a}{k}') + )) or + ([] substring atlimit among ( + '{v}' '{v}{a}{l}' '{g}{y}{a}{n}' '{g}{l}' '{z}' '{i}{g}{o}{u}{m}{e}{n}' '{k}{a}{r}{d}' '{k}{o}{n}' '{m}{a}{k}{r}{u}{n}' '{n}{u}{f}' + '{p}{a}{t}{e}{r}' '{p}' '{s}{k}' '{t}{o}{s}' '{t}{r}{y}{p}{o}{l}' + (<- '{y}{t}{s}') + )) or + ([] '{k}{o}{r}' <- '{y}{t}{s}') + ) + ) + ) + + define steps9 as ( + [substring] among ( + '{y}{d}{y}{o}' '{y}{d}{y}{a}' '{y}{d}{y}{oo}{n}' ( + delete + unset test1 + ([] substring atlimit among ( + '{a}{y}{f}{n}' '{y}{r}' '{o}{l}{o}' '{ps}{a}{l}' (<- '{y}{d}') + )) or + ([] substring among ( + '{e}' '{p}{a}{y}{ch}{n}' (<- '{y}{d}') + )) + ) + ) + ) + + define steps10 as ( + [substring] among ( + '{y}{s}{k}{o}{s}' '{y}{s}{k}{o}{u}' '{y}{s}{k}{o}' '{y}{s}{k}{e}' ( + delete + unset test1 + [] substring atlimit 
among ( + '{d}' '{y}{v}' '{m}{i}{n}' '{r}' '{f}{r}{a}{g}{k}' '{l}{u}{k}' '{o}{v}{e}{l}' + (<- '{y}{s}{k}') + ) + ) + ) + ) + + define step2a as ( + [substring] among ( + '{a}{d}{e}{s}' '{a}{d}{oo}{n}' (delete) + ) + not ([substring] among ( + '{o}{k}' '{m}{a}{m}' '{m}{a}{n}' '{m}{p}{a}{m}{p}' '{p}{a}{t}{e}{r}' '{g}{y}{a}{g}{y}' '{n}{t}{a}{n}{t}' '{k}{u}{r}' '{th}{e}{y}' '{p}{e}{th}{e}{r}' + )) + insert '{a}{d}' + ) + + define step2b as ( + [substring] among ( + '{e}{d}{e}{s}' '{e}{d}{oo}{n}' (delete) + ) + [] substring among ( + '{o}{p}' '{y}{p}' '{e}{m}{p}' '{u}{p}' '{g}{i}{p}' '{d}{a}{p}' '{k}{r}{a}{s}{p}' '{m}{y}{l}' (<- '{e}{d}') + ) + ) + + define step2c as ( + [substring] among ( + '{o}{u}{d}{e}{s}' '{o}{u}{d}{oo}{n}' (delete) + ) + [] substring among ( + '{a}{r}{k}' '{k}{a}{l}{y}{a}{k}' '{p}{e}{t}{a}{l}' '{l}{y}{ch}' '{p}{l}{e}{x}' '{s}{k}' '{s}' '{f}{l}' '{f}{r}' '{v}{e}{l}' '{l}{o}{u}{l}' '{ch}{n}' + '{s}{p}' '{t}{r}{a}{g}' '{f}{e}' (<- '{o}{u}{d}') + ) + ) + + define step2d as ( + [substring] among ( + '{e}{oo}{s}' '{e}{oo}{n}' (delete unset test1) + ) + [] substring atlimit among ( + '{th}' '{d}' '{e}{l}' '{g}{a}{l}' '{n}' '{p}' '{y}{d}' '{p}{a}{r}' (<- '{e}') + ) + ) + + define step3 as ( + [substring] among ( + '{y}{a}' '{y}{o}{u}' '{y}{oo}{n}' (delete unset test1) + ) + ([] v <- '{y}') + ) + + define step4 as ( + [substring] among ( + '{y}{k}{a}' '{y}{k}{o}' '{y}{k}{o}{u}' '{y}{k}{oo}{n}' (delete unset test1) + ) + ([] v <- '{y}{k}') or + [] substring atlimit among ( + '{a}{l}' '{a}{d}' '{e}{n}{d}' '{a}{m}{a}{n}' '{a}{m}{m}{o}{ch}{a}{l}' '{i}{th}' '{a}{n}{i}{th}' '{a}{n}{t}{y}{d}' '{f}{u}{s}' '{v}{r}{oo}{m}' '{g}{e}{r}' + '{e}{x}{oo}{d}' '{k}{a}{l}{p}' '{k}{a}{l}{l}{y}{n}' '{k}{a}{t}{a}{d}' '{m}{o}{u}{l}' '{m}{p}{a}{n}' '{m}{p}{a}{g}{y}{a}{t}' '{m}{p}{o}{l}' '{m}{p}{o}{s}' + '{n}{y}{t}' '{x}{y}{k}' '{s}{u}{n}{o}{m}{i}{l}' '{p}{e}{t}{s}' '{p}{y}{t}{s}' '{p}{y}{k}{a}{n}{t}' '{p}{l}{y}{a}{t}{s}' '{p}{o}{s}{t}{e}{l}{n}' '{p}{r}{oo}{t}{o}{d}' + 
'{s}{e}{r}{t}' '{s}{u}{n}{a}{d}' '{t}{s}{a}{m}' '{u}{p}{o}{d}' '{f}{y}{l}{o}{n}' '{f}{u}{l}{o}{d}' '{ch}{a}{s}' + (<- '{y}{k}') + ) + ) + + define step5a as ( + do ('{a}{g}{a}{m}{e}' atlimit <- '{a}{g}{a}{m}') + do ( + [substring] among ( + '{a}{g}{a}{m}{e}' '{i}{s}{a}{m}{e}' '{o}{u}{s}{a}{m}{e}' '{i}{k}{a}{m}{e}' '{i}{th}{i}{k}{a}{m}{e}' (delete unset test1) + ) + ) + ['{a}{m}{e}'] + delete + unset test1 + [] substring atlimit among ( + '{a}{n}{a}{p}' '{a}{p}{o}{th}' '{a}{p}{o}{k}' '{a}{p}{o}{s}{t}' '{v}{o}{u}{v}' '{x}{e}{th}' '{o}{u}{l}' '{p}{e}{th}' '{p}{y}{k}{r}' '{p}{o}{t}' '{s}{y}{ch}' '{ch}' + (<- '{a}{m}') + ) + ) + + define step5b as ( + do ( + [substring] among ( + '{a}{g}{a}{n}{e}' '{i}{s}{a}{n}{e}' '{o}{u}{s}{a}{n}{e}' '{y}{o}{n}{t}{a}{n}{e}' '{y}{o}{t}{a}{n}{e}' '{y}{o}{u}{n}{t}{a}{n}{e}' '{o}{n}{t}{a}{n}{e}' '{o}{t}{a}{n}{e}' + '{o}{u}{n}{t}{a}{n}{e}' '{i}{k}{a}{n}{e}' '{i}{th}{i}{k}{a}{n}{e}' ( + delete + unset test1 + [] substring atlimit among ( + '{t}{r}' '{t}{s}' (<- '{a}{g}{a}{n}') + ) + ) + ) + ) + ['{a}{n}{e}'] + delete + unset test1 + ([] v2 <- '{a}{n}') or + [] substring atlimit among ( + '{v}{e}{t}{e}{r}' '{v}{o}{u}{l}{k}' '{v}{r}{a}{ch}{m}' '{g}' '{d}{r}{a}{d}{o}{u}{m}' + '{th}' '{k}{a}{l}{p}{o}{u}{z}' '{k}{a}{s}{t}{e}{l}' '{k}{o}{r}{m}{o}{r}' '{l}{a}{o}{p}{l}' '{m}{oo}{a}{m}{e}{th}' + '{m}' '{m}{o}{u}{s}{o}{u}{l}{m}' '{n}' '{o}{u}{l}' '{p}' '{p}{e}{l}{e}{k}' '{p}{l}' '{p}{o}{l}{y}{s}' + '{p}{o}{r}{t}{o}{l}' '{s}{a}{r}{a}{k}{a}{t}{s}' '{s}{o}{u}{l}{t}' '{t}{s}{a}{r}{l}{a}{t}' '{o}{r}{f}' + '{t}{s}{y}{g}{g}' '{t}{s}{o}{p}' '{f}{oo}{t}{o}{s}{t}{e}{f}' '{ch}' '{ps}{u}{ch}{o}{p}{l}' '{a}{g}' + '{g}{a}{l}' '{g}{e}{r}' '{d}{e}{k}' '{d}{y}{p}{l}' '{a}{m}{e}{r}{y}{k}{a}{n}' '{o}{u}{r}' '{p}{y}{th}' + '{p}{o}{u}{r}{y}{t}' '{s}' '{z}{oo}{n}{t}' '{y}{k}' '{k}{a}{s}{t}' '{k}{o}{p}' '{l}{y}{ch}' + '{l}{o}{u}{th}{i}{r}' '{m}{a}{y}{n}{t}' '{m}{e}{l}' '{s}{y}{g}' '{s}{p}' '{s}{t}{e}{g}' '{t}{r}{a}{g}' + '{t}{s}{a}{g}' '{f}' '{e}{r}' '{a}{d}{a}{p}' 
'{a}{th}{y}{g}{g}' '{a}{m}{i}{ch}' '{a}{n}{y}{k}' + '{a}{n}{o}{r}{g}' '{a}{p}{i}{g}' '{a}{p}{y}{th}' '{a}{t}{s}{y}{g}{g}' '{v}{a}{s}' '{v}{a}{s}{k}' + '{v}{a}{th}{u}{g}{a}{l}' '{v}{y}{o}{m}{i}{ch}' '{v}{r}{a}{ch}{u}{k}' '{d}{y}{a}{t}' '{d}{y}{a}{f}' '{e}{n}{o}{r}{g}' + '{th}{u}{s}' '{k}{a}{p}{n}{o}{v}{y}{o}{m}{i}{ch}' '{k}{a}{t}{a}{g}{a}{l}' '{k}{l}{y}{v}' '{k}{o}{y}{l}{a}{r}{f}' + '{l}{y}{v}' '{m}{e}{g}{l}{o}{v}{y}{o}{m}{i}{ch}' '{m}{y}{k}{r}{o}{v}{y}{o}{m}{i}{ch}' '{n}{t}{a}{v}' + '{x}{i}{r}{o}{k}{l}{y}{v}' '{o}{l}{y}{g}{o}{d}{a}{m}' '{o}{l}{o}{g}{a}{l}' '{p}{e}{n}{t}{a}{r}{f}' '{p}{e}{r}{i}{f}' + '{p}{e}{r}{y}{t}{r}' '{p}{l}{a}{t}' '{p}{o}{l}{u}{d}{a}{p}' '{p}{o}{l}{u}{m}{i}{ch}' '{s}{t}{e}{f}' '{t}{a}{v}' + '{t}{e}{t}' '{u}{p}{e}{r}{i}{f}' '{u}{p}{o}{k}{o}{p}' '{ch}{a}{m}{i}{l}{o}{d}{a}{p}' '{ps}{i}{l}{o}{t}{a}{v}' + (<- '{a}{n}') + ) + ) + + define step5c as ( + do ( + [substring] among ( + '{i}{s}{e}{t}{e}' (delete unset test1) + ) + ) + ['{e}{t}{e}'] + delete + unset test1 + ([] v2 <- '{e}{t}') or + ([] substring among ( + '{o}{d}' '{a}{y}{r}' '{f}{o}{r}' '{t}{a}{th}' '{d}{y}{a}{th}' '{s}{ch}' '{e}{n}{d}' '{e}{u}{r}' '{t}{y}{th}' '{u}{p}{e}{r}{th}' + '{r}{a}{th}' '{e}{n}{th}' '{r}{o}{th}' '{s}{th}' '{p}{u}{r}' '{a}{y}{n}' '{s}{u}{n}{d}' '{s}{u}{n}' '{s}{u}{n}{th}' '{ch}{oo}{r}' + '{p}{o}{n}' '{v}{r}' '{k}{a}{th}' '{e}{u}{th}' '{e}{k}{th}' '{n}{e}{t}' '{r}{o}{n}' '{a}{r}{k}' '{v}{a}{r}' '{v}{o}{l}' '{oo}{f}{e}{l}' + (<- '{e}{t}') + )) or + [] substring atlimit among ( + '{a}{v}{a}{r}' '{v}{e}{n}' '{e}{n}{a}{r}' '{a}{v}{r}' '{a}{d}' '{a}{th}' '{a}{n}' '{a}{p}{l}' '{v}{a}{r}{o}{n}' '{n}{t}{r}' '{s}{k}' '{k}{o}{p}' + '{m}{p}{o}{r}' '{n}{y}{f}' '{p}{a}{g}' '{p}{a}{r}{a}{k}{a}{l}' '{s}{e}{r}{p}' '{s}{k}{e}{l}' '{s}{u}{r}{f}' '{t}{o}{k}' '{u}' '{d}' '{e}{m}' + '{th}{a}{r}{r}' '{th}' + (<- '{e}{t}') + ) + ) + + define step5d as ( + [substring] among ( + '{o}{n}{t}{a}{s}' '{oo}{n}{t}{a}{s}' ( + delete + unset test1 + ([] '{a}{r}{ch}' atlimit <- '{o}{n}{t}') or + ([] 
'{k}{r}{e}' <- '{oo}{n}{t}') + ) + ) + ) + + define step5e as ( + [substring] among ( + '{o}{m}{a}{s}{t}{e}' '{y}{o}{m}{a}{s}{t}{e}' ( + delete + unset test1 + ([] '{o}{n}' atlimit <- '{o}{m}{a}{s}{t}') + ) + ) + ) + + define step5f as ( + do ( + ['{y}{e}{s}{t}{e}'] + delete + unset test1 + [] substring atlimit among ( + '{p}' '{a}{p}' '{s}{u}{m}{p}' '{a}{s}{u}{m}{p}' '{a}{k}{a}{t}{a}{p}' '{a}{m}{e}{t}{a}{m}{f}' (<- '{y}{e}{s}{t}') + ) + ) + ['{e}{s}{t}{e}'] + delete + unset test1 + [] substring atlimit among ( + '{a}{l}' '{a}{r}' '{e}{k}{t}{e}{l}' '{z}' '{m}' '{x}' '{p}{a}{r}{a}{k}{a}{l}' '{p}{r}{o}' '{n}{y}{s}' + (<- '{y}{e}{s}{t}') + ) + ) + + define step5g as ( + do ( + [substring] among ( + '{i}{th}{i}{k}{a}' '{i}{th}{i}{k}{e}{s}' '{i}{th}{i}{k}{e}' (delete unset test1) + ) + ) + [substring] among ( + '{i}{k}{a}' '{i}{k}{e}{s}' '{i}{k}{e}' ( + delete + unset test1 + ([] substring among ( + '{s}{k}{oo}{l}' '{s}{k}{o}{u}{l}' '{n}{a}{r}{th}' '{s}{f}' '{o}{th}' '{p}{y}{th}' (<- '{i}{k}') + )) or + ([] substring atlimit among ( + '{d}{y}{a}{th}' '{th}' '{p}{a}{r}{a}{k}{a}{t}{a}{th}' '{p}{r}{o}{s}{th}' '{s}{u}{n}{th}' (<- '{i}{k}') + )) + ) + ) + ) + + define step5h as ( + [substring] among ( + '{o}{u}{s}{a}' '{o}{u}{s}{e}{s}' '{o}{u}{s}{e}' ( + delete + unset test1 + ([] substring among ( + '{p}{o}{d}{a}{r}' '{v}{l}{e}{p}' '{p}{a}{n}{t}{a}{ch}' '{f}{r}{u}{d}' '{m}{a}{n}{t}{y}{l}' '{m}{a}{l}{l}' '{k}{u}{m}{a}{t}' '{l}{a}{ch}' '{l}{i}{g}' + '{f}{a}{g}' '{o}{m}' '{p}{r}{oo}{t}' (<- '{o}{u}{s}') + + )) or + ([] substring atlimit among ( + '{f}{a}{r}{m}{a}{k}' '{ch}{a}{d}' '{a}{g}{k}' '{a}{n}{a}{r}{r}' '{v}{r}{o}{m}' '{e}{k}{l}{y}{p}' '{l}{a}{m}{p}{y}{d}' '{l}{e}{ch}' '{m}' '{p}{a}{t}' + '{r}' '{l}' '{m}{e}{d}' '{m}{e}{s}{a}{z}' '{u}{p}{o}{t}{e}{y}{n}' '{a}{m}' '{a}{y}{th}' '{a}{n}{i}{k}' '{d}{e}{s}{p}{o}{z}' + '{e}{n}{d}{y}{a}{f}{e}{r}' '{d}{e}' '{d}{e}{u}{t}{e}{r}{e}{u}' '{k}{a}{th}{a}{r}{e}{u}' '{p}{l}{e}' '{t}{s}{a}' + (<- '{o}{u}{s}') + )) + ) + ) + ) + + define 
step5i as ( + [substring] among ( + '{a}{g}{a}' '{a}{g}{e}{s}' '{a}{g}{e}' ( + delete + unset test1 + ([] '{k}{o}{l}{l}' <- '{a}{g}') or ( + not ([substring] among ('{ps}{o}{f}' '{n}{a}{u}{l}{o}{ch}')) + ([] substring among ( + '{o}{f}' '{p}{e}{l}' '{ch}{o}{r}{t}' '{l}{l}' '{s}{f}' '{r}{p}' '{f}{r}' '{p}{r}' '{l}{o}{ch}' '{s}{m}{i}{n}' + (<- '{a}{g}') + )) or + ([] substring atlimit among ( + '{a}{v}{a}{s}{t}' '{p}{o}{l}{u}{f}' '{a}{d}{i}{f}' '{p}{a}{m}{f}' '{r}' '{a}{s}{p}' '{a}{f}' '{a}{m}{a}{l}' '{a}{m}{a}{l}{l}{y}' + '{a}{n}{u}{s}{t}' '{a}{p}{e}{r}' '{a}{s}{p}{a}{r}' '{a}{ch}{a}{r}' '{d}{e}{r}{v}{e}{n}' '{d}{r}{o}{s}{o}{p}' '{x}{e}{f}' '{n}{e}{o}{p}' + '{n}{o}{m}{o}{t}' '{o}{l}{o}{p}' '{o}{m}{o}{t}' '{p}{r}{o}{s}{t}' '{p}{r}{o}{s}{oo}{p}{o}{p}' '{s}{u}{m}{p}' '{s}{u}{n}{t}' '{t}' '{u}{p}{o}{t}' + '{ch}{a}{r}' '{a}{e}{y}{p}' '{a}{y}{m}{o}{s}{t}' '{a}{n}{u}{p}' '{a}{p}{o}{t}' '{a}{r}{t}{y}{p}' '{d}{y}{a}{t}' '{e}{n}' '{e}{p}{y}{t}' + '{k}{r}{o}{k}{a}{l}{o}{p}' '{s}{y}{d}{i}{r}{o}{p}' '{l}' '{n}{a}{u}' '{o}{u}{l}{a}{m}' '{o}{u}{r}' '{p}' '{t}{r}' '{m}' + (<- '{a}{g}') + )) + ) + ) + ) + ) + + define step5j as ( + [substring] among ( + '{i}{s}{e}' '{i}{s}{o}{u}' '{i}{s}{a}' (delete unset test1) + ) + [] substring atlimit among ( + '{n}' '{ch}{e}{r}{s}{o}{n}' '{d}{oo}{d}{e}{k}{a}{n}' '{e}{r}{i}{m}{o}{n}' '{m}{e}{g}{a}{l}{o}{n}' '{e}{p}{t}{a}{n}' (<- '{i}{s}') + ) + ) + + define step5k as ( + [substring] among ( + '{i}{s}{t}{e}' (delete unset test1) + ) + [] substring atlimit among ( + '{a}{s}{v}' '{s}{v}' '{a}{ch}{r}' '{ch}{r}' '{a}{p}{l}' '{a}{e}{y}{m}{n}' '{d}{u}{s}{ch}{r}' '{e}{u}{ch}{r}' '{k}{o}{y}{n}{o}{ch}{r}' '{p}{a}{l}{y}{m}{ps}' + (<- '{i}{s}{t}') + ) + ) + + define step5l as ( + [substring] among ( + '{o}{u}{n}{e}' '{i}{s}{o}{u}{n}{e}' '{i}{th}{o}{u}{n}{e}' (delete unset test1) + ) + [] substring atlimit among ( + '{n}' '{r}' '{s}{p}{y}' '{s}{t}{r}{a}{v}{o}{m}{o}{u}{t}{s}' '{k}{a}{k}{o}{m}{o}{u}{t}{s}' '{e}{x}{oo}{n}' (<- '{o}{u}{n}') + ) + ) + + define 
step5m as ( + [substring] among ( + '{o}{u}{m}{e}' '{i}{s}{o}{u}{m}{e}' '{i}{th}{o}{u}{m}{e}' (delete unset test1) + ) + [] substring atlimit among ( + '{p}{a}{r}{a}{s}{o}{u}{s}' '{f}' '{ch}' '{oo}{r}{y}{o}{p}{l}' '{a}{z}' '{a}{l}{l}{o}{s}{o}{u}{s}' '{a}{s}{o}{u}{s}' + (<- '{o}{u}{m}') + ) + ) + + define step6 as ( + do ( + [substring] among ( + '{m}{a}{t}{a}' '{m}{a}{t}{oo}{n}' '{m}{a}{t}{o}{s}' (<- '{m}{a}') + ) + ) + test1 + [substring] among ( + '{a}' '{a}{g}{a}{t}{e}' '{a}{g}{a}{n}' '{a}{e}{y}' '{a}{m}{a}{y}' '{a}{n}' '{a}{s}' '{a}{s}{a}{y}' '{a}{t}{a}{y}' '{a}{oo}' '{e}' '{e}{y}' + '{e}{y}{s}' '{e}{y}{t}{e}' '{e}{s}{a}{y}' '{e}{s}' '{e}{t}{a}{y}' '{y}' '{y}{e}{m}{a}{y}' '{y}{e}{m}{a}{s}{t}{e}' '{y}{e}{t}{a}{y}' '{y}{e}{s}{a}{y}' + '{y}{e}{s}{a}{s}{t}{e}' '{y}{o}{m}{a}{s}{t}{a}{n}' '{y}{o}{m}{o}{u}{n}' '{y}{o}{m}{o}{u}{n}{a}' '{y}{o}{n}{t}{a}{n}' '{y}{o}{n}{t}{o}{u}{s}{a}{n}' '{y}{o}{s}{a}{s}{t}{a}{n}' + '{y}{o}{s}{a}{s}{t}{e}' '{y}{o}{s}{o}{u}{n}' '{y}{o}{s}{o}{u}{n}{a}' '{y}{o}{t}{a}{n}' '{y}{o}{u}{m}{a}' '{y}{o}{u}{m}{a}{s}{t}{e}' '{y}{o}{u}{n}{t}{a}{y}' + '{y}{o}{u}{n}{t}{a}{n}' '{i}' '{i}{d}{e}{s}' '{i}{d}{oo}{n}' '{i}{th}{e}{y}' '{i}{th}{e}{y}{s}' '{i}{th}{e}{y}{t}{e}' '{i}{th}{i}{k}{a}{t}{e}' '{i}{th}{i}{k}{a}{n}' + '{i}{th}{o}{u}{n}' '{i}{th}{oo}' '{i}{k}{a}{t}{e}' '{i}{k}{a}{n}' '{i}{s}' '{i}{s}{a}{n}' '{i}{s}{a}{t}{e}' '{i}{s}{e}{y}' '{i}{s}{e}{s}' '{i}{s}{o}{u}{n}' + '{i}{s}{oo}' '{o}' '{o}{y}' '{o}{m}{a}{y}' '{o}{m}{a}{s}{t}{a}{n}' '{o}{m}{o}{u}{n}' '{o}{m}{o}{u}{n}{a}' '{o}{n}{t}{a}{y}' '{o}{n}{t}{a}{n}' + '{o}{n}{t}{o}{u}{s}{a}{n}' '{o}{s}' '{o}{s}{a}{s}{t}{a}{n}' '{o}{s}{a}{s}{t}{e}' '{o}{s}{o}{u}{n}' '{o}{s}{o}{u}{n}{a}' '{o}{t}{a}{n}' '{o}{u}' '{o}{u}{m}{a}{y}' + '{o}{u}{m}{a}{s}{t}{e}' '{o}{u}{n}' '{o}{u}{n}{t}{a}{y}' '{o}{u}{n}{t}{a}{n}' '{o}{u}{s}' '{o}{u}{s}{a}{n}' '{o}{u}{s}{a}{t}{e}' '{u}' '{u}{s}' '{oo}' + '{oo}{n}' (delete) + ) + ) + + define step7 as ( + [substring] among ( + '{e}{s}{t}{e}{r}' '{e}{s}{t}{a}{t}' '{o}{t}{e}{r}' 
'{o}{t}{a}{t}' '{u}{t}{e}{r}' '{u}{t}{a}{t}' '{oo}{t}{e}{r}' '{oo}{t}{a}{t}' (delete) + ) + ) +) + +define stem as ( + backwards ( + do tolower + has_min_length + set test1 + do step1 + do steps1 + do steps2 + do steps3 + do steps4 + do steps5 + do steps6 + do steps7 + do steps8 + do steps9 + do steps10 + do step2a + do step2b + do step2c + do step2d + do step3 + do step4 + do step5a + do step5b + do step5c + do step5d + do step5e + do step5f + do step5g + do step5h + do step5j + do step5i + do step5k + do step5l + do step5m + do step6 + do step7 + ) +) diff --git a/contrib/snowball/algorithms/hindi.sbl b/contrib/snowball/algorithms/hindi.sbl new file mode 100644 index 0000000..bfdfac0 --- /dev/null +++ b/contrib/snowball/algorithms/hindi.sbl @@ -0,0 +1,323 @@ +// An implementation of "A Lightweight Stemmer for Hindi": +// http://www.kbcs.in/downloads/papers/StmmerHindi.pdf + +externals ( stem ) + +stringescapes {} + +// The transliteration scheme used for our stringdefs matches that used in the +// paper, as documented in the appendix. It appears to match the WX notation +// (https://en.wikipedia.org/wiki/WX_notation) except that WX apparently +// uses 'z' for Anunasika whereas the paper uses Mh. +// +// We discriminate dependent vowels by adding a leading "_" to their stringdef +// names (mnemonic: the _ signifies removing the implicit a from the preceding +// character). 
+ +// Vowels and sonorants: +stringdef a '{U+0905}' +stringdef A '{U+0906}' +stringdef i '{U+0907}' +stringdef I '{U+0908}' +stringdef u '{U+0909}' +stringdef U '{U+090A}' +stringdef q '{U+090B}' +stringdef e '{U+090F}' +stringdef E '{U+0910}' +stringdef o '{U+0913}' +stringdef O '{U+0914}' + +// Vowel signs: +stringdef _A '{U+093E}' +stringdef _i '{U+093F}' +stringdef _I '{U+0940}' +stringdef _u '{U+0941}' +stringdef _U '{U+0942}' +stringdef _q '{U+0943}' +stringdef _e '{U+0947}' +stringdef _E '{U+0948}' +stringdef _o '{U+094B}' +stringdef _O '{U+094C}' + +// Diacritics: +stringdef M '{U+0902}' +stringdef H '{U+0903}' +stringdef Mh '{U+0901}' +stringdef Z '{U+093C}' // Nukta +stringdef virama '{U+094D}' + +// Velar consonants: +stringdef k '{U+0915}' +stringdef K '{U+0916}' +stringdef g '{U+0917}' +stringdef G '{U+0918}' +stringdef f '{U+0919}' + +// Palatal consonants: +stringdef c '{U+091A}' +stringdef C '{U+091B}' +stringdef j '{U+091C}' +stringdef J '{U+091D}' +stringdef F '{U+091E}' + +// Retroflex consonants: +stringdef t '{U+091F}' +stringdef T '{U+0920}' +stringdef d '{U+0921}' +stringdef D '{U+0922}' +stringdef N '{U+0923}' + +// Dental consonants: +stringdef w '{U+0924}' +stringdef W '{U+0925}' +stringdef x '{U+0926}' +stringdef X '{U+0927}' +stringdef n '{U+0928}' + +// Labial consonants: +stringdef p '{U+092A}' +stringdef P '{U+092B}' +stringdef b '{U+092C}' +stringdef B '{U+092D}' +stringdef m '{U+092E}' + +// Semi-vowels: +stringdef y '{U+092F}' +stringdef r '{U+0930}' +stringdef l '{U+0932}' +stringdef v '{U+0935}' + +// Fricatives: +stringdef S '{U+0936}' +stringdef R '{U+0937}' +stringdef s '{U+0938}' +stringdef h '{U+0939}' + +stringdef lY '{U+0933}' + +// Precomposed characters - letters + nukta: +stringdef nZ '{U+0929}' // ≡ {n}{Z} +stringdef rZ '{U+0931}' // ≡ {r}{Z} +stringdef lYZ '{U+0934}' // ≡ {lY}{Z} +stringdef kZ '{U+0958}' // ≡ {k}{Z} +stringdef KZ '{U+0959}' // ≡ {K}{Z} +stringdef gZ '{U+095A}' // ≡ {g}{Z} +stringdef jZ '{U+095B}' // ≡ 
{j}{Z} +stringdef dZ '{U+095C}' // ≡ {d}{Z} +stringdef DZ '{U+095D}' // ≡ {D}{Z} +stringdef PZ '{U+095E}' // ≡ {P}{Z} +stringdef yZ '{U+095F}' // ≡ {y}{Z} + +integers ( p ) + +groupings ( consonant ) + +routines ( CONSONANT ) + +define consonant '{k}{K}{g}{G}{f}' + + '{c}{C}{j}{J}{F}' + + '{t}{T}{d}{D}{N}' + + '{w}{W}{x}{X}{n}' + + '{p}{P}{b}{B}{m}' + + '{y}{r}{l}{v}' + + '{S}{R}{s}{h}' + + '{lY}' + + '{Z}' + // Nukta + // Precomposed characters - letter and nukta: + '{nZ}{rZ}{lYZ}{kZ}{KZ}{gZ}{jZ}{dZ}{DZ}{PZ}{yZ}' + +backwardmode ( define CONSONANT as ( consonant ) ) + +define stem as ( + test ( next setmark p ) + backwards ( + // We assume in this implementation that the whole word doesn't count + // as a valid suffix to remove, so we remove the longest suffix from + // the list which leaves at least one character. This change affects + // 47 words out of the 65,140 in the sample vocabulary from Hindi + // wikipedia. + setlimit tomark p for ([substring]) + among ( + // The list below is derived from figure 3 in the paper. + // + // We perform the stemming on the Devanagari characters rather than + // transliterating to Latin, so we have adapted the list below to + // reflect this by converting suffixes back to Devanagari as + // follows: + // + // * within the suffixes, "a" after a consonant is dropped since + // consonants have an implicit "a". + // + // * within the suffixes, a vowel other than "a" after a consonant + // is a dependent vowel (vowel sign); a vowel (including "a") + // after a non-consonant is an independent vowel. + // + // * to allow the vowel at the start of each suffix being dependent + // or independent, we include each suffix twice. For the + // dependent version, a leading "a" is dropped and we check that + // the suffix is preceded by a consonant (which will have an + // implicit "a"). 
+ // + // * we add '{a}', which is needed for the example given right at + // the end of section 5 to work (conflating BarawIya and + // BarawIyawA), and which 3.1 a.v strongly suggests should be in + // the list: + // + // Thus, the following suffix deletions (longest possible + // match) are required to reduce inflected forms of masculine + // nouns to a common stem: + // a A i [...] + // + // Adding '{a}' only affects 2 words out of the 65,140 in the + // sample vocabulary. + // + // * The transliterations of our stems would end with "a" when our + // stems end in a consonant, so we also include {virama} in the + // list of suffixes to remove (this affects 222 words from the + // sample vocabulary). + // + // We've also assumed that Mh in the suffix list always means {Mh} + // and never {M}{h}{virama}. Only one of the 65,140 words in the + // sample vocabulary stems differently due to this (and that word + // seems to be a typo). + + '{virama}' + + '{a}' + '{A}' + '{i}' + '{I}' + '{u}' + '{U}' + '{e}' + '{o}' + '{e}{M}' + '{o}{M}' + '{A}{M}' + '{u}{A}{M}' + '{u}{e}{M}' + '{u}{o}{M}' + '{A}{e}{M}' + '{A}{o}{M}' + '{i}{y}{_A}{M}' + '{i}{y}{_o}{M}' + '{A}{i}{y}{_A}{M}' + '{A}{i}{y}{_o}{M}' + '{A}{Mh}' + '{i}{y}{_A}{Mh}' + '{A}{i}{y}{_A}{Mh}' + '{a}{w}{_A}{e}{M}' + '{a}{w}{_A}{o}{M}' + '{a}{n}{_A}{e}{M}' + '{a}{n}{_A}{o}{M}' + '{a}{w}{_A}' + '{a}{w}{_I}' + '{I}{M}' + '{a}{w}{_I}{M}' + '{a}{w}{_e}' + '{A}{w}{_A}' + '{A}{w}{_I}' + '{A}{w}{_I}{M}' + '{A}{w}{_e}' + '{a}{n}{_A}' + '{a}{n}{_I}' + '{a}{n}{_e}' + '{A}{n}{_A}' + '{A}{n}{_e}' + '{U}{M}{g}{_A}' + '{U}{M}{g}{_I}' + '{A}{U}{M}{g}{_A}' + '{A}{U}{M}{g}{_I}' + '{e}{M}{g}{_e}' + '{e}{M}{g}{_I}' + '{A}{e}{M}{g}{_e}' + '{A}{e}{M}{g}{_I}' + '{o}{g}{_e}' + '{o}{g}{_I}' + '{A}{o}{g}{_e}' + '{A}{o}{g}{_I}' + '{e}{g}{_A}' + '{e}{g}{_I}' + '{A}{e}{g}{_A}' + '{A}{e}{g}{_I}' + '{A}{y}{_A}' + '{A}{e}' + '{A}{I}' + '{A}{I}{M}' + '{i}{e}' + '{A}{o}' + '{A}{i}{e}' + '{a}{k}{r}' + '{A}{k}{r}' + + '{_A}' + '{_i}' + '{_I}' + '{_u}' + 
'{_U}' + '{_e}' + '{_o}' + '{_e}{M}' + '{_o}{M}' + '{_A}{M}' + '{_u}{A}{M}' + '{_u}{e}{M}' + '{_u}{o}{M}' + '{_A}{e}{M}' + '{_A}{o}{M}' + '{_i}{y}{_A}{M}' + '{_i}{y}{_o}{M}' + '{_A}{i}{y}{_A}{M}' + '{_A}{i}{y}{_o}{M}' + '{_A}{Mh}' + '{_i}{y}{_A}{Mh}' + '{_A}{i}{y}{_A}{Mh}' + '{_I}{M}' + '{_A}{w}{_A}' + '{_A}{w}{_I}' + '{_A}{w}{_I}{M}' + '{_A}{w}{_e}' + '{_A}{n}{_A}' + '{_A}{n}{_e}' + '{_U}{M}{g}{_A}' + '{_U}{M}{g}{_I}' + '{_A}{U}{M}{g}{_A}' + '{_A}{U}{M}{g}{_I}' + '{_e}{M}{g}{_e}' + '{_e}{M}{g}{_I}' + '{_A}{e}{M}{g}{_e}' + '{_A}{e}{M}{g}{_I}' + '{_o}{g}{_e}' + '{_o}{g}{_I}' + '{_A}{o}{g}{_e}' + '{_A}{o}{g}{_I}' + '{_e}{g}{_A}' + '{_e}{g}{_I}' + '{_A}{e}{g}{_A}' + '{_A}{e}{g}{_I}' + '{_A}{y}{_A}' + '{_A}{e}' + '{_A}{I}' + '{_A}{I}{M}' + '{_i}{e}' + '{_A}{o}' + '{_A}{i}{e}' + '{_A}{k}{r}' + + /* Suffixes with a leading implicit a: */ + '{w}{_A}{e}{M}' CONSONANT + '{w}{_A}{o}{M}' CONSONANT + '{n}{_A}{e}{M}' CONSONANT + '{n}{_A}{o}{M}' CONSONANT + '{w}{_A}' CONSONANT + '{w}{_I}' CONSONANT + '{w}{_I}{M}' CONSONANT + '{w}{_e}' CONSONANT + '{n}{_A}' CONSONANT + '{n}{_I}' CONSONANT + '{n}{_e}' CONSONANT + '{k}{r}' CONSONANT + ) + delete + ) +) diff --git a/contrib/snowball/algorithms/hungarian.sbl b/contrib/snowball/algorithms/hungarian.sbl new file mode 100644 index 0000000..2d7885c --- /dev/null +++ b/contrib/snowball/algorithms/hungarian.sbl @@ -0,0 +1,241 @@ +/* +Hungarian Stemmer +Removes noun inflections +*/ + +routines ( + mark_regions + R1 + v_ending + case + case_special + case_other + plural + owned + sing_owner + plur_owner + instrum + factive + undouble + double +) + +externals ( stem ) + +integers ( p1 ) +groupings ( v ) + +stringescapes {} + +/* special characters */ + +stringdef a' '{U+00E1}' //a-acute +stringdef e' '{U+00E9}' //e-acute +stringdef i' '{U+00ED}' //i-acute +stringdef o' '{U+00F3}' //o-acute +stringdef o" '{U+00F6}' //o-umlaut +stringdef oq '{U+0151}' //o-double acute +stringdef u' '{U+00FA}' //u-acute +stringdef u" '{U+00FC}' //u-umlaut 
+stringdef uq '{U+0171}' //u-double acute + +define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}' + /* mark_regions: p1 defaults to limit. If the text at the cursor starts with a vowel, p1 is marked after the first non-vowel that follows (a listed digraph or 'dzs' is consumed as one unit, otherwise a single character); if it starts with a non-vowel, p1 is marked just past the first vowel. */ +define mark_regions as ( + + $p1 = limit + + (v goto non-v + among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next + setmark p1) + or + + (non-v gopast v setmark p1) +) + +backwardmode ( + /* R1: succeeds when p1 <= cursor. */ + define R1 as $p1 <= cursor + /* v_ending: in R1, replaces a trailing a-acute / e-acute with plain 'a' / 'e'. */ + define v_ending as ( + [substring] R1 among( + '{a'}' (<- 'a') + '{e'}' (<- 'e') + ) + ) + /* double: succeeds, without moving the cursor (test), when the text at the cursor matches one of the listed doubled consonants or doubled digraphs. */ + define double as ( + test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 'jj' 'kk' 'll' 'lly' 'mm' + 'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs') + ) + /* undouble: steps over one character with next, brackets the following single character with [hop 1] and deletes it. */ + define undouble as ( + next [hop 1] delete + ) + /* instrum: for an 'al' / 'el' ending in R1 whose action (double) succeeds, deletes the ending and then calls undouble. */ + define instrum as( + [substring] R1 among( + 'al' (double) + 'el' (double) + ) + delete + undouble + ) + + /* case: deletes the longest listed case ending that lies in R1, then calls v_ending on what remains. */ + define case as ( + [substring] R1 among( + 'ban' 'ben' + 'ba' 'be' + 'ra' 're' + 'nak' 'nek' + 'val' 'vel' + 't{o'}l' 't{oq}l' + 'r{o'}l' 'r{oq}l' + 'b{o'}l' 'b{oq}l' + 'hoz' 'hez' 'h{o"}z' + 'n{a'}l' 'n{e'}l' + 'ig' + 'at' 'et' 'ot' '{o"}t' + '{e'}rt' + 'k{e'}pp' 'k{e'}ppen' + 'kor' + 'ul' '{u"}l' + 'v{a'}' 'v{e'}' + 'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt' + 'k{e'}nt' + 'en' 'on' 'an' '{o"}n' + 'n' + 't' + ) + delete + v_ending + ) + /* case_special: in R1, replaces each listed ending with the plain 'e' or 'a' given in its action. */ + define case_special as( + [substring] R1 among( + '{e'}n' (<- 'e') + '{a'}n' (<- 'a') + '{a'}nk{e'}nt' (<- 'a') + ) + ) + /* case_other: in R1, deletes the unaccented -stul / -st(u-umlaut)l endings and replaces the accented-vowel variants with plain 'a' / 'e'. */ + define case_other as( + [substring] R1 among( + 'astul' 'est{u"}l' (delete) + 'stul' 'st{u"}l' (delete) + '{a'}stul' (<- 'a') + '{e'}st{u"}l' (<- 'e') + ) + ) + /* factive: same shape as instrum, but for the endings a-acute / e-acute: requires double, then deletes the ending and calls undouble. */ + define factive as( + [substring] R1 among( + '{a'}' (double) + '{e'}' (double) + ) + delete + undouble + ) + /* plural: in R1, rewrites a-acute+k / e-acute+k to 'a' / 'e' and deletes the other listed endings in k. */ + define plural as ( + [substring] R1 among( + '{a'}k' (<- 'a') + '{e'}k' (<- 'e') + '{o"}k' (delete) + 'ak' (delete) + 'ok' (delete) + 'ek' (delete) + 'k' (delete) + ) + ) + + define owned as ( + [substring] R1 among ( + 'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete) + '{e'}k{e'}' (<- 'e') + '{a'}k{e'}' (<- 'a') + 'k{e'}' (delete) + '{e'}{e'}i' (<- 'e') + '{a'}{e'}i' (<- 'a') + '{e'}i' (delete) + '{e'}{e'}' (<- 'e') + 
'{e'}' (delete) + ) + ) + + define sing_owner as ( + [substring] R1 among( + '{u"}nk' 'unk' (delete) + '{a'}nk' (<- 'a') + '{e'}nk' (<- 'e') + 'nk' (delete) + '{a'}juk' (<- 'a') + '{e'}j{u"}k' (<- 'e') + 'juk' 'j{u"}k' (delete) + 'uk' '{u"}k' (delete) + 'em' 'om' 'am' (delete) + '{a'}m' (<- 'a') + '{e'}m' (<- 'e') + 'm' (delete) + 'od' 'ed' 'ad' '{o"}d' (delete) + '{a'}d' (<- 'a') + '{e'}d' (<- 'e') + 'd' (delete) + 'ja' 'je' (delete) + 'a' 'e' 'o' (delete) + '{a'}' (<- 'a') + '{e'}' (<- 'e') + ) + ) + + define plur_owner as ( + [substring] R1 among( + 'jaim' 'jeim' (delete) + '{a'}im' (<- 'a') + '{e'}im' (<- 'e') + 'aim' 'eim' (delete) + 'im' (delete) + 'jaid' 'jeid' (delete) + '{a'}id' (<- 'a') + '{e'}id' (<- 'e') + 'aid' 'eid' (delete) + 'id' (delete) + 'jai' 'jei' (delete) + '{a'}i' (<- 'a') + '{e'}i' (<- 'e') + 'ai' 'ei' (delete) + 'i' (delete) + 'jaink' 'jeink' (delete) + 'eink' 'aink' (delete) + '{a'}ink' (<- 'a') + '{e'}ink' (<- 'e') + 'ink' + 'jaitok' 'jeitek' (delete) + 'aitok' 'eitek' (delete) + '{a'}itok' (<- 'a') + '{e'}itek' (<- 'e') + 'itek' (delete) + 'jeik' 'jaik' (delete) + 'aik' 'eik' (delete) + '{a'}ik' (<- 'a') + '{e'}ik' (<- 'e') + 'ik' (delete) + ) + ) +) + +define stem as ( + do mark_regions + backwards ( + do instrum + do case + do case_special + do case_other + do factive + do owned + do sing_owner + do plur_owner + do plural + ) +) diff --git a/contrib/snowball/algorithms/indonesian.sbl b/contrib/snowball/algorithms/indonesian.sbl new file mode 100644 index 0000000..ac0ee36 --- /dev/null +++ b/contrib/snowball/algorithms/indonesian.sbl @@ -0,0 +1,192 @@ +// An implementation of the "Porter Stemmer for Bahasa Indonesia" from: +// http://www.illc.uva.nl/Research/Publications/Reports/MoL-2003-02.text.pdf + +integers ( + // The paper defines measure as the number of vowels in the word. We + // count this initially, then adjust the count each time we remove a + // prefix or suffix. 
+ measure + + // Numeric code for the type of prefix removed: + // + // 0 other/none + // 1 'di' or 'meng' or 'ter' + // 2 'per' + // 3 'ke' or 'peng' + // 4 'ber' + // + // Some of these have variant forms, so e.g. "meng" includes "men", "me", + // "meny", "mem". + // + // Note that the value of prefix is only used in remove_suffix (and + // routines it calls) so we don't need to worry about + // remove_second_order_prefix overwriting a value of prefix set by + // remove_first_order_prefix since remove_suffix gets called between + // the two. + prefix +) + +groupings ( vowel ) + +routines ( + remove_particle + remove_possessive_pronoun + remove_first_order_prefix + remove_second_order_prefix + remove_suffix + KER + SUFFIX_KAN_OK + SUFFIX_AN_OK + SUFFIX_I_OK + VOWEL +) + +externals ( stem ) + +stringescapes {} + +backwardmode ( + + define remove_particle as ( + [substring] among ( + 'kah' 'lah' 'pun' (delete $measure-=1) + ) + ) + + define remove_possessive_pronoun as ( + [substring] among ( + 'ku' 'mu' 'nya' (delete $measure-=1) + ) + ) + + // prefix not in {ke, peng, per} + define SUFFIX_KAN_OK as ( + // On page 29, the example "kompas Q.31" says "Both Nazief and Porter + // stemmer converted the word peledakan (blast, explotion) to ledak (to + // blast, to explode)". However, the algorithm as described doesn't + // behave in this way - grammatically the prefix pe- occurs as a + // variation of both the first-order derivational prefix peng- and the + // second-order derivational prefix per-, but table 2.5 doesn't include + // "pe", only table 2.6 does, so "peledakan" is handled (incorrectly) + // as having prefix "per" not "peng", and so we remove derivational + // suffix "kan" rather than "an" to give stem leda. (Porter-style + // stemmers remove the longest suffix they can amongst those available, + // which this paper notes in the last paragraph on page 15). 
+ // + // We resolve this by amending the condition on suffix "kan" to + // "prefix ∉ {ke, peng, per}", which seems to make the stemmer's + // behaviour match all the examples in the paper except for one: + // "perbaikan" is shown in table 3.4 as stemming to "bai", but with + // this change it now stems to "baik". The table notes that "baik" is + // the actual root so this deviation is an improvement. In a sample + // vocabulary derived from the most common words in id.wikipedia.org, + // this change only affects 0.12% of words (76 out of 64,587, including + // "peledakan" and "perbaikan"). + $prefix != 3 and $prefix != 2 + ) + + // prefix not in {di, meng, ter} + define SUFFIX_AN_OK as ( $prefix != 1 ) + + define SUFFIX_I_OK as ( + // prefix not in {ke, peng, ber} + $prefix <= 2 + + // The rest of the condition from the paper is: + // V|K...c₁c₁, c₁ ≠ s, c₂ ≠ i + // + // The meaning of this is unclear in several ways, and none of the + // examples given of the stemmer's behaviour in the paper help to + // resolve these issues. + // + // Notice that c₂ isn't actually used - the most obvious explanation + // seems to be that "c₁c₁" should read "c₁c₂", or maybe "c₂c₁". + // + // Elsewhere the paper defines V... as meaning "the stem starts with + // a vowel" and K... as meaning "the stem starts with a consonant". + // + // In other places where it says X|Y... it seems the | binds more + // tightly, so it's (V|K)...cᵢcⱼ not V|(K...cᵢcⱼ). That seems a bit + // odd as the first letter must be either a vowel or a consonant, so + // that really just means "ends cᵢcⱼ". However, nowhere in the paper + // uses or defines a notation such as ...X, which may explain this + // seemingly redundant way of specifying this. + // + // The conditions elsewhere on prefix removal (e.g. V...) are clearly + // on the stem left after the prefix is removed. 
None of the other + // rules for suffix removal have conditions on the stem, but for + // consistency with the prefix rules we might expect that the cᵢcⱼ + // test is on what's left *after* removing the "i" suffix. + // + // However, studying Indonesian wordlists and discussion with a native + // speaker leads us to conclude that the purpose of this check is to + // protect words of foreign origin (e.g. "televisi", "organisasi", + // "komunikasi") from stemming, and the common feature of these is + // that the word ends "-si", so we conclude that the condition here + // should be read as "word does not end -si", and this is what we + // have implemented. + not 's' + ) + + define remove_suffix as ( + [substring] among ( + 'kan' SUFFIX_KAN_OK 'an' SUFFIX_AN_OK 'i' SUFFIX_I_OK + (delete $measure-=1) + ) + ) +) + +define vowel 'aeiou' + +define VOWEL as ( vowel ) + +define KER as ( non-vowel 'er' ) + +define remove_first_order_prefix as ( + [substring] among ( + 'di' 'meng' 'men' 'me' 'ter' (delete $prefix=1 $measure-=1) + 'ke' 'peng' 'pen' (delete $prefix=3 $measure-=1) + 'meny' VOWEL ($prefix=1 <-'s' $measure-=1) + 'peny' VOWEL ($prefix=3 <-'s' $measure-=1) + 'mem' ($prefix=1 $measure-=1 vowel and <-'p' or delete) + 'pem' ($prefix=3 $measure-=1 vowel and <-'p' or delete) + ) +) + +define remove_second_order_prefix as ( + // The paper has the condition on removal of prefix "bel" and "pel" as + // just "ajar" not "ajar..." but it seems that the latter must be what + // is intended so that e.g. "pelajaran" stems to "ajar" not "lajar". + // This change only affects a very small number of words (11 out of + // 64,587) and only for the better. 
+ [substring] among ( + 'per' 'pe' (delete $prefix=2 $measure-=1) + 'pelajar' (<-'ajar' $measure-=1) + 'ber' (delete $prefix=4 $measure-=1) + 'belajar' (<-'ajar' $prefix=4 $measure-=1) + 'be' KER (delete $prefix=4 $measure-=1) + ) +) + +define stem as ( + $measure = 0 + do ( repeat ( gopast vowel $measure+=1 ) ) + $measure > 2 + $prefix = 0 + backwards ( + do remove_particle + $measure > 2 + do remove_possessive_pronoun + ) + $measure > 2 + test ( + remove_first_order_prefix + do ( + test ($measure > 2 backwards remove_suffix) + $measure > 2 remove_second_order_prefix + ) + ) or ( + do remove_second_order_prefix + do ($measure > 2 backwards remove_suffix) + ) +) diff --git a/contrib/snowball/algorithms/irish.sbl b/contrib/snowball/algorithms/irish.sbl new file mode 100644 index 0000000..0b1288a --- /dev/null +++ b/contrib/snowball/algorithms/irish.sbl @@ -0,0 +1,151 @@ +routines ( + R1 R2 RV + initial_morph + mark_regions + noun_sfx + deriv + verb_sfx +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v ) + +stringescapes {} + +/* Accented characters */ + +stringdef a' '{U+00E1}' // a-acute +stringdef e' '{U+00E9}' // e-acute +stringdef i' '{U+00ED}' // i-acute +stringdef o' '{U+00F3}' // o-acute +stringdef u' '{U+00FA}' // u-acute + +define v 'aeiou{a'}{e'}{i'}{o'}{u'}' + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + gopast v setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define initial_morph as ( + [substring] among ( + 'h-' 'n-' 't-' //nAthair -> n-athair, but alone are problematic + (delete) + + // verbs + 'd{'}' + (delete) + 'd{'}fh' + (<- 'f') + // other contractions + 'm{'}' 'b{'}' + (delete) + + 'sh' + (<- 's') + + 'mb' + (<- 'b') + 'gc' + (<- 'c') + 'nd' + (<- 'd') + 'bhf' + (<- 'f') + 'ng' + (<- 'g') + 'bp' + (<- 'p') + 'ts' + (<- 's') + 'dt' + (<- 't') + + // Lenition + 'bh' + (<- 'b') + 'ch' + (<- 'c') + 'dh' + (<- 'd') + 'fh' + (<- 'f') + 
'gh' + (<- 'g') + 'mh' + (<- 'm') + 'ph' + (<- 'p') + 'th' + (<- 't') + ) +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define noun_sfx as ( + [substring] among ( + 'amh' 'eamh' 'abh' 'eabh' + 'aibh' 'ibh' 'aimh' 'imh' + 'a{i'}ocht' '{i'}ocht' 'a{i'}ochta' '{i'}ochta' + (R1 delete) + 'ire' 'ir{i'}' 'aire' 'air{i'}' + (R2 delete) + ) + ) + define deriv as ( + [substring] among ( + 'acht' 'eacht' 'ach' 'each' 'eacht{u'}il' 'eachta' 'acht{u'}il' 'achta' + (R2 delete) //siopadóireacht -> siopadóir but not poblacht -> pobl + 'arcacht' 'arcachta{i'}' 'arcachta' + (<- 'arc') // monarcacht -> monarc + 'gineach' 'gineas' 'ginis' + (<- 'gin') + 'grafa{i'}och' 'grafa{i'}ocht' 'grafa{i'}ochta' 'grafa{i'}ochta{i'}' + (<- 'graf') + 'paite' 'patach' 'pataigh' 'patacha' + (<- 'paite') + '{o'}ideach' '{o'}ideacha' '{o'}idigh' + (<- '{o'}id') + ) + ) + define verb_sfx as ( + [substring] among ( + 'imid' 'aimid' '{i'}mid' 'a{i'}mid' + 'faidh' 'fidh' + (RV delete) + 'ain' + 'eadh' 'adh' + '{a'}il' + 'tear' 'tar' + (R1 delete) + ) + ) +) + +define stem as ( + do initial_morph + do mark_regions + backwards ( + do noun_sfx + do deriv + do verb_sfx + ) +) diff --git a/contrib/snowball/algorithms/italian.sbl b/contrib/snowball/algorithms/italian.sbl new file mode 100644 index 0000000..bf0c161 --- /dev/null +++ b/contrib/snowball/algorithms/italian.sbl @@ -0,0 +1,195 @@ + +routines ( + prelude postlude mark_regions + RV R1 R2 + attached_pronoun + standard_suffix + verb_suffix + vowel_suffix +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v AEIO CG ) + +stringescapes {} + +/* special characters */ + +stringdef a' '{U+00E1}' +stringdef a` '{U+00E0}' +stringdef e' '{U+00E9}' +stringdef e` '{U+00E8}' +stringdef i' '{U+00ED}' +stringdef i` '{U+00EC}' +stringdef o' '{U+00F3}' +stringdef o` '{U+00F2}' +stringdef u' '{U+00FA}' +stringdef u` '{U+00F9}' + +define v 'aeiou{a`}{e`}{i`}{o`}{u`}' + +define prelude as ( + 
test repeat ( + [substring] among( + '{a'}' (<- '{a`}') + '{e'}' (<- '{e`}') + '{i'}' (<- '{i`}') + '{o'}' (<- '{o`}') + '{u'}' (<- '{u`}') + 'qu' (<- 'qU') + '' (next) + ) + ) + repeat goto ( + v [ ('u' ] v <- 'U') or + ('i' ] v <- 'I') + ) +) + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + + [substring] among( + 'I' (<- 'i') + 'U' (<- 'u') + '' (next) + ) + +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define attached_pronoun as ( + [substring] among( + 'ci' 'gli' 'la' 'le' 'li' 'lo' + 'mi' 'ne' 'si' 'ti' 'vi' + // the compound forms are: + 'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene' + 'mela' 'mele' 'meli' 'melo' 'mene' + 'tela' 'tele' 'teli' 'telo' 'tene' + 'cela' 'cele' 'celi' 'celo' 'cene' + 'vela' 'vele' 'veli' 'velo' 'vene' + ) + among( (RV) + 'ando' 'endo' (delete) + 'ar' 'er' 'ir' (<- 'e') + ) + ) + + define standard_suffix as ( + [substring] among( + + 'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo' + 'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti' + 'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente' + 'atrice' 'atrici' + 'ante' 'anti' // Note 1 + ( R2 delete ) + 'azione' 'azioni' 'atore' 'atori' + ( R2 delete + try ( ['ic'] R2 delete ) + ) + 'logia' 'logie' + ( R2 <- 'log' ) + 'uzione' 'uzioni' 'usione' 'usioni' + ( R2 <- 'u' ) + 'enza' 'enze' + ( R2 <- 'ente' ) + 'amento' 'amenti' 'imento' 'imenti' + ( RV delete ) + 'amente' ( + R1 delete + try ( + [substring] R2 delete among( + 'iv' ( ['at'] R2 delete ) + 'os' 'ic' 'abil' + ) + ) + ) + 'it{a`}' ( + R2 delete + try ( + [substring] among( + 'abil' 'ic' 'iv' (R2 delete) + ) + ) + ) + 'ivo' 'ivi' 'iva' 'ive' ( + R2 delete + try ( 
['at'] R2 delete ['ic'] R2 delete ) + ) + ) + ) + + define verb_suffix as setlimit tomark pV for ( + [substring] among( + 'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi' + 'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate' + 'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai' + 'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo' + 'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete' + 'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo' + 'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei' + 'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono' + 'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita' + 'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo' + 'ono' 'uta' 'ute' 'uti' 'uto' + + 'ar' 'ir' // but 'er' is problematical + (delete) + ) + ) + + define AEIO 'aeio{a`}{e`}{i`}{o`}' + define CG 'cg' + + define vowel_suffix as ( + try ( + [AEIO] RV delete + ['i'] RV delete + ) + try ( + ['h'] CG RV delete + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards ( + do attached_pronoun + do (standard_suffix or verb_suffix) + do vowel_suffix + ) + do postlude +) + +/* + Note 1: additions of 15 Jun 2005 +*/ + diff --git a/contrib/snowball/algorithms/kraaij_pohlmann.sbl b/contrib/snowball/algorithms/kraaij_pohlmann.sbl new file mode 100644 index 0000000..409bf0f --- /dev/null +++ b/contrib/snowball/algorithms/kraaij_pohlmann.sbl @@ -0,0 +1,240 @@ +strings ( ch ) +integers ( p1 p2 ) +booleans ( Y_found stemmed GE_removed ) + +routines ( + + R1 R2 + C V VX + lengthen_V + Step_1 Step_2 Step_3 Step_4 Step_7 + Step_6 Step_1c + Lose_prefix + Lose_infix + measure +) + +externals ( stem ) + +groupings ( v v_WX AOU AIOU ) + +stringescapes {} + +define v 'aeiouy' +define v_WX v + 'wx' +define AOU 'aou' +define AIOU 'aiou' + +backwardmode ( + + define R1 as ($p1 <= cursor) + define R2 as ($p2 <= cursor) + + define V as test (v or 'ij') + define VX as test (next v or 'ij') + define C 
as test (not 'ij' non-v) + /* lengthen_V: after matching a non-v_WX character, brackets a vowel from AOU (or an 'e' that passes the extra tests written below), copies that slice into ch and inserts ch; the whole body is wrapped in "do". */ + define lengthen_V as do ( + non-v_WX [ (AOU] test (non-v or atlimit)) or + ('e'] test (non-v or atlimit + not AIOU + not (next AIOU non-v))) + ->ch insert ch + ) + /* Step_1: longest-match lookup over the endings apostrophe-s, s, ies, es, aus, en, nde; each ending runs the guarded action written after it. */ + define Step_1 as + ( + [substring] among ( + + '{'}s' (delete) + 's' (R1 not ('t' R1) C delete) + 'ies' (R1 <-'ie') + 'es' + (('ar' R1 C ] delete lengthen_V) or + ('er' R1 C ] delete) or + (R1 C <-'e')) + + 'aus' (R1 V <-'au') + 'en' (('hed' R1 ] <-'heid') or + ('nd' delete) or + ('d' R1 C ] delete) or + ('i' or 'j' V delete) or + (R1 C delete lengthen_V)) + 'nde' (<-'nd') + ) + ) + /* Step_2: endings je, ge, lijke, ische, de, te, se, re, le, ene, ieve. For 'je' the alternatives are tried in order; each re-sets ] after matching additional text before deleting or replacing the slice. */ + define Step_2 as + ( + [substring] among ( + 'je' (('{'}t' ] delete) or + ('et' ] R1 C delete) or + ('rnt' ] <-'rn') or + ('t' ] R1 VX delete) or + ('ink' ] <-'ing') or + ('mp' ] <-'m') or + ('{'}' ] R1 delete) or + (] R1 C delete)) + 'ge' (R1 <-'g') + 'lijke'(R1 <-'lijk') + 'ische'(R1 <-'isch') + 'de' (R1 C delete) + 'te' (R1 <-'t') + 'se' (R1 <-'s') + 're' (R1 <-'r') + 'le' (R1 delete attach 'l' lengthen_V) + 'ene' (R1 C delete attach 'en' lengthen_V) + 'ieve' (R1 C <-'ief') + ) + ) + /* Step_3: endings atie, iteit, heid, sel, ster, rder, ing, isme, erij, arij, fie, gie, tst, dst. Entries written without an action ('heid' 'sel' and 'ing' 'isme') share the action of the next entry that has one. */ + define Step_3 as + ( + [substring] among ( + 'atie' (R1 <-'eer') + 'iteit' (R1 delete lengthen_V) + 'heid' + 'sel' + 'ster' (R1 delete) + 'rder' (<-'r') + 'ing' + 'isme' + 'erij' (R1 delete lengthen_V) + 'arij' (R1 C <-'aar') + 'fie' (R2 delete attach 'f' lengthen_V) + 'gie' (R2 delete attach 'g' lengthen_V) + 'tst' (R1 C <-'t') + 'dst' (R1 C <-'d') + ) + ) + /* Step_4: tries the first among table and, if that fails, falls back (or) to the second table holding iger / igst / ig. */ + define Step_4 as + ( + ( [substring] among ( + 'ioneel' (R1 <-'ie') + 'atief' (R1 <-'eer') + 'baar' (R1 delete) + 'naar' (R1 V <-'n') + 'laar' (R1 V <-'l') + 'raar' (R1 V <-'r') + 'tant' (R1 <-'teer') + 'lijker' + 'lijkst' (R1 <-'lijk') + 'achtig' + 'achtiger' + 'achtigst'(R1 delete) + 'eriger' + 'erigst' + 'erig' + 'end' (R1 C delete lengthen_V) + ) + ) + or + ( [substring] among ( + 'iger' + 'igst' + 'ig' (R1 C delete lengthen_V) + ) + ) + ) + /* Step_7: rewrites a trailing kt, ft or pt to k, f or p. */ + define Step_7 as + ( + [substring] among ( + 'kt' (<-'k') + 'ft' (<-'f') + 'pt' (<-'p') + ) + ) + + define 
Step_6 as + ( + [substring] among ( + 'bb' (<-'b') + 'cc' (<-'c') + 'dd' (<-'d') + 'ff' (<-'f') + 'gg' (<-'g') + 'hh' (<-'h') + 'jj' (<-'j') + 'kk' (<-'k') + 'll' (<-'l') + 'mm' (<-'m') + 'nn' (<-'n') + 'pp' (<-'p') + 'qq' (<-'q') + 'rr' (<-'r') + 'ss' (<-'s') + 'tt' (<-'t') + 'vv' (<-'v') + 'ww' (<-'w') + 'xx' (<-'x') + 'zz' (<-'z') + 'v' (<-'f') + 'z' (<-'s') + ) + ) + + define Step_1c as + ( + [substring] among ( (R1 C) + 'd' (not ('n' R1) delete) + 't' (not ('h' R1) delete) + ) + ) +) + +define Lose_prefix as ( + ['ge'] test hop 3 (goto v goto non-v) + set GE_removed + delete +) + +define Lose_infix as ( + next + gopast (['ge']) test hop 3 (goto v goto non-v) + set GE_removed + delete +) + +define measure as ( + $p1 = limit + $p2 = limit + do( + repeat non-v atleast 1 ('ij' or v) non-v setmark p1 + repeat non-v atleast 1 ('ij' or v) non-v setmark p2 + ) + +) +define stem as ( + + unset Y_found + unset stemmed + do ( ['y'] <-'Y' set Y_found ) + do repeat(goto (v ['y'])<-'Y' set Y_found ) + + measure + + backwards ( + do (Step_1 set stemmed ) + do (Step_2 set stemmed ) + do (Step_3 set stemmed ) + do (Step_4 set stemmed ) + ) + unset GE_removed + do (Lose_prefix and measure) + backwards ( + do (GE_removed Step_1c) + ) + unset GE_removed + do (Lose_infix and measure) + backwards ( + do (GE_removed Step_1c) + ) + backwards ( + do (Step_7 set stemmed ) + do (stemmed or GE_removed Step_6) + ) + do(Y_found repeat(goto (['Y']) <-'y')) +) + diff --git a/contrib/snowball/algorithms/lithuanian.sbl b/contrib/snowball/algorithms/lithuanian.sbl new file mode 100644 index 0000000..ff7fdb9 --- /dev/null +++ b/contrib/snowball/algorithms/lithuanian.sbl @@ -0,0 +1,373 @@ +externals ( stem ) + +// escape symbols for substituting lithuanian characters +stringescapes { } + +/* Special characters in Unicode Latin Extended-A */ +// ' nosine +stringdef a' '{U+0105}' // ą a + ogonek +stringdef e' '{U+0119}' // ę e + ogonek +stringdef i' '{U+012F}' // į i + ogonek +stringdef u' 
'{U+0173}' // ų u + ogonek + +// . taskas +stringdef e. '{U+0117}' // ė e + dot + +// - ilgoji +stringdef u- '{U+016B}' // ū u + macron + +// * varnele +stringdef c* '{U+010D}' // č c + caron (haček) +stringdef s* '{U+0161}' // š s + caron (haček) +stringdef z* '{U+017E}' // ž z + caron (haček) + +// [C](VC)^m[V|C] +// definitions of variables for +// p1 - position of m = 0 +integers ( p1 ) + +// groupings +// v - lithuanian vowels +groupings ( v ) + +// v - all lithuanian vowels +define v 'aeiyou{a'}{e'}{i'}{u'}{e.}{u-}' + +// all lithuanian stemmer routines: 4 steps +routines ( + step2 R1 step1 fix_chdz fix_gd fix_conflicts +) + +backwardmode ( + + define R1 as $p1 <= cursor + define step1 as ( + setlimit tomark p1 for ([substring]) R1 among ( + // Daiktavardžiai (Nouns) + // I linksniuotė (declension I) + 'as' 'ias' 'is' 'ys' // vyras, kelias, brolis, gaidys + 'o' 'io' // vyro, kelio + 'ui' 'iui' // vyrui, keliui + '{a'}' 'i{a'}' '{i'}' // vyrą, kelią, brolį + 'u' 'iu' // vyru, keliu + 'e' 'yje' // vyre, kelyje + 'y' 'au' 'i' // kely, brolau, broli, + 'an' // nusižengiman + + 'ai' 'iai' // vyrai, keliai + '{u'}' 'i{u'}' // vyrų, kelių + 'ams' 'am' // vyrams, vyram + 'iams' 'iam' // broliams, broliam + 'us' 'ius' // vyrus, brolius + 'ais' 'iais' // vyrais, keliais + 'uose' 'iuose' 'uos' 'iuos' // vyruose, keliuose, vyruos, keliuos + 'uosna' 'iuosna' // vyruosna, keliuosna + 'ysna' // žutysna + + 'asis' 'aisi' // sukimasis, sukimaisi + 'osi' '{u'}si' // sukimosi, sukimųsi + 'uisi' // sukimuisi + '{a'}si' // sukimąsi + 'usi' // sukimusi + 'esi' // sukimesi + + 'uo' // mėnuo + + + // II linksniuote (declension II) + 'a' 'ia' // galva, vysnios + 'os' 'ios' // galvos, vysnios + 'oj' 'oje' 'ioje' // galvoje, vysnioje + 'osna' 'iosna' // galvosna, vyšniosna + 'om' 'oms' 'ioms' // galvoms, vysnioms + 'omis' 'iomis' // galvomis, vysniomis + 'ose' 'iose' // galvose, vysniose + 'on' 'ion' // galvon, vyšnion + + + // III linksniuote (declension III) + '{e.}' // gervė + 
'{e.}s' // gervės + 'ei' // gervei + '{e'}' // gervę + '{e.}j' '{e.}je' // gervėj, gervėje + '{e.}ms' // gervėms + 'es' // gerves + '{e.}mis' // gervėmis + '{e.}se' // gervėse + '{e.}sna' // gervėsna + '{e.}n' // žydaitėn + + + // IV linksniuote (declension IV) + 'aus' 'iaus' // sūnaus, skaičiaus + 'umi' 'iumi' // sūnumi, skaičiumi + 'uje' 'iuje' // sūnuje, skaičiuje + 'iau' // skaičiau + + '{u-}s' // sūnūs + 'ums' // sūnums + 'umis' // sūnumis + 'un' 'iun' // sūnun, administratoriun + + + // V linksniuote (declension V) + 'ies' 'ens' 'enio' 'ers' // avies, vandens, sesers + 'eniui' 'eriai' // vandeniui, eriai + 'en{i'}' 'er{i'}' // vandenį, seserį + 'imi' 'eniu' 'erimi' 'eria' // avimi, vandeniu, seserimi, seseria + 'enyje' 'eryje' // vandenyje, seseryje + 'ie' 'enie' 'erie' // avie, vandenie, seserie + + 'enys' 'erys' // vandenys, seserys + // 'en{u'}' konfliktas su 'žandenų' 'antenų' + 'er{u'}' // seserų + 'ims' 'enims' 'erims' // avims, vandenims, seserims + 'enis' // vandenis + 'imis' // žebenkštimis + 'enimis' // vandenimis + 'yse' 'enyse' 'eryse' // avyse, vandenyse, seseryse + + + // Būdvardžiai (Adjectives) + // (i)a linksniuotė + 'iem' 'iems' // geriem, geriems + 'ame' 'iame' // naujame, mediniame + + + // Veiksmažodžiai (Verbs) + // Tiesioginė nuosaka (indicative mood) + // esamasis laikas (present tense) + // (i)a asmenuotė (declension (i)a) + 'uosi' 'iuosi' // dirbuosi, traukiuosi + 'iesi' // dirbiesi + 'asi' 'iasi' // dirbasi, traukiasi + 'am{e.}s' 'iam{e.}s' // dirbamės, traukiamės + 'at' 'ate' 'iat' 'iate' // dirbat, dirbate, ariat, traukiate + 'at{e.}s' 'iat{e.}s' // dirbatės, traukiatės + + // i asmenuotė (declension i) + 'isi' // tikisi + 'im' // mylim + // 'ime' konfliktas su daiktavardžiu vietininku, pvz. 'gėrime' + 'im{e.}s' // tikimės + 'it' 'ite' // mylit, mylite, tikitės + // 'it{e.}s' konfliktas su priesaga ir dgs. vardininko galūne -ait-ės pvz. 
žydaitės + + // o asmenuotė (declension o) + 'ome' // mokome + 'ot' 'ote' // mokot, mokote + + // būtasis laikas + // o asmenuotė (declension o) + '{e.}jo' '{e.}josi' // tikėjo, tikėjosi + 'ot{e.}s' // tikėjotės/bijotės + + // ė asmenuotė (declension ė) + 'eisi' // mokeisi + '{e.}si' // mokėsi + '{e.}m' '{e.}me' // mokėm, mokėme + '{e.}m{e.}s' // mokėmės + '{e.}t' '{e.}te' // mokėt, mokėte + '{e.}t{e.}s' // mokėtės + + // būtasis dažninis laikas (frequentative past tense) + 'ausi' // mokydavausi + 'om{e.}s' // mokydavomės/bijomės + + + // būsimasis laikas (future tense) + 'siu' 'siuosi' // dirbsiu, mokysiuosi + 'si' 'siesi' // dirbsi, dirbsiesi + 's' 'ysis' // dirbs, mokysis + 'sim' 'sime' // dirbsim, dirbsime + 'sit' 'site' // gersit, gersite + + // tariamoji nuosaka (subjunctive mood) + '{c*}iau' '{c*}iausi' // dirbčiau + 'tum' 'tumei' // dirbtum, dirbtumei + 'tumeis' 'tumeisi' // mokytumeis, mokytumeisi + // 't{u'}' nes blogai batutų -> batų + 't{u'}si' // mokytųsi + // 'tume' konfliktas su 'šventume' + 'tum{e.}m' // dirbtumėm + 'tum{e.}me' // dirbtumėme + 'tum{e.}m{e.}s' // mokytumėmės + 'tute' 'tum{e.}t' // dirbtute, dirbtumėt + 'tum{e.}te' // dirbtumėte + 'tum{e.}t{e.}s' // mokytumėtės + + // liepiamoji nuosaka (imperative mood) + 'k' 'ki' // dirbk, dirbki, mokykis + // 'kis' konfliktas viln-išk-is + // 'kime' konfliktas, nes pirkime + 'kim{e.}s' // mokykimės + + // bendratis (infinitive) + 'uoti' 'iuoti' // meluoti, dygsniuoti + 'auti' 'iauti' // draugauti, girtuokliauti + 'oti' 'ioti' // dovanoti, meškerioti + '{e.}ti' // auklėti + 'yti' // akyti + 'inti' // auginti + 'in{e.}ti' // blusinėti + 'enti' // gyventi + 'tel{e.}ti' // bumbtelėti + 'ter{e.}ti' // bumbterėti + + 'ti' // skalbti + // 'tis' konfliktas, nes rytme-tis -> rytme + + // dalyviai (participles) + '{a'}s' 'i{a'}s' '{i'}s' // dirbąs, žaidžiąs, gulįs + 't{u'}s' // suktųs -> suk + 'sim{e.}s' // suksimės + 'sit{e.}s' // suksitės + 'kite' // supkite + ) + + delete + ) + + define step2 as repeat ( 
+ setlimit tomark p1 for ([substring]) among ( + // daiktavardziu priesagos (Noun suffixes) + + // budvardziu priesagos (Adjective suffixes) + // 'in' // konfliktas su 'augintinis' ir 'akiniais' // lauk-in-is + 'ing' // tvark-ing-as + 'i{s*}k' // lenk-išk-as + '{e.}t' // dem-ėt-as + 'ot' // garban-ot-as + 'uot' 'iuot' // lang-uot-as, akin-iuot-as + // 'tin', nes augintinis // dirb-tin-is + // 'ut', nes batutas, degutas etc. // maž-ut-is + 'yt' // maž-yt-is + 'iuk' // maž-iuk-as + 'iul' // maž-ul-is + '{e.}l' // maž-ėl-is + 'yl' // maž-yl-is + 'u{c*}iuk' // maž-učiuk-as + 'uliuk' // maž-uliuk-as + 'ut{e.}ait' // maž-utėlait-is + 'ok' // did-ok-as + 'iok' // višč-iok-as + 'sv' '{s*}v' 'zgan' // sal-sv-as, pilk-šv-as, bal-zgan-as + 'op' 'iop' // dvej-op-as, viener-iop-as + 'ain' // apval-ain-as + 'yk{s*}t' 'yk{s*}{c*}' // ten-ykšt-is, vakar-ykšč-ias + + // laisniai + 'esn' // did-esn-is + 'aus' 'iaus' // nauj-aus-ias, ger-iaus-ias + + // ivardziuotiniai budvardziai (Pronominal adjectives) + // vyriska gimine (Male gender) + 'ias' // žaliasis + 'oj' 'ioj' // gerojo, žaliojo + 'aj' 'iaj' // gerajam, žaliajam + '{a'}j' 'i{a'}j' // garąjį, žaliąjį + 'uoj' 'iuoj' // geruoju, žaliuoju + 'iej' // gerieji + '{u'}j' 'i{u'}j' // gerųjų, žaliųjų + 'ies' // geriesiems + 'uos' 'iuos' // geruosius, žaliuosius + 'ais' 'iais' // geraisiais, žaliaisiais + + // moteriska gimine (Female gender) + 'os' 'ios' // gerosios, žaliosios + '{a'}s' 'i{a'}s' // gerąsios, žaliąsias + + // būtasis dažninis laikas (frequentative past tense) + 'dav' // ei-dav-o + + // dalyvių priesagos (particple suffix) + 'ant' 'iant' + 'int' // tur-int-is + '{e.}j' // tur-ėj-o + '{e'}' // + '{e.}j{e'}' + '{e'}s' // dirb-ęs-is + + 'siant' // dirb-siant + + // pusdalyviai (participle) + 'dam' // bėg-dam-as + + 'auj' // ūkinink-auj-a + 'jam' + 'iau' + 'am' // baiminim-ams-i + ) + + delete + ) + + define fix_conflicts as ( + [substring] among ( + // 'lietuvaite' -> 'lietuvaitė', konfliktas su 'myl-ite' + 'aite' 
(<-'ait{e.}') + // 'lietuvaitės' -> 'lietuvaitė', konfliktas su 'myl-itės' + 'ait{e.}s' (<-'ait{e.}') + + // ''ūs-uotės' -> 'ūs-uotė', konfliktas 'mokotės' + 'uot{e.}s' (<-'uot{e.}') + // ''ūs-uote' -> 'ūs-uotė', konfliktas 'mokote' + 'uote' (<-'uot{e.}') + + // 'žerėjime' -> 'žėrėjimas', konfliktas su 'žais-ime' + '{e.}jime' (<-'{e.}jimas') + + // 'žvilgesiu' -> 'žvilgesys', konfliktas su 'dirb-siu' + 'esiu' (<-'esys') + // 'duobkasiu' -> 'duobkasys', konfliktas su 'pakasiu' + 'asius' (<-'asys') + + // 'žioravime' -> 'žioravimas', konfliktas su 'myl-ime' + 'avime' (<-'avimas') + 'ojime' (<-'ojimas') + + // 'advokatės' -> 'advokatė', konfliktas su 'dirb-atės' + 'okat{e.}s' (<-'okat{e.}') + // 'advokate' -> 'advokatė', konfliktas su 'dirb-ate' + 'okate' (<-'okat{e.}') + ) + ) + + define fix_chdz as ( + [substring] among ( + '{c*}' (<-'t') + 'd{z*}' (<-'d') + ) + ) + + define fix_gd as ( + [substring] among ( + 'gd' (<-'g') + // '{e.}k' (<-'{e.}g') + ) + ) + +) + +define stem as ( + + $p1 = limit + + do ( + // priešdėlis 'a' ilgeniuose nei 6 raidės žodžiuose, pvz. 'a-liejus'. + try (test 'a' $(len > 6) hop 1) + + gopast v gopast non-v setmark p1 + ) + + backwards ( + do fix_conflicts + do step1 + do fix_chdz + do step2 + do fix_chdz + do fix_gd + ) + +) diff --git a/contrib/snowball/algorithms/lovins.sbl b/contrib/snowball/algorithms/lovins.sbl new file mode 100644 index 0000000..3f69f15 --- /dev/null +++ b/contrib/snowball/algorithms/lovins.sbl @@ -0,0 +1,208 @@ + +stringescapes {} + +routines ( + A B C D E F G H I J K L M N O P Q R S T U V W X Y Z AA BB CC + + endings + + undouble respell +) + +externals ( stem ) + +backwardmode ( + + /* Lovins' conditions A, B ... CC, as given in her Appendix B, where + a test for a two letter prefix ('test hop 2') is implicitly + assumed. Note that 'e' next 'u' corresponds to her u*e because + Snowball is scanning backwards. 
*/ + + define A as ( hop 2 ) + define B as ( hop 3 ) + define C as ( hop 4 ) + define D as ( hop 5 ) + define E as ( test hop 2 not 'e' ) + define F as ( test hop 3 not 'e' ) + define G as ( test hop 3 'f' ) + define H as ( test hop 2 't' or 'll' ) + define I as ( test hop 2 not 'o' not 'e' ) + define J as ( test hop 2 not 'a' not 'e' ) + define K as ( test hop 3 'l' or 'i' or ('e' next 'u') ) + define L as ( test hop 2 not 'u' not 'x' not ('s' not 'o') ) + define M as ( test hop 2 not 'a' not 'c' not 'e' not 'm' ) + define N as ( test hop 3 ( hop 2 not 's' or hop 2 ) ) + define O as ( test hop 2 'l' or 'i' ) + define P as ( test hop 2 not 'c' ) + define Q as ( test hop 2 test hop 3 not 'l' not 'n' ) + define R as ( test hop 2 'n' or 'r' ) + define S as ( test hop 2 'dr' or ('t' not 't') ) + define T as ( test hop 2 's' or ('t' not 'o') ) + define U as ( test hop 2 'l' or 'm' or 'n' or 'r' ) + define V as ( test hop 2 'c' ) + define W as ( test hop 2 not 's' not 'u' ) + define X as ( test hop 2 'l' or 'i' or ('e' next 'u') ) + define Y as ( test hop 2 'in' ) + define Z as ( test hop 2 not 'f' ) + define AA as ( test hop 2 among ( 'd' 'f' 'ph' 'th' 'l' 'er' 'or' + 'es' 't' ) ) + define BB as ( test hop 3 not 'met' not 'ryst' ) + define CC as ( test hop 2 'l' ) + + + /* The system of endings, as given in Appendix A. 
*/ + + define endings as ( + [substring] among( + 'alistically' B 'arizability' A 'izationally' B + + 'antialness' A 'arisations' A 'arizations' A 'entialness' A + + 'allically' C 'antaneous' A 'antiality' A 'arisation' A + 'arization' A 'ationally' B 'ativeness' A 'eableness' E + 'entations' A 'entiality' A 'entialize' A 'entiation' A + 'ionalness' A 'istically' A 'itousness' A 'izability' A + 'izational' A + + 'ableness' A 'arizable' A 'entation' A 'entially' A + 'eousness' A 'ibleness' A 'icalness' A 'ionalism' A + 'ionality' A 'ionalize' A 'iousness' A 'izations' A + 'lessness' A + + 'ability' A 'aically' A 'alistic' B 'alities' A + 'ariness' E 'aristic' A 'arizing' A 'ateness' A + 'atingly' A 'ational' B 'atively' A 'ativism' A + 'elihood' E 'encible' A 'entally' A 'entials' A + 'entiate' A 'entness' A 'fulness' A 'ibility' A + 'icalism' A 'icalist' A 'icality' A 'icalize' A + 'ication' G 'icianry' A 'ination' A 'ingness' A + 'ionally' A 'isation' A 'ishness' A 'istical' A + 'iteness' A 'iveness' A 'ivistic' A 'ivities' A + 'ization' F 'izement' A 'oidally' A 'ousness' A + + 'aceous' A 'acious' B 'action' G 'alness' A + 'ancial' A 'ancies' A 'ancing' B 'ariser' A + 'arized' A 'arizer' A 'atable' A 'ations' B + 'atives' A 'eature' Z 'efully' A 'encies' A + 'encing' A 'ential' A 'enting' C 'entist' A + 'eously' A 'ialist' A 'iality' A 'ialize' A + 'ically' A 'icance' A 'icians' A 'icists' A + 'ifully' A 'ionals' A 'ionate' D 'ioning' A + 'ionist' A 'iously' A 'istics' A 'izable' E + 'lessly' A 'nesses' A 'oidism' A + + 'acies' A 'acity' A 'aging' B 'aical' A + 'alist' A 'alism' B 'ality' A 'alize' A + 'allic'BB 'anced' B 'ances' B 'antic' C + 'arial' A 'aries' A 'arily' A 'arity' B + 'arize' A 'aroid' A 'ately' A 'ating' I + 'ation' B 'ative' A 'ators' A 'atory' A + 'ature' E 'early' Y 'ehood' A 'eless' A + 'elity' A 'ement' A 'enced' A 'ences' A + 'eness' E 'ening' E 'ental' A 'ented' C + 'ently' A 'fully' A 'ially' A 'icant' A + 'ician' A 'icide' A 'icism' A 
'icist' A + 'icity' A 'idine' I 'iedly' A 'ihood' A + 'inate' A 'iness' A 'ingly' B 'inism' J + 'inity'CC 'ional' A 'ioned' A 'ished' A + 'istic' A 'ities' A 'itous' A 'ively' A + 'ivity' A 'izers' F 'izing' F 'oidal' A + 'oides' A 'otide' A 'ously' A + + 'able' A 'ably' A 'ages' B 'ally' B + 'ance' B 'ancy' B 'ants' B 'aric' A + 'arly' K 'ated' I 'ates' A 'atic' B + 'ator' A 'ealy' Y 'edly' E 'eful' A + 'eity' A 'ence' A 'ency' A 'ened' E + 'enly' E 'eous' A 'hood' A 'ials' A + 'ians' A 'ible' A 'ibly' A 'ical' A + 'ides' L 'iers' A 'iful' A 'ines' M + 'ings' N 'ions' B 'ious' A 'isms' B + 'ists' A 'itic' H 'ized' F 'izer' F + 'less' A 'lily' A 'ness' A 'ogen' A + 'ward' A 'wise' A 'ying' B 'yish' A + + 'acy' A 'age' B 'aic' A 'als'BB + 'ant' B 'ars' O 'ary' F 'ata' A + 'ate' A 'eal' Y 'ear' Y 'ely' E + 'ene' E 'ent' C 'ery' E 'ese' A + 'ful' A 'ial' A 'ian' A 'ics' A + 'ide' L 'ied' A 'ier' A 'ies' P + 'ily' A 'ine' M 'ing' N 'ion' Q + 'ish' C 'ism' B 'ist' A 'ite'AA + 'ity' A 'ium' A 'ive' A 'ize' F + 'oid' A 'one' R 'ous' A + + 'ae' A 'al'BB 'ar' X 'as' B + 'ed' E 'en' F 'es' E 'ia' A + 'ic' A 'is' A 'ly' B 'on' S + 'or' T 'um' U 'us' V 'yl' R + '{'}s' A 's{'}' A + + 'a' A 'e' A 'i' A 'o' A + 's' W 'y' B + + (delete) + ) + ) + + /* Undoubling is rule 1 of appendix C. */ + + define undouble as ( + test substring among ('bb' 'dd' 'gg' 'll' 'mm' 'nn' 'pp' 'rr' 'ss' + 'tt') + [next] delete + ) + + /* The other appendix C rules can be done together. 
*/ + + define respell as ( + [substring] among ( + 'iev' (<-'ief') + 'uct' (<-'uc') + 'umpt' (<-'um') + 'rpt' (<-'rb') + 'urs' (<-'ur') + 'istr' (<-'ister') + 'metr' (<-'meter') + 'olv' (<-'olut') + 'ul' (not 'a' not 'i' not 'o' <-'l') + 'bex' (<-'bic') + 'dex' (<-'dic') + 'pex' (<-'pic') + 'tex' (<-'tic') + 'ax' (<-'ac') + 'ex' (<-'ec') + 'ix' (<-'ic') + 'lux' (<-'luc') + 'uad' (<-'uas') + 'vad' (<-'vas') + 'cid' (<-'cis') + 'lid' (<-'lis') + 'erid' (<-'eris') + 'pand' (<-'pans') + 'end' (not 's' <-'ens') + 'ond' (<-'ons') + 'lud' (<-'lus') + 'rud' (<-'rus') + 'her' (not 'p' not 't' <-'hes') + 'mit' (<-'mis') + 'ent' (not 'm' <-'ens') + /* 'ent' was 'end' in the 1968 paper - a typo. */ + 'ert' (<-'ers') + 'et' (not 'n' <-'es') + 'yt' (<-'ys') + 'yz' (<-'ys') + ) + ) +) + +define stem as ( + + backwards ( + do endings + do undouble + do respell + ) +) + diff --git a/contrib/snowball/algorithms/nepali.sbl b/contrib/snowball/algorithms/nepali.sbl new file mode 100644 index 0000000..d388748 --- /dev/null +++ b/contrib/snowball/algorithms/nepali.sbl @@ -0,0 +1,92 @@ +/* + * Authors: + * - Ingroj Shrestha <ing.stha@gmail.com>, Nepali NLP Group + * - Oleg Bartunov <obartunov@gmail.com>, Postgres Professional Ltd. 
+ * - Shreeya Singh Dhakal, Nepali NLP Group + */ + +routines ( + remove_category_1 + check_category_2 + remove_category_2 + remove_category_3 +) + +stringescapes {} + +stringdef dsc '{U+0901}' // DEVANAGARI_SIGN_CANDRABINDU +stringdef dsa '{U+0902}' // DEVANAGARI_SIGN_ANUSVARA +stringdef dli '{U+0907}' // DEVANAGARI_LETTER_I +stringdef dlii '{U+0908}' // DEVANAGARI_LETTER_II +stringdef dle '{U+090F}' // DEVANAGARI_LETTER_E +stringdef dlka '{U+0915}' // DEVANAGARI_LETTER_KA +stringdef dlkha '{U+0916}' // DEVANAGARI_LETTER_KHA +stringdef dlg '{U+0917}' // DEVANAGARI_LETTER_GA +stringdef dlc '{U+091B}' // DEVANAGARI_LETTER_CHA +stringdef dlta '{U+0924}' // DEVANAGARI_LETTER_TA +stringdef dltha '{U+0925}' // DEVANAGARI_LETTER_THA +stringdef dld '{U+0926}' // DEVANAGARI_LETTER_DA +stringdef dln '{U+0928}' // DEVANAGARI_LETTER_NA +stringdef dlpa '{U+092A}' // DEVANAGARI_LETTER_PA +stringdef dlpha '{U+092B}' // DEVANAGARI_LETTER_PHA +stringdef dlb '{U+092D}' // DEVANAGARI_LETTER_BHA +stringdef dlm '{U+092E}' // DEVANAGARI_LETTER_MA +stringdef dly '{U+092F}' // DEVANAGARI_LETTER_YA +stringdef dlr '{U+0930}' // DEVANAGARI_LETTER_RA +stringdef dll '{U+0932}' // DEVANAGARI_LETTER_LA +stringdef dlv '{U+0935}' // DEVANAGARI_LETTER_VA +stringdef dls '{U+0938}' // DEVANAGARI_LETTER_SA +stringdef dlh '{U+0939}' // DEVANAGARI_LETTER_HA +stringdef dvsaa '{U+093E}' // DEVANAGARI_VOWEL_SIGN_AA +stringdef dvsi '{U+093F}' // DEVANAGARI_VOWEL_SIGN_I +stringdef dvsii '{U+0940}' // DEVANAGARI_VOWEL_SIGN_II +stringdef dvsu '{U+0941}' // DEVANAGARI_VOWEL_SIGN_U +stringdef dvsuu '{U+0942}' // DEVANAGARI_VOWEL_SIGN_UU +stringdef dvse '{U+0947}' // DEVANAGARI_VOWEL_SIGN_E +stringdef dvsai '{U+0948}' // DEVANAGARI_VOWEL_SIGN_AI +stringdef dvso '{U+094B}' // DEVANAGARI_VOWEL_SIGN_O +stringdef dvsau '{U+094C}' // DEVANAGARI_VOWEL_SIGN_AU +stringdef dsv '{U+094D}' // DEVANAGARI_SIGN_VIRAMA + +externals ( stem ) +backwardmode ( + define remove_category_1 as( + [substring] among ( + 
'{dlm}{dvsaa}{dlr}{dsv}{dlpha}{dlta}' '{dld}{dsv}{dlv}{dvsaa}{dlr}{dvsaa}' '{dls}{dsc}{dlg}{dvsai}' '{dls}{dsa}{dlg}' + '{dls}{dsc}{dlg}' '{dll}{dvsaa}{dli}' '{dll}{dvsaa}{dlii}' '{dlpa}{dlc}{dvsi}' + '{dll}{dvse}' '{dlr}{dlta}' '{dlm}{dvsai}' '{dlm}{dvsaa}' + (delete) + '{dlka}{dvso}' '{dlka}{dvsaa}' '{dlka}{dvsi}' '{dlka}{dvsii}' '{dlka}{dvsai}'(('{dle}' or '{dvse}' ()) or delete) + ) + ) + + define check_category_2 as( + [substring] among( + '{dsc}' '{dsa}' '{dvsai}' + ) + ) + + define remove_category_2 as ( + [substring] among( + '{dsc}' '{dsa}' ('{dly}{dvsau}' or '{dlc}{dvsau}' or '{dln}{dvsau}' or '{dltha}{dvse}' delete) + '{dvsai}' ('{dlta}{dsv}{dlr}' delete) + ) + ) + + define remove_category_3 as( + [substring] among( + '{dltha}{dvsi}{dli}{dls}{dsv}' '{dlh}{dvsu}{dln}{dvse}{dlc}' '{dlh}{dvsu}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dls}{dsv}' '{dln}{dvse}{dlc}{dln}{dsv}' '{dli}{dle}{dlka}{dvsii}' '{dli}{dle}{dlka}{dvsaa}' '{dli}{dle}{dlka}{dvso}' '{dvsi}{dle}{dlka}{dvsii}' '{dvsi}{dle}{dlka}{dvsaa}' '{dvsi}{dle}{dlka}{dvso}' '{dli}{dlc}{dln}{dsv}' '{dvsi}{dlc}{dln}{dsv}' '{dli}{dlc}{dls}{dsv}' '{dvsi}{dlc}{dls}{dsv}' '{dle}{dlc}{dln}{dsv}' '{dvse}{dlc}{dln}{dsv}' '{dle}{dlc}{dls}{dsv}' '{dvse}{dlc}{dls}{dsv}' '{dlc}{dvsi}{dln}{dsv}' '{dlc}{dvse}{dls}{dsv}' '{dlc}{dsv}{dly}{dvsau}' '{dltha}{dvsi}{dln}{dsv}' '{dltha}{dvsi}{dly}{dvso}' '{dltha}{dvsi}{dly}{dvsau}' '{dltha}{dvsi}{dls}{dsv}' '{dltha}{dsv}{dly}{dvso}' '{dltha}{dsv}{dly}{dvsau}' '{dld}{dvsi}{dly}{dvso}' '{dld}{dvse}{dlkha}{dvsi}' '{dld}{dvse}{dlkha}{dvsii}' '{dll}{dvsaa}{dln}{dsv}' '{dlm}{dvsaa}{dltha}{dvsi}' '{dln}{dvse}{dlka}{dvsai}' '{dln}{dvse}{dlka}{dvsaa}' '{dln}{dvse}{dlka}{dvso}' '{dln}{dvse}{dlc}{dvsau}' '{dlh}{dvso}{dls}{dsv}' '{dli}{dln}{dsv}{dlc}' '{dvsi}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dvsu}' '{dli}{dlc}{dvsau}' '{dvsi}{dlc}{dvsau}' '{dli}{dls}{dsv}' '{dvsi}{dls}{dsv}' '{dvsi}{dly}{dvso}' '{dli}{dly}{dvso}' '{dle}{dlka}{dvsaa}' '{dvse}{dlka}{dvsaa}' '{dle}{dlka}{dvsii}' 
'{dvse}{dlka}{dvsii}' '{dle}{dlka}{dvsai}' '{dvse}{dlka}{dvsai}' '{dle}{dlka}{dvso}' '{dvse}{dlka}{dvso}' '{dle}{dlc}{dvsu}' '{dvse}{dlc}{dvsu}' '{dle}{dlc}{dvsau}' '{dvse}{dlc}{dvsau}' '{dlc}{dln}{dsv}' '{dlc}{dls}{dsv}' '{dltha}{dvsi}{dle}' '{dlpa}{dlr}{dsv}' '{dlb}{dly}{dvso}' '{dlh}{dlr}{dvsu}' '{dlh}{dlr}{dvsuu}' '{dvsi}{dld}{dvsaa}' '{dli}{dld}{dvsaa}' '{dvsi}{dld}{dvso}' '{dli}{dld}{dvso}' '{dvsi}{dld}{dvsai}' '{dli}{dld}{dvsai}' '{dln}{dvse}{dlc}' '{dli}{dlc}' '{dvsi}{dlc}' '{dle}{dlc}' '{dvse}{dlc}' '{dlc}{dvsu}' '{dlc}{dvse}' '{dlc}{dvsau}' '{dltha}{dvsii}' '{dltha}{dvse}' '{dld}{dvsaa}' '{dld}{dvsii}' '{dld}{dvsai}' '{dld}{dvso}' '{dln}{dvsu}' '{dln}{dvse}' '{dly}{dvso}' '{dly}{dvsau}' '{dlc}' + (delete) + ) + ) + +) + +define stem as ( + backwards ( + do remove_category_1 + do ( + repeat (do (check_category_2 and remove_category_2) remove_category_3) + ) + ) +) diff --git a/contrib/snowball/algorithms/norwegian.sbl b/contrib/snowball/algorithms/norwegian.sbl new file mode 100644 index 0000000..39f4aff --- /dev/null +++ b/contrib/snowball/algorithms/norwegian.sbl @@ -0,0 +1,80 @@ +routines ( + mark_regions + main_suffix + consonant_pair + other_suffix +) + +externals ( stem ) + +integers ( p1 x ) + +groupings ( v s_ending ) + +stringescapes {} + +/* special characters */ + +stringdef ae '{U+00E6}' +stringdef ao '{U+00E5}' +stringdef o/ '{U+00F8}' + +define v 'aeiouy{ae}{ao}{o/}' + +define s_ending 'bcdfghjlmnoprtvyz' + +define mark_regions as ( + + $p1 = limit + + test ( hop 3 setmark x ) + goto v gopast non-v setmark p1 + try ( $p1 < x $p1 = x ) +) + +backwardmode ( + + define main_suffix as ( + setlimit tomark p1 for ([substring]) + among( + + 'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar' + 'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens' + 'hetens' 'ers' 'ets' 'et' 'het' 'ast' + (delete) + 's' + (s_ending or ('k' non-v) delete) + 'erte' 'ert' + (<-'er') + ) + ) + + define consonant_pair as ( + test ( + setlimit tomark 
p1 for ([substring]) + among( + 'dt' 'vt' + ) + ) + next] delete + ) + + define other_suffix as ( + setlimit tomark p1 for ([substring]) + among( + 'leg' 'eleg' 'ig' 'eig' 'lig' 'elig' 'els' 'lov' 'elov' 'slov' + 'hetslov' + (delete) + ) + ) +) + +define stem as ( + + do mark_regions + backwards ( + do main_suffix + do consonant_pair + do other_suffix + ) +) diff --git a/contrib/snowball/algorithms/porter.sbl b/contrib/snowball/algorithms/porter.sbl new file mode 100644 index 0000000..9533b79 --- /dev/null +++ b/contrib/snowball/algorithms/porter.sbl @@ -0,0 +1,139 @@ +integers ( p1 p2 ) +booleans ( Y_found ) + +routines ( + shortv + R1 R2 + Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5a Step_5b +) + +externals ( stem ) + +groupings ( v v_WXY ) + +define v 'aeiouy' +define v_WXY v + 'wxY' + +backwardmode ( + + define shortv as ( non-v_WXY v non-v ) + + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define Step_1a as ( + [substring] among ( + 'sses' (<-'ss') + 'ies' (<-'i') + 'ss' () + 's' (delete) + ) + ) + + define Step_1b as ( + [substring] among ( + 'eed' (R1 <-'ee') + 'ed' + 'ing' ( + test gopast v delete + test substring among( + 'at' 'bl' 'iz' + (<+ 'e') + 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt' + // ignoring double c, h, j, k, q, v, w, and x + ([next] delete) + '' (atmark p1 test shortv <+ 'e') + ) + ) + ) + ) + + define Step_1c as ( + ['y' or 'Y'] + gopast v + <-'i' + ) + + define Step_2 as ( + [substring] R1 among ( + 'tional' (<-'tion') + 'enci' (<-'ence') + 'anci' (<-'ance') + 'abli' (<-'able') + 'entli' (<-'ent') + 'eli' (<-'e') + 'izer' 'ization' + (<-'ize') + 'ational' 'ation' 'ator' + (<-'ate') + 'alli' (<-'al') + 'alism' 'aliti' + (<-'al') + 'fulness' (<-'ful') + 'ousli' 'ousness' + (<-'ous') + 'iveness' 'iviti' + (<-'ive') + 'biliti' (<-'ble') + ) + ) + + define Step_3 as ( + [substring] R1 among ( + 'alize' (<-'al') + 'icate' 'iciti' 'ical' + (<-'ic') + 'ative' 'ful' 'ness' + (delete) + ) + ) + + define Step_4 as ( + 
[substring] R2 among ( + 'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement' + 'ment' 'ent' 'ou' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize' + (delete) + 'ion' ('s' or 't' delete) + ) + ) + + define Step_5a as ( + ['e'] + R2 or (R1 not shortv) + delete + ) + + define Step_5b as ( + ['l'] + R2 'l' + delete + ) +) + +define stem as ( + + unset Y_found + do ( ['y'] <-'Y' set Y_found) + do repeat(goto (v ['y']) <-'Y' set Y_found) + + $p1 = limit + $p2 = limit + do( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) + + backwards ( + do Step_1a + do Step_1b + do Step_1c + do Step_2 + do Step_3 + do Step_4 + do Step_5a + do Step_5b + ) + + do(Y_found repeat(goto (['Y']) <-'y')) + +) diff --git a/contrib/snowball/algorithms/portuguese.sbl b/contrib/snowball/algorithms/portuguese.sbl new file mode 100644 index 0000000..3fb14f1 --- /dev/null +++ b/contrib/snowball/algorithms/portuguese.sbl @@ -0,0 +1,218 @@ +routines ( + prelude postlude mark_regions + RV R1 R2 + standard_suffix + verb_suffix + residual_suffix + residual_form +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v ) + +stringescapes {} + +/* special characters */ + +stringdef a' '{U+00E1}' // a-acute +stringdef a^ '{U+00E2}' // a-circumflex e.g. 
'bota^nico +stringdef e' '{U+00E9}' // e-acute +stringdef e^ '{U+00EA}' // e-circumflex +stringdef i' '{U+00ED}' // i-acute +stringdef o^ '{U+00F4}' // o-circumflex +stringdef o' '{U+00F3}' // o-acute +stringdef u' '{U+00FA}' // u-acute +stringdef c, '{U+00E7}' // c-cedilla + +stringdef a~ '{U+00E3}' // a-tilde +stringdef o~ '{U+00F5}' // o-tilde + + +define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}' + +define prelude as repeat ( + [substring] among( + '{a~}' (<- 'a~') + '{o~}' (<- 'o~') + '' (next) + ) //or next +) + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + [substring] among( + 'a~' (<- '{a~}') + 'o~' (<- '{o~}') + '' (next) + ) //or next +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define standard_suffix as ( + [substring] among( + + 'eza' 'ezas' + 'ico' 'ica' 'icos' 'icas' + 'ismo' 'ismos' + '{a'}vel' + '{i'}vel' + 'ista' 'istas' + 'oso' 'osa' 'osos' 'osas' + 'amento' 'amentos' + 'imento' 'imentos' + + 'adora' 'ador' 'a{c,}a~o' + 'adoras' 'adores' 'a{c,}o~es' // no -ic test + 'ante' 'antes' '{a^}ncia' // Note 1 + ( + R2 delete + ) + 'logia' + 'logias' + ( + R2 <- 'log' + ) + 'u{c,}a~o' 'u{c,}o~es' + ( + R2 <- 'u' + ) + '{e^}ncia' '{e^}ncias' + ( + R2 <- 'ente' + ) + 'amente' + ( + R1 delete + try ( + [substring] R2 delete among( + 'iv' (['at'] R2 delete) + 'os' + 'ic' + 'ad' + ) + ) + ) + 'mente' + ( + R2 delete + try ( + [substring] among( + 'ante' // Note 1 + 'avel' + '{i'}vel' (R2 delete) + ) + ) + ) + 'idade' + 'idades' + ( + R2 delete + try ( + [substring] among( + 'abil' + 'ic' + 'iv' (R2 delete) + ) + ) + ) + 'iva' 'ivo' + 'ivas' 'ivos' + ( + R2 delete + try ( + ['at'] R2 delete // but not a further ['ic'] R2 delete + 
) + ) + 'ira' 'iras' + ( + RV 'e' // -eira -eiras usually non-verbal + <- 'ir' + ) + ) + ) + + define verb_suffix as setlimit tomark pV for ( + [substring] among( + 'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}' + 'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste' + 'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam' + 'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem' + 'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o' + 'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias' + 'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras' + 'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres' + 'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is' + 'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis' + '{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis' + '{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos' + '{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos' + 'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos' + 'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos' + '{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou' + + 'ira' 'iras' + (delete) + ) + ) + + define residual_suffix as ( + [substring] among( + 'os' + 'a' 'i' 'o' '{a'}' '{i'}' '{o'}' + ( RV delete ) + ) + ) + + define residual_form as ( + [substring] among( + 'e' '{e'}' '{e^}' + ( RV delete [('u'] test 'g') or + ('i'] test 'c') RV delete ) + '{c,}' (<-'c') + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards ( + do ( + ( ( standard_suffix or verb_suffix ) + and do ( ['i'] test 'c' RV delete ) + ) + or residual_suffix + ) + do residual_form + ) + do postlude +) + +/* + Note 1: additions of 15 Jun 2005 +*/ diff --git a/contrib/snowball/algorithms/romanian.sbl b/contrib/snowball/algorithms/romanian.sbl new file mode 100644 index 0000000..7db9e0a --- /dev/null +++ b/contrib/snowball/algorithms/romanian.sbl @@ -0,0 +1,236 @@ + +routines ( + prelude postlude mark_regions + RV R1 
R2 + step_0 + standard_suffix combo_suffix + verb_suffix + vowel_suffix +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v ) + +booleans ( standard_suffix_removed ) + +stringescapes {} + +/* special characters */ + +stringdef a^ '{U+00E2}' // a circumflex +stringdef i^ '{U+00EE}' // i circumflex +stringdef a+ '{U+0103}' // a breve +stringdef s, '{U+015F}' // s cedilla +stringdef t, '{U+0163}' // t cedilla + +define v 'aeiou{a^}{i^}{a+}' + +define prelude as ( + repeat goto ( + v [ ('u' ] v <- 'U') or + ('i' ] v <- 'I') + ) +) + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + + [substring] among( + 'I' (<- 'i') + 'U' (<- 'u') + '' (next) + ) + +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define step_0 as ( + [substring] R1 among( + 'ul' 'ului' + ( delete ) + 'aua' + ( <-'a' ) + 'ea' 'ele' 'elor' + ( <-'e' ) + 'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor' + ( <-'i') + 'ile' + ( not 'ab' <- 'i' ) + 'atei' + ( <- 'at' ) + 'a{t,}ie' 'a{t,}ia' + ( <- 'a{t,}i' ) + ) + ) + + define combo_suffix as test ( + [substring] R1 ( + among( + /* 'IST'. 
alternative: include the following + 'alism' 'alisme' + 'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' ( + <- 'al' + ) + */ + 'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' ( + <- 'abil' + ) + 'ibilitate' ( + <- 'ibil' + ) + 'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' ( + <- 'iv' + ) + 'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i' + 'icator' 'icatori' + 'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}' + 'ical' 'icala' 'icale' 'icali' 'ical{a+}' ( + <- 'ic' + ) + 'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune' + 'atoare' 'ator' 'atori' + '{a+}toare' '{a+}tor' '{a+}tori' ( + <- 'at' + ) + 'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune' + 'itoare' 'itor' 'itori' ( + <- 'it' + ) + ) + set standard_suffix_removed + ) + ) + + define standard_suffix as ( + unset standard_suffix_removed + repeat combo_suffix + [substring] R2 ( + among( + + // past participle is treated here, rather than + // as a verb ending: + 'at' 'ata' 'at{a+}' 'ati' 'ate' + 'ut' 'uta' 'ut{a+}' 'uti' 'ute' + 'it' 'ita' 'it{a+}' 'iti' 'ite' + + 'ic' 'ica' 'ice' 'ici' 'ic{a+}' + 'abil' 'abila' 'abile' 'abili' 'abil{a+}' + 'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}' + 'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i' + 'ant' 'anta' 'ante' 'anti' 'ant{a+}' + 'ator' 'atori' + 'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i' + 'iv' 'iva' 'ive' 'ivi' 'iv{a+}' ( + delete + ) + 'iune' 'iuni' ( + '{t,}'] <- 't' + ) + 'ism' 'isme' + 'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' ( + <- 'ist' + /* 'IST'. 
alternative: remove with <- '' */ + ) + ) + set standard_suffix_removed + ) + ) + + define verb_suffix as setlimit tomark pV for ( + [substring] among( + // 'long' infinitive: + 'are' 'ere' 'ire' '{a^}re' + + // gerund: + 'ind' '{a^}nd' + 'indu' '{a^}ndu' + + 'eze' + 'easc{a+}' + // present: + 'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti' + 'e{s,}te' + '{a+}sc' '{a+}{s,}ti' + '{a+}{s,}te' + + // imperfect: + 'am' 'ai' 'au' + 'eam' 'eai' 'ea' 'ea{t,}i' 'eau' + 'iam' 'iai' 'ia' 'ia{t,}i' 'iau' + + // past: // (not 'ii') + 'ui' + 'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}' + 'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}' + 'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}' + '{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}' + + // pluferfect: + 'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}' + 'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}' + '{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i' + '{a^}ser{a+}' + 'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}' + + ( non-v or 'u' delete ) + + // present: + '{a+}m' 'a{t,}i' + 'em' 'e{t,}i' + 'im' 'i{t,}i' + '{a^}m' '{a^}{t,}i' + + // past: + 'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}' + 'sei' 'se' + + // pluperfect: + 'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}' + (delete) + ) + ) + + define vowel_suffix as ( + [substring] RV among ( + 'a' 'e' 'i' 'ie' '{a+}' ( delete ) + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards ( + do step_0 + do standard_suffix + do ( standard_suffix_removed or verb_suffix ) + do vowel_suffix + ) + do postlude +) + diff --git a/contrib/snowball/algorithms/russian.sbl b/contrib/snowball/algorithms/russian.sbl new file mode 100644 index 0000000..20de639 --- /dev/null +++ b/contrib/snowball/algorithms/russian.sbl @@ -0,0 +1,221 @@ +stringescapes {} + +/* the 33 Cyrillic letters represented in ASCII characters following the + * conventions of the standard Library of Congress transliteration: */ + 
+stringdef a '{U+0430}' +stringdef b '{U+0431}' +stringdef v '{U+0432}' +stringdef g '{U+0433}' +stringdef d '{U+0434}' +stringdef e '{U+0435}' +stringdef e" '{U+0451}' +stringdef zh '{U+0436}' +stringdef z '{U+0437}' +stringdef i '{U+0438}' +stringdef i` '{U+0439}' +stringdef k '{U+043A}' +stringdef l '{U+043B}' +stringdef m '{U+043C}' +stringdef n '{U+043D}' +stringdef o '{U+043E}' +stringdef p '{U+043F}' +stringdef r '{U+0440}' +stringdef s '{U+0441}' +stringdef t '{U+0442}' +stringdef u '{U+0443}' +stringdef f '{U+0444}' +stringdef kh '{U+0445}' +stringdef ts '{U+0446}' +stringdef ch '{U+0447}' +stringdef sh '{U+0448}' +stringdef shch '{U+0449}' +stringdef " '{U+044A}' +stringdef y '{U+044B}' +stringdef ' '{U+044C}' +stringdef e` '{U+044D}' +stringdef iu '{U+044E}' +stringdef ia '{U+044F}' + +routines ( mark_regions R2 + perfective_gerund + adjective + adjectival + reflexive + verb + noun + derivational + tidy_up +) + +externals ( stem ) + +integers ( pV p2 ) + +groupings ( v ) + +define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}' + +define mark_regions as ( + + $pV = limit + $p2 = limit + do ( + gopast v setmark pV gopast non-v + gopast v gopast non-v setmark p2 + ) +) + +backwardmode ( + + define R2 as $p2 <= cursor + + define perfective_gerund as ( + [substring] among ( + '{v}' + '{v}{sh}{i}' + '{v}{sh}{i}{s}{'}' + ('{a}' or '{ia}' delete) + '{i}{v}' + '{i}{v}{sh}{i}' + '{i}{v}{sh}{i}{s}{'}' + '{y}{v}' + '{y}{v}{sh}{i}' + '{y}{v}{sh}{i}{s}{'}' + (delete) + ) + ) + + define adjective as ( + [substring] among ( + '{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}' + '{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}' + '{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}' + '{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}' + '{ia}{ia}' + // and - + '{o}{iu}' // - which is somewhat archaic + '{e}{iu}' // - soft form of {o}{iu} + (delete) + ) + ) + + define adjectival as ( + adjective + + /* of the participle forms, em, vsh, ivsh, yvsh are 
readily removable. + nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of + errors. Removing im, uem, enn creates too many errors. + */ + + try ( + [substring] among ( + '{e}{m}' // present passive participle + '{n}{n}' // adjective from past passive participle + '{v}{sh}' // past active participle + '{iu}{shch}' '{shch}' // present active participle + ('{a}' or '{ia}' delete) + + //but not '{i}{m}' '{u}{e}{m}' // present passive participle + //or '{e}{n}{n}' // adjective from past passive participle + + '{i}{v}{sh}' '{y}{v}{sh}'// past active participle + '{u}{iu}{shch}' // present active participle + (delete) + ) + ) + + ) + + define reflexive as ( + [substring] among ( + '{s}{ia}' + '{s}{'}' + (delete) + ) + ) + + define verb as ( + [substring] among ( + '{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}' + '{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}' + '{n}{y}' '{t}{'}' '{e}{sh}{'}' + + '{n}{n}{o}' + ('{a}' or '{ia}' delete) + + '{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}' + '{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}' + '{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}' + '{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}' + '{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}' + '{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}' + (delete) + /* note the short passive participle tests: + '{n}{a}' '{n}' '{n}{o}' '{n}{y}' + '{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}' + */ + ) + ) + + define noun as ( + [substring] among ( + '{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}' + '{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}' + '{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}' + '{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}' + '{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}' + '{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}' + (delete) + /* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}' + '{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' 
'{e}{n}{a}{x}' + omitted - they only occur on 12 words. + */ + ) + ) + + define derivational as ( + [substring] R2 among ( + '{o}{s}{t}' + '{o}{s}{t}{'}' + (delete) + ) + ) + + define tidy_up as ( + [substring] among ( + + '{e}{i`}{sh}' + '{e}{i`}{sh}{e}' // superlative forms + (delete + ['{n}'] '{n}' delete + ) + '{n}' + ('{n}' delete) // e.g. -nno endings + '{'}' + (delete) // with some slight false conflations + ) + ) +) + +define stem as ( + + // Normalise {e"} to {e}. The documentation has long suggested the user + // should do this before calling the stemmer - we now do it for them. + do repeat ( goto (['{e"}']) <- '{e}' ) + + do mark_regions + backwards setlimit tomark pV for ( + do ( + perfective_gerund or + ( try reflexive + adjectival or verb or noun + ) + ) + try([ '{i}' ] delete) + // because noun ending -i{iu} is being treated as verb ending -{iu} + + do derivational + do tidy_up + ) +) diff --git a/contrib/snowball/algorithms/serbian.sbl b/contrib/snowball/algorithms/serbian.sbl new file mode 100644 index 0000000..bddf76b --- /dev/null +++ b/contrib/snowball/algorithms/serbian.sbl @@ -0,0 +1,2378 @@ +/* Stemmer for Serbian language, based on: + * + * Ljubesic, Nikola. Pandzic, Ivan. 
Stemmer for Croatian + * http://nlp.ffzg.hr/resources/tools/stemmer-for-croatian/ + * + * authors: Stefan Petkovic and Dragan Ivanovic + * emails: petkovic8 at gmail.com and dragan.ivanovic at uns.ac.rs + * version: 1.0 (20.04.2019) +*/ + +routines ( + cyr_to_lat + prelude + mark_regions + R1 R2 + Step_1 + Step_2 + Step_3 +) + +externals ( stem ) + +integers ( p1 p2 p3 ) + +groupings ( v ca sa rg ) + +stringescapes {} + +/* special characters - Unicode codepoints */ + +/* serbian cyrillic */ + +stringdef cyrA '{U+0430}' +stringdef cyrB '{U+0431}' +stringdef cyrV '{U+0432}' +stringdef cyrG '{U+0433}' +stringdef cyrD '{U+0434}' +stringdef cyrDx '{U+0452}' +stringdef cyrE '{U+0435}' +stringdef cyrZh '{U+0436}' +stringdef cyrZ '{U+0437}' +stringdef cyrI '{U+0438}' +stringdef cyrJ '{U+0458}' +stringdef cyrK '{U+043A}' +stringdef cyrL '{U+043B}' +stringdef cyrLJ '{U+0459}' +stringdef cyrM '{U+043C}' +stringdef cyrN '{U+043D}' +stringdef cyrNJ '{U+045A}' +stringdef cyrO '{U+043E}' +stringdef cyrP '{U+043F}' +stringdef cyrR '{U+0440}' +stringdef cyrS '{U+0441}' +stringdef cyrT '{U+0442}' +stringdef cyrCy '{U+045B}' +stringdef cyrU '{U+0443}' +stringdef cyrF '{U+0444}' +stringdef cyrH '{U+0445}' +stringdef cyrC '{U+0446}' +stringdef cyrCx '{U+0447}' +stringdef cyrDzx '{U+045F}' +stringdef cyrSx '{U+0448}' + +/* serbian latin with diacritics */ + +stringdef cx '{U+010D}' // small c with caron +stringdef cy '{U+0107}' // small c with acute +stringdef zx '{U+017E}' // small z with caron +stringdef sx '{U+0161}' // small s with caron +stringdef dx '{U+0111}' // small d with stroke + +define v 'aeiou' +define sa '{cx}{cy}{zx}{sx}{dx}' +define ca 'bvgdzjklmnprstfhc' + sa +define rg 'r' + + +define cyr_to_lat as ( + + do repeat goto ( + [substring] among ( + '{cyrA}' (<- 'a') + '{cyrB}' (<- 'b') + '{cyrV}' (<- 'v') + '{cyrG}' (<- 'g') + '{cyrD}' (<- 'd') + '{cyrDx}' (<- '{dx}') + '{cyrE}' (<- 'e') + '{cyrZh}' (<- '{zx}') + '{cyrZ}' (<- 'z') + '{cyrI}' (<- 'i') + '{cyrJ}' (<- 'j') 
+ '{cyrK}' (<- 'k') + '{cyrL}' (<- 'l') + '{cyrLJ}' (<- 'lj') + '{cyrM}' (<- 'm') + '{cyrN}' (<- 'n') + '{cyrNJ}' (<- 'nj') + '{cyrO}' (<- 'o') + '{cyrP}' (<- 'p') + '{cyrR}' (<- 'r') + '{cyrS}' (<- 's') + '{cyrT}' (<- 't') + '{cyrCy}' (<- '{cy}') + '{cyrU}' (<- 'u') + '{cyrF}' (<- 'f') + '{cyrH}' (<- 'h') + '{cyrC}' (<- 'c') + '{cyrCx}' (<- '{cx}') + '{cyrDzx}' (<- 'd{zx}') + '{cyrSx}' (<- '{sx}') + ) + ) + +) + +define prelude as ( + + do repeat goto ( + ca ['ije'] ca <- 'e' + ) + + do repeat goto ( + ca ['je'] ca <- 'e' + ) + + do repeat goto ( + ['dj'] <- '{dx}' + ) + +) + +define mark_regions as ( + + $p3 = 0 + + do ( + gopast sa setmark p3 + ) + + $p1 = limit + $p2 = 0 + + do ( + gopast v setmark p1 + ) + do ( + gopast 'r' setmark p2 + $(p1 - p2 > 1) ($p1 = p2) + ) + ($p1 < 2) ( + ($p1 == p2 gopast 'r' gopast non-rg) or ($p1 != p2 gopast v gopast non-v) + setmark p1 + ) + +) + +backwardmode ( + + define R1 as $p1 <= cursor + define R2 as $p3 == 0 + + define Step_1 as ( + [substring] among ( + 'lozi' + 'lozima' (<-'loga') + 'pesi' + 'pesima' (<-'peh') + 'vojci' (<-'vojka') + 'bojci' (<-'bojka') + 'jaci' + 'jacima' (<-'jak') + '{cx}ajan' (<-'{cx}ajni') + 'cajan' (R2 <-'cajni') + 'eran' (<-'erni') + 'laran' (<-'larni') + 'esan' (<-'esni') + 'anjac' (<-'anjca') + 'ajac' + 'ajaca' (<-'ajca') + 'ljaca' + 'ljac' (<-'ljca') + 'ejac' + 'ejaca' (<-'ejca') + 'ojac' + 'ojaca' (<-'ojca') + 'ajaka' (<-'ajka') + 'ojaka' (<-'ojka') + '{sx}aca' + '{sx}ac' (<-'{sx}ca') + 'inzima' + 'inzi' (<-'ing') + 'tvenici' (<-'tvenik') + 'tetici' + 'teticima' (<-'tetika') + 'nstava' (<-'nstva') + 'nicima' (<-'nik') + 'ticima' (<-'tik') + 'zicima' (<-'zik') + 'snici' (<-'snik') + 'kuse' (<-'kusi') + 'kusan' (<-'kusni') + 'kustava' (<-'kustva') + 'du{sx}an' (<-'du{sx}ni') + 'dusan' (R2 <-'dusni') + 'antan' (<-'antni') + 'bilan' (<-'bilni') + 'tilan' (<-'tilni') + 'avilan' (<-'avilni') + 'silan' (<-'silni') + 'gilan' (<-'gilni') + 'rilan' (<-'rilni') + 'nilan' (<-'nilni') + 'alan' (<-'alni') 
+ 'ozan' (<-'ozni') + 'rave' (<-'ravi') + 'stavan' (<-'stavni') + 'pravan' (<-'pravni') + 'tivan' (<-'tivni') + 'sivan' (<-'sivni') + 'atan' (<-'atni') + 'enat' (<-'enta') + 'tetan' (<-'tetni') + 'pletan' (<-'pletni') + '{sx}ave' (<-'{sx}avi') + 'save' (R2 <-'savi') + 'anata' (<-'anta') + 'a{cx}ak' + 'a{cx}aka' (<-'a{cx}ka') + 'acak' + 'acaka' (R2 <-'acka') + 'u{sx}ak' (<-'u{sx}ka') + 'usak' (R2 <-'uska') + 'atak' + 'ataka' + 'atci' + 'atcima' (<-'atka') + 'etak' + 'etaka' (<-'etka') + 'itak' + 'itaka' + 'itci' (<-'itka') + 'otak' + 'otaka' (<-'otka') + 'utak' + 'utaka' + 'utci' + 'utcima' (<-'utka') + 'eskan' (<-'eskna') + 'ti{cx}an' (<-'ti{cx}ni') + 'tican' (R2 <-'ticni') + 'ojsci' (<-'ojska') + 'esama' (<-'esma') + 'metar' + 'metara' (<-'metra') + 'centar' + 'centara' (<-'centra') + 'istar' + 'istara' (<-'istra') + 'o{sx}{cy}u' (<-'osti') + 'oscu' (R2 <-'osti') + 'daba' (<-'dba') + '{cx}cima' + '{cx}ci' (<-'{cx}ka') + 'mac' + 'maca' (<-'mca') + 'naca' + 'nac' (<-'nca') + 'voljan' (<-'voljni') + 'anaka' (<-'anki') + 'vac' + 'vaca' (<-'vca') + 'saca' + 'sac' (<-'sca') + 'raca' + 'rac' (<-'rca') + 'aoca' + 'alaca' + 'alac' (<-'alca') + 'elaca' + 'elac' (<-'elca') + 'olaca' + 'olac' + 'olce' (<-'olca') + 'njac' + 'njaca' (<-'njca') + 'ekata' + 'ekat' (<-'ekta') + 'izam' + 'izama' (<-'izma') + 'jebe' (<-'jebi') + 'baci' (<-'baci') + 'a{sx}an' (<-'a{sx}ni') + 'asan' (R2 <-'asni') + ) + ) + + define Step_2 as ( + [substring] R1 among ( + 'skijima' + 'skijega' + 'skijemu' + 'skijem' + 'skega' + 'skemu' + 'skem' + 'skijim' + 'skijih' + 'skijoj' + 'skijeg' + 'skiji' + 'skije' + 'skija' + 'skoga' + 'skome' + 'skomu' + 'skima' + 'skog' + 'skom' + 'skim' + 'skih' + 'skoj' + 'ski' + 'ske' + 'sko' + 'ska' + 'sku' (<-'sk') + '{sx}kijima' + '{sx}kijega' + '{sx}kijemu' + '{sx}kijem' + '{sx}kega' + '{sx}kemu' + '{sx}kem' + '{sx}kijim' + '{sx}kijih' + '{sx}kijoj' + '{sx}kijeg' + '{sx}kiji' + '{sx}kije' + '{sx}kija' + '{sx}koga' + '{sx}kome' + '{sx}komu' + '{sx}kima' + '{sx}kog' + 
'{sx}kom' + '{sx}kim' + '{sx}kih' + '{sx}koj' + '{sx}ki' + '{sx}ke' + '{sx}ko' + '{sx}ka' + '{sx}ku' (<-'{sx}k') + 'stvima' + 'stvom' + 'stvo' + 'stva' + 'stvu' (<-'stv') + '{sx}tvima' + '{sx}tvom' + '{sx}tvo' + '{sx}tva' + '{sx}tvu' (<-'{sx}tv') + 'tanijama' + 'tanijima' + 'tanijom' + 'tanija' + 'taniju' + 'tanije' + 'taniji' (<-'tanij') + 'manijama' + 'manijima' + 'manijom' + 'manija' + 'maniju' + 'manije' + 'maniji' (<-'manij') + 'panijama' + 'panijima' + 'panijom' + 'panija' + 'paniju' + 'panije' + 'paniji' (<-'panij') + 'ranijama' + 'ranijima' + 'ranijom' + 'ranija' + 'raniju' + 'ranije' + 'raniji' (<-'ranij') + 'ganijama' + 'ganijima' + 'ganijom' + 'ganija' + 'ganiju' + 'ganije' + 'ganiji' (<-'ganij') + 'aninom' + 'anina' + 'aninu' + 'anine' + 'anima' + 'anin' + 'anom' + 'anu' + 'ani' + 'ana' + 'ane' (<-'an') + 'inima' + 'inama' + 'inom' + 'ina' + 'ine' + 'ini' + 'inu' + 'ino' (<-'in') + 'onovima' + 'onova' + 'onove' + 'onovi' + 'onima' + 'onom' + 'ona' + 'one' + 'oni' + 'onu' (<-'on') + 'nijima' + 'nijega' + 'nijemu' + 'nijeg' + 'nijem' + 'nega' + 'nemu' + 'neg' + 'nem' + 'nijim' + 'nijih' + 'nijoj' + 'niji' + 'nije' + 'nija' + 'niju' + 'nima' + 'nome' + 'nomu' + 'noga' + 'noj' + 'nom' + 'nih' + 'nim' + 'nog' + 'no' + 'ne' + 'na' + 'nu' + 'ni' (<-'n') + 'a{cy}oga' + 'a{cy}ome' + 'a{cy}omu' + 'a{cy}ega' + 'a{cy}emu' + 'a{cy}ima' + 'a{cy}oj' + 'a{cy}ih' + 'a{cy}om' + 'a{cy}eg' + 'a{cy}em' + 'a{cy}og' + 'a{cy}uh' + 'a{cy}im' + 'a{cy}e' + 'a{cy}a' (<-'a{cy}') + 'e{cy}oga' + 'e{cy}ome' + 'e{cy}omu' + 'e{cy}ega' + 'e{cy}emu' + 'e{cy}ima' + 'e{cy}oj' + 'e{cy}ih' + 'e{cy}om' + 'e{cy}eg' + 'e{cy}em' + 'e{cy}og' + 'e{cy}uh' + 'e{cy}im' + 'e{cy}e' + 'e{cy}a' (<-'e{cy}') + 'u{cy}oga' + 'u{cy}ome' + 'u{cy}omu' + 'u{cy}ega' + 'u{cy}emu' + 'u{cy}ima' + 'u{cy}oj' + 'u{cy}ih' + 'u{cy}om' + 'u{cy}eg' + 'u{cy}em' + 'u{cy}og' + 'u{cy}uh' + 'u{cy}im' + 'u{cy}e' + 'u{cy}a' (<-'u{cy}') + 'ugovima' + 'ugovi' + 'ugove' + 'ugova' (<-'ugov') + 'ugama' + 'ugom' + 'uga' + 'uge' + 'ugi' 
+ 'ugu' + 'ugo' (<-'ug') + 'logama' + 'logom' + 'loga' + 'logu' + 'loge' (<-'log') + 'govima' + 'gama' + 'govi' + 'gove' + 'gova' + 'gom' + 'ga' + 'ge' + 'gi' + 'gu' + 'go' (<-'g') + 'rarijem' + 'rarija' + 'rariju' + 'rario' (<-'rari') + 'otijem' + 'otija' + 'otiju' + 'otio' (<-'oti') + 'sijem' + 'sija' + 'siju' + 'sio' (<-'si') + 'lijem' + 'lija' + 'liju' + 'lio' (<-'li') + 'uju{cy}i' + 'ujemo' + 'ujete' + 'ujmo' + 'ujem' + 'uje{sx}' + 'uje' + 'uju' (<-'uj') + 'cajevima' + 'cajevi' + 'cajeva' + 'cajeve' + 'cajama' + 'cajima' + 'cajem' + 'caja' + 'caje' + 'caji' + 'caju' (<-'caj') + '{cx}ajevima' + '{cx}ajevi' + '{cx}ajeva' + '{cx}ajeve' + '{cx}ajama' + '{cx}ajima' + '{cx}ajem' + '{cx}aja' + '{cx}aje' + '{cx}aji' + '{cx}aju' (<-'{cx}aj') + '{cy}ajevima' + '{cy}ajevi' + '{cy}ajeva' + '{cy}ajeve' + '{cy}ajama' + '{cy}ajima' + '{cy}ajem' + '{cy}aja' + '{cy}aje' + '{cy}aji' + '{cy}aju' (<-'{cy}aj') + '{dx}ajevima' + '{dx}ajevi' + '{dx}ajeva' + '{dx}ajeve' + '{dx}ajama' + '{dx}ajima' + '{dx}ajem' + '{dx}aja' + '{dx}aje' + '{dx}aji' + '{dx}aju' (<-'{dx}aj') + 'lajevima' + 'lajevi' + 'lajeva' + 'lajeve' + 'lajama' + 'lajima' + 'lajem' + 'laja' + 'laje' + 'laji' + 'laju' (<-'laj') + 'rajevima' + 'rajevi' + 'rajeva' + 'rajeve' + 'rajama' + 'rajima' + 'rajem' + 'raja' + 'raje' + 'raji' + 'raju' (<-'raj') + 'bijima' + 'bijama' + 'bijom' + 'bija' + 'bije' + 'biji' + 'biju' + 'bijo' (<-'bij') + 'cijima' + 'cijama' + 'cijom' + 'cija' + 'cije' + 'ciji' + 'ciju' + 'cijo' (<-'cij') + 'dijima' + 'dijama' + 'dijom' + 'dija' + 'dije' + 'diji' + 'diju' + 'dijo' (<-'dij') + 'lijima' + 'lijama' + 'lijom' + 'lije' + 'liji' + 'lijo' (<-'lij') + 'nijama' + 'nijom' + 'nijo' (<-'nij') + 'mijima' + 'mijama' + 'mijom' + 'mija' + 'mije' + 'miji' + 'miju' + 'mijo' (<-'mij') + '{zx}ijima' + '{zx}ijama' + '{zx}ijom' + '{zx}ija' + '{zx}ije' + '{zx}iji' + '{zx}iju' + '{zx}ijo' (<-'{zx}ij') + 'gijima' + 'gijama' + 'gijom' + 'gija' + 'gije' + 'giji' + 'giju' + 'gijo' (<-'gij') + 'fijima' + 'fijama' + 
'fijom' + 'fija' + 'fije' + 'fiji' + 'fiju' + 'fijo' (<-'fij') + 'pijima' + 'pijama' + 'pijom' + 'pija' + 'pije' + 'piji' + 'piju' + 'pijo' (<-'pij') + 'rijima' + 'rijama' + 'rijom' + 'rija' + 'rije' + 'riji' + 'riju' + 'rijo' (<-'rij') + 'sijima' + 'sijama' + 'sijom' + 'sije' + 'siji' + 'sijo' (<-'sij') + 'tijima' + 'tijama' + 'tijom' + 'tija' + 'tije' + 'tiji' + 'tiju' + 'tijo' (<-'tij') + 'zijima' + 'zijama' + 'zijom' + 'zija' + 'zije' + 'ziji' + 'ziju' + 'zijo' (<-'zij') + 'nalima' + 'nalama' + 'nalom' + 'nala' + 'nale' + 'nali' + 'nalu' + 'nalo' (<-'nal') + 'ijalima' + 'ijalama' + 'ijalom' + 'ijala' + 'ijale' + 'ijali' + 'ijalu' + 'ijalo' (<-'ijal') + 'ozilima' + 'ozilom' + 'ozila' + 'ozile' + 'ozilu' + 'ozili' (<-'ozil') + 'olovima' + 'olovi' + 'olova' + 'olove' (<-'olov') + 'olima' + 'olom' + 'ola' + 'olu' + 'ole' + 'oli' (<-'ol') + 'lemama' + 'lemima' + 'lemom' + 'lema' + 'leme' + 'lemi' + 'lemu' + 'lemo' (<-'lem') + 'ramama' + 'ramom' + 'rama' + 'rame' + 'rami' + 'ramu' + 'ramo' (<-'ram') + 'arama' + 'arima' + 'arom' + 'aru' + 'ara' + 'are' + 'ari' (<-'ar') + 'drama' + 'drima' + 'drom' + 'dru' + 'dra' + 'dre' + 'dri' (<-'dr') + 'erama' + 'erima' + 'erom' + 'eru' + 'era' + 'ere' + 'eri' (<-'er') + 'orama' + 'orima' + 'orom' + 'oru' + 'ora' + 'ore' + 'ori' (<-'or') + 'esima' + 'esom' + 'ese' + 'esa' + 'esu' (<-'es') + 'isima' + 'isom' + 'ise' + 'isa' + 'isu' (<-'is') + 'ta{sx}ama' + 'ta{sx}ima' + 'ta{sx}om' + 'ta{sx}em' + 'ta{sx}a' + 'ta{sx}u' + 'ta{sx}i' + 'ta{sx}e' (<-'ta{sx}') + 'na{sx}ama' + 'na{sx}ima' + 'na{sx}om' + 'na{sx}em' + 'na{sx}a' + 'na{sx}u' + 'na{sx}i' + 'na{sx}e' (<-'na{sx}') + 'ja{sx}ama' + 'ja{sx}ima' + 'ja{sx}om' + 'ja{sx}em' + 'ja{sx}a' + 'ja{sx}u' + 'ja{sx}i' + 'ja{sx}e' (<-'ja{sx}') + 'ka{sx}ama' + 'ka{sx}ima' + 'ka{sx}om' + 'ka{sx}em' + 'ka{sx}a' + 'ka{sx}u' + 'ka{sx}i' + 'ka{sx}e' (<-'ka{sx}') + 'ba{sx}ama' + 'ba{sx}ima' + 'ba{sx}om' + 'ba{sx}em' + 'ba{sx}a' + 'ba{sx}u' + 'ba{sx}i' + 'ba{sx}e' (<-'ba{sx}') + 'ga{sx}ama' + 'ga{sx}ima' 
+ 'ga{sx}om' + 'ga{sx}em' + 'ga{sx}a' + 'ga{sx}u' + 'ga{sx}i' + 'ga{sx}e' (<-'ga{sx}') + 'va{sx}ama' + 'va{sx}ima' + 'va{sx}om' + 'va{sx}em' + 'va{sx}a' + 'va{sx}u' + 'va{sx}i' + 'va{sx}e' (<-'va{sx}') + 'e{sx}ima' + 'e{sx}ama' + 'e{sx}om' + 'e{sx}em' + 'e{sx}i' + 'e{sx}e' + 'e{sx}a' + 'e{sx}u' (<-'e{sx}') + 'i{sx}ima' + 'i{sx}ama' + 'i{sx}om' + 'i{sx}em' + 'i{sx}i' + 'i{sx}e' + 'i{sx}a' + 'i{sx}u' (<-'i{sx}') + 'ikatima' + 'ikatom' + 'ikata' + 'ikate' + 'ikati' + 'ikatu' + 'ikato' (<-'ikat') + 'latima' + 'latom' + 'lata' + 'late' + 'lati' + 'latu' + 'lato' (<-'lat') + 'etama' + 'etima' + 'etom' + 'eta' + 'ete' + 'eti' + 'etu' + 'eto' (<-'et') + 'estima' + 'estama' + 'estom' + 'esta' + 'este' + 'esti' + 'estu' + 'esto' (<-'est') + 'istima' + 'istama' + 'istom' + 'ista' + 'iste' + 'isti' + 'istu' + 'isto' (<-'ist') + 'kstima' + 'kstama' + 'kstom' + 'ksta' + 'kste' + 'ksti' + 'kstu' + 'ksto' (<-'kst') + 'ostima' + 'ostama' + 'ostom' + 'osta' + 'oste' + 'osti' + 'ostu' + 'osto' (<-'ost') + 'i{sx}tima' + 'i{sx}tem' + 'i{sx}ta' + 'i{sx}te' + 'i{sx}tu' (<-'i{sx}t') + 'ovasmo' + 'ovaste' + 'ovahu' + 'ovati' + 'ova{sx}e' + 'ovali' + 'ovala' + 'ovale' + 'ovalo' + 'ovat' + 'ovah' + 'ovao' (<-'ova') + 'avijemu' + 'avijima' + 'avijega' + 'avijeg' + 'avijem' + 'avemu' + 'avega' + 'aveg' + 'avem' + 'avijim' + 'avijih' + 'avijoj' + 'avoga' + 'avome' + 'avomu' + 'avima' + 'avama' + 'aviji' + 'avije' + 'avija' + 'aviju' + 'avim' + 'avih' + 'avoj' + 'avom' + 'avog' + 'avi' + 'ava' + 'avu' + 'ave' + 'avo' (<-'av') + 'evijemu' + 'evijima' + 'evijega' + 'evijeg' + 'evijem' + 'evemu' + 'evega' + 'eveg' + 'evem' + 'evijim' + 'evijih' + 'evijoj' + 'evoga' + 'evome' + 'evomu' + 'evima' + 'evama' + 'eviji' + 'evije' + 'evija' + 'eviju' + 'evim' + 'evih' + 'evoj' + 'evom' + 'evog' + 'evi' + 'eva' + 'evu' + 'eve' + 'evo' (<-'ev') + 'ivijemu' + 'ivijima' + 'ivijega' + 'ivijeg' + 'ivijem' + 'ivemu' + 'ivega' + 'iveg' + 'ivem' + 'ivijim' + 'ivijih' + 'ivijoj' + 'ivoga' + 'ivome' + 'ivomu' + 
'ivima' + 'ivama' + 'iviji' + 'ivije' + 'ivija' + 'iviju' + 'ivim' + 'ivih' + 'ivoj' + 'ivom' + 'ivog' + 'ivi' + 'iva' + 'ivu' + 'ive' + 'ivo' (<-'iv') + 'ovijemu' + 'ovijima' + 'ovijega' + 'ovijeg' + 'ovijem' + 'ovemu' + 'ovega' + 'oveg' + 'ovijim' + 'ovijih' + 'ovijoj' + 'ovoga' + 'ovome' + 'ovomu' + 'ovima' + 'oviji' + 'ovije' + 'ovija' + 'oviju' + 'ovim' + 'ovih' + 'ovoj' + 'ovom' + 'ovog' + 'ovi' + 'ova' + 'ovu' + 'ove' + 'ovo' (<-'ov') + 'movima' + 'movom' + 'mova' + 'movu' + 'move' + 'movi' (<-'mov') + 'lovima' + 'lovom' + 'lova' + 'lovu' + 'love' + 'lovi' (<-'lov') + 'elijemu' + 'elijima' + 'elijega' + 'elijeg' + 'elijem' + 'elemu' + 'elega' + 'eleg' + 'elem' + 'elijim' + 'elijih' + 'elijoj' + 'eloga' + 'elome' + 'elomu' + 'elima' + 'eliji' + 'elije' + 'elija' + 'eliju' + 'elim' + 'elih' + 'eloj' + 'elom' + 'elog' + 'eli' + 'ela' + 'elu' + 'ele' + 'elo' (<-'el') + 'anjijemu' + 'anjijima' + 'anjijega' + 'anjijeg' + 'anjijem' + 'anjemu' + 'anjega' + 'anjeg' + 'anjem' + 'anjijim' + 'anjijih' + 'anjijoj' + 'anjoga' + 'anjome' + 'anjomu' + 'anjima' + 'anjiji' + 'anjije' + 'anjija' + 'anjiju' + 'anjim' + 'anjih' + 'anjoj' + 'anjom' + 'anjog' + 'anja' + 'anje' + 'anji' + 'anjo' + 'anju' (<-'anj') + 'enjijemu' + 'enjijima' + 'enjijega' + 'enjijeg' + 'enjijem' + 'enjemu' + 'enjega' + 'enjeg' + 'enjem' + 'enjijim' + 'enjijih' + 'enjijoj' + 'enjoga' + 'enjome' + 'enjomu' + 'enjima' + 'enjiji' + 'enjije' + 'enjija' + 'enjiju' + 'enjim' + 'enjih' + 'enjoj' + 'enjom' + 'enjog' + 'enja' + 'enje' + 'enji' + 'enjo' + 'enju' (<-'enj') + '{sx}njijemu' + '{sx}njijima' + '{sx}njijega' + '{sx}njijeg' + '{sx}njijem' + '{sx}njemu' + '{sx}njega' + '{sx}njeg' + '{sx}njem' + '{sx}njijim' + '{sx}njijih' + '{sx}njijoj' + '{sx}njoga' + '{sx}njome' + '{sx}njomu' + '{sx}njima' + '{sx}njiji' + '{sx}njije' + '{sx}njija' + '{sx}njiju' + '{sx}njim' + '{sx}njih' + '{sx}njoj' + '{sx}njom' + '{sx}njog' + '{sx}nja' + '{sx}nje' + '{sx}nji' + '{sx}njo' + '{sx}nju' (<-'{sx}nj') + 'anemu' + 'anega' + 
'aneg' + 'anem' (<-'an') + 'enemu' + 'enega' + 'eneg' + 'enem' (<-'en') + '{sx}nemu' + '{sx}nega' + '{sx}neg' + '{sx}nem' (<-'{sx}n') + '{cx}inama' + '{cx}inome' + '{cx}inomu' + '{cx}inoga' + '{cx}inima' + '{cx}inog' + '{cx}inom' + '{cx}inim' + '{cx}inih' + '{cx}inoj' + '{cx}ina' + '{cx}inu' + '{cx}ini' + '{cx}ino' + '{cx}ine' (<-'{cx}in') + 'ro{sx}iv{sx}i' + 'ro{sx}ismo' + 'ro{sx}iste' + 'ro{sx}i{sx}e' + 'ro{sx}imo' + 'ro{sx}ite' + 'ro{sx}iti' + 'ro{sx}ili' + 'ro{sx}ila' + 'ro{sx}ilo' + 'ro{sx}ile' + 'ro{sx}im' + 'ro{sx}i{sx}' + 'ro{sx}it' + 'ro{sx}ih' + 'ro{sx}io' (<-'ro{sx}i') + 'o{sx}ijemu' + 'o{sx}ijima' + 'o{sx}ijega' + 'o{sx}ijeg' + 'o{sx}ijem' + 'o{sx}emu' + 'o{sx}ega' + 'o{sx}eg' + 'o{sx}em' + 'o{sx}ijim' + 'o{sx}ijih' + 'o{sx}ijoj' + 'o{sx}oga' + 'o{sx}ome' + 'o{sx}omu' + 'o{sx}ima' + 'o{sx}iji' + 'o{sx}ije' + 'o{sx}ija' + 'o{sx}iju' + 'o{sx}im' + 'o{sx}ih' + 'o{sx}oj' + 'o{sx}om' + 'o{sx}og' + 'o{sx}i' + 'o{sx}a' + 'o{sx}u' + 'o{sx}e' (<-'o{sx}') + 'evitijima' + 'evitijega' + 'evitijemu' + 'evitijem' + 'evitega' + 'evitemu' + 'evitem' + 'evitijim' + 'evitijih' + 'evitijoj' + 'evitijeg' + 'evitiji' + 'evitije' + 'evitija' + 'evitoga' + 'evitome' + 'evitomu' + 'evitima' + 'evitog' + 'evitom' + 'evitim' + 'evitih' + 'evitoj' + 'eviti' + 'evite' + 'evito' + 'evita' + 'evitu' (<-'evit') + 'ovitijima' + 'ovitijega' + 'ovitijemu' + 'ovitijem' + 'ovitega' + 'ovitemu' + 'ovitem' + 'ovitijim' + 'ovitijih' + 'ovitijoj' + 'ovitijeg' + 'ovitiji' + 'ovitije' + 'ovitija' + 'ovitoga' + 'ovitome' + 'ovitomu' + 'ovitima' + 'ovitog' + 'ovitom' + 'ovitim' + 'ovitih' + 'ovitoj' + 'oviti' + 'ovite' + 'ovito' + 'ovita' + 'ovitu' (<-'ovit') + 'astijima' + 'astijega' + 'astijemu' + 'astijem' + 'astega' + 'astemu' + 'astem' + 'astijim' + 'astijih' + 'astijoj' + 'astijeg' + 'astiji' + 'astije' + 'astija' + 'astoga' + 'astome' + 'astomu' + 'astima' + 'astog' + 'astom' + 'astim' + 'astih' + 'astoj' + 'asti' + 'aste' + 'asto' + 'asta' + 'astu' (<-'ast') + 'kijemu' + 'kijima' + 
'kijega' + 'kijeg' + 'kijem' + 'kemu' + 'kega' + 'keg' + 'kem' + 'kijim' + 'kijih' + 'kijoj' + 'koga' + 'kome' + 'komu' + 'kima' + 'kiji' + 'kije' + 'kija' + 'kiju' + 'kim' + 'kih' + 'koj' + 'kom' + 'kog' + 'kov' + 'ki' + 'ka' + 'ku' + 'ke' + 'ko' (<-'k') + 'evaju{cy}i' + 'evasmo' + 'evaste' + 'evajmo' + 'evajte' + 'evaju' + 'evala' + 'evale' + 'evali' + 'evalo' + 'evamo' + 'evana' + 'evane' + 'evani' + 'evano' + 'evate' + 'evati' + 'eva{sx}e' + 'evahu' + 'evah' + 'evaj' + 'evam' + 'evan' + 'evao' + 'evat' + 'evav' + 'eva{sx}' (<-'eva') + 'avaju{cy}i' + 'avasmo' + 'avaste' + 'avajmo' + 'avajte' + 'avaju' + 'avala' + 'avale' + 'avali' + 'avalo' + 'avamo' + 'avana' + 'avane' + 'avani' + 'avano' + 'avate' + 'avati' + 'ava{sx}e' + 'avahu' + 'avah' + 'avaj' + 'avam' + 'avan' + 'avao' + 'avat' + 'avav' + 'ava{sx}' (<-'ava') + 'ivaju{cy}i' + 'ivasmo' + 'ivaste' + 'ivajmo' + 'ivajte' + 'ivaju' + 'ivala' + 'ivale' + 'ivali' + 'ivalo' + 'ivamo' + 'ivana' + 'ivane' + 'ivani' + 'ivano' + 'ivate' + 'ivati' + 'iva{sx}e' + 'ivahu' + 'ivah' + 'ivaj' + 'ivam' + 'ivan' + 'ivao' + 'ivat' + 'ivav' + 'iva{sx}' (<-'iva') + 'uvaju{cy}i' + 'uvasmo' + 'uvaste' + 'uvajmo' + 'uvajte' + 'uvaju' + 'uvala' + 'uvale' + 'uvali' + 'uvalo' + 'uvamo' + 'uvana' + 'uvane' + 'uvani' + 'uvano' + 'uvate' + 'uvati' + 'uva{sx}e' + 'uvahu' + 'uvah' + 'uvaj' + 'uvam' + 'uvan' + 'uvao' + 'uvat' + 'uvav' + 'uva{sx}' (<-'uva') + 'irujemo' + 'irujete' + 'iruju{cy}i' + 'iraju{cy}i' + 'irivat' + 'irujem' + 'iruje{sx}' + 'irujmo' + 'irujte' + 'irav{sx}i' + 'irasmo' + 'iraste' + 'irati' + 'iramo' + 'irate' + 'iraju' + 'ira{sx}e' + 'irahu' + 'irala' + 'iralo' + 'irali' + 'irale' + 'iruje' + 'iruju' + 'iruj' + 'iral' + 'iran' + 'iram' + 'ira{sx}' + 'irat' + 'irah' + 'irao' (<-'ir') + 'a{cx}ismo' + 'a{cx}iste' + 'a{cx}iti' + 'a{cx}imo' + 'a{cx}ite' + 'a{cx}i{sx}e' + 'a{cx}e{cy}i' + 'a{cx}ila' + 'a{cx}ilo' + 'a{cx}ili' + 'a{cx}ile' + 'a{cx}ena' + 'a{cx}eno' + 'a{cx}eni' + 'a{cx}ene' + 'a{cx}io' + 'a{cx}im' + 
'a{cx}i{sx}' + 'a{cx}it' + 'a{cx}ih' + 'a{cx}en' + 'a{cx}i' + 'a{cx}e' (<-'a{cx}') + 'a{cx}av{sx}i' + 'a{cx}asmo' + 'a{cx}aste' + 'a{cx}ahu' + 'a{cx}ati' + 'a{cx}amo' + 'a{cx}ate' + 'a{cx}a{sx}e' + 'a{cx}ala' + 'a{cx}alo' + 'a{cx}ali' + 'a{cx}ale' + 'a{cx}aju' + 'a{cx}ana' + 'a{cx}ano' + 'a{cx}ani' + 'a{cx}ane' + 'a{cx}ao' + 'a{cx}am' + 'a{cx}a{sx}' + 'a{cx}at' + 'a{cx}ah' + 'a{cx}an' (<-'a{cx}a') + 'nuv{sx}i' + 'nusmo' + 'nuste' + 'nu{cy}i' + 'nimo' + 'nite' + 'nemo' + 'nete' + 'nula' + 'nulo' + 'nule' + 'nuli' + 'nuto' + 'nuti' + 'nuta' + 'ne{sx}' + 'nuo' + 'nut' (<-'n') + 'niv{sx}i' + 'nismo' + 'niste' + 'niti' + 'nila' + 'nilo' + 'nile' + 'nili' + 'ni{sx}' + 'nio' (<-'ni') + 'aju{cy}i' + 'av{sx}i' + 'asmo' + 'ajmo' + 'ajte' + 'ajem' + 'aloj' + 'amo' + 'ate' + 'aje' + 'aju' + 'ati' + 'a{sx}e' + 'ahu' + 'ala' + 'ali' + 'ale' + 'alo' + 'ano' + 'at' + 'ah' + 'ao' + 'aj' + 'an' + 'am' + 'a{sx}' (<-'a') + 'uraju{cy}i' + 'urasmo' + 'uraste' + 'urajmo' + 'urajte' + 'uramo' + 'urate' + 'uraju' + 'urati' + 'ura{sx}e' + 'urahu' + 'urala' + 'urali' + 'urale' + 'uralo' + 'urana' + 'urano' + 'urani' + 'urane' + 'ural' + 'urat' + 'urah' + 'urao' + 'uraj' + 'uran' + 'uram' + 'ura{sx}' (<-'ur') + 'astajasmo' + 'astajaste' + 'astajahu' + 'astajati' + 'astajemo' + 'astajete' + 'astaja{sx}e' + 'astajali' + 'astaju{cy}i' + 'astajala' + 'astajalo' + 'astajale' + 'astajmo' + 'astajao' + 'astajem' + 'astaje{sx}' + 'astajat' + 'astajah' + 'astajte' + 'astaje' + 'astaju' (<-'astaj') + 'istajasmo' + 'istajaste' + 'istajahu' + 'istajati' + 'istajemo' + 'istajete' + 'istaja{sx}e' + 'istajali' + 'istaju{cy}i' + 'istajala' + 'istajalo' + 'istajale' + 'istajmo' + 'istajao' + 'istajem' + 'istaje{sx}' + 'istajat' + 'istajah' + 'istajte' + 'istaje' + 'istaju' (<-'istaj') + 'ostajasmo' + 'ostajaste' + 'ostajahu' + 'ostajati' + 'ostajemo' + 'ostajete' + 'ostaja{sx}e' + 'ostajali' + 'ostaju{cy}i' + 'ostajala' + 'ostajalo' + 'ostajale' + 'ostajmo' + 'ostajao' + 'ostajem' + 'ostaje{sx}' + 'ostajat' + 
'ostajah' + 'ostajte' + 'ostaje' + 'ostaju' (<-'ostaj') + 'alama' + 'alima' + 'alom' + 'alu' + 'al' (<-'a') + 'ajevima' + 'ajevi' + 'ajeva' + 'ajeve' + 'ajama' + 'ajima' + 'aja' + 'aji' (<-'aj') + 'astadosmo' + 'astadoste' + 'astado{sx}e' + 'astanemo' + 'astademo' + 'astanete' + 'astadete' + 'astanimo' + 'astanite' + 'astanila' + 'astav{sx}i' + 'astanem' + 'astadem' + 'astane{sx}' + 'astade{sx}' + 'astadoh' + 'astade' + 'astati' + 'astane' + 'astanu' + 'astadu' + 'astala' + 'astali' + 'astalo' + 'astale' + 'astat' + 'astao' (<-'asta') + 'istadosmo' + 'istadoste' + 'istado{sx}e' + 'istanemo' + 'istademo' + 'istanete' + 'istadete' + 'istanimo' + 'istanite' + 'istanila' + 'istav{sx}i' + 'istanem' + 'istadem' + 'istane{sx}' + 'istade{sx}' + 'istadoh' + 'istade' + 'istati' + 'istane' + 'istanu' + 'istadu' + 'istala' + 'istali' + 'istalo' + 'istale' + 'istat' + 'istao' (<-'ista') + 'ostadosmo' + 'ostadoste' + 'ostado{sx}e' + 'ostanemo' + 'ostademo' + 'ostanete' + 'ostadete' + 'ostanimo' + 'ostanite' + 'ostanila' + 'ostav{sx}i' + 'ostanem' + 'ostadem' + 'ostane{sx}' + 'ostade{sx}' + 'ostadoh' + 'ostade' + 'ostati' + 'ostane' + 'ostanu' + 'ostadu' + 'ostala' + 'ostali' + 'ostalo' + 'ostale' + 'ostat' + 'ostao' (<-'osta') + 'tasmo' + 'taste' + 'tajmo' + 'tajte' + 'tav{sx}i' + 'tati' + 'tamo' + 'tate' + 'taju' + 'tala' + 'talo' + 'tale' + 'tali' + 'tana' + 'tano' + 'tani' + 'tane' + 'tan' + 'taj' + 'tao' + 'tam' + 'ta{sx}' + 'tat' + 'tah' (<-'ta') + 'injasmo' + 'injaste' + 'injati' + 'injemo' + 'injete' + 'injali' + 'injala' + 'injalo' + 'injale' + 'inja{sx}e' + 'injahu' + 'injem' + 'inje{sx}' + 'injat' + 'injah' + 'injao' (<-'inj') + 'astemo' + 'astete' + 'astimo' + 'astite' + 'astu{cy}i' + 'aste{sx}' + 'asli' + 'asla' + 'aslo' + 'asle' (<-'as') + 'iv{sx}i' + 'ie{cy}i' + 'ismo' + 'imo' + 'ite' + 'iti' + 'ili' + 'ila' + 'ilo' + 'ile' + 'im' + 'i{sx}' + 'it' + 'ih' + 'io' (<-'i') + 'ijemo' + 'ijete' + 'ijem' + 'ije{sx}' + 'ijmo' + 'ijte' + 'iju' + 'ije' + 'ij' + 'ilu' (<-'i') 
+ 'lu{cx}ujete' + 'lu{cx}uju{cy}i' + 'lu{cx}ujemo' + 'lu{cx}ujem' + 'lu{cx}uje{sx}' + 'lu{cx}ismo' + 'lu{cx}iste' + 'lu{cx}ujmo' + 'lu{cx}ujte' + 'lu{cx}uje' + 'lu{cx}uju' + 'lu{cx}i{sx}e' + 'lu{cx}iti' + 'lu{cx}imo' + 'lu{cx}ite' + 'lu{cx}ila' + 'lu{cx}ilo' + 'lu{cx}ili' + 'lu{cx}ile' + 'lu{cx}ena' + 'lu{cx}eno' + 'lu{cx}eni' + 'lu{cx}ene' + 'lu{cx}uj' + 'lu{cx}io' + 'lu{cx}en' + 'lu{cx}im' + 'lu{cx}i{sx}' + 'lu{cx}it' + 'lu{cx}ih' + 'lu{cx}e' + 'lu{cx}i' (<-'lu{cx}') + 'jetismo' + 'jetiste' + 'jeti{sx}e' + 'jetimo' + 'jetite' + 'jetiti' + 'jetili' + 'jetila' + 'jetilo' + 'jetile' + 'jetim' + 'jeti{sx}' + 'jetit' + 'jetih' + 'jetio' (<-'jeti') + 'emo' + 'em' + 'e{sx}' + 'elama' + 'el' (<-'e') + 'ilama' + 'ilima' + 'ilom' + 'il' (<-'i') + 'atijega' + 'atijemu' + 'atijima' + 'atijeg' + 'atijem' + 'atega' + 'atemu' + 'ateg' + 'atem' + 'atijih' + 'atijim' + 'atima' + 'atoga' + 'atome' + 'atomu' + 'atiji' + 'atije' + 'atija' + 'atiju' + 'atoj' + 'atog' + 'atom' + 'atim' + 'atih' + 'ata' + 'atu' + 'ato' (<-'at') + 'etav{sx}i' + 'etu{cy}i' + 'etemo' + 'etimo' + 'etem' + 'ete{sx}' (<-'et') + 'lucujuci' + 'lucujemo' + 'lucujete' + 'lucujem' + 'lucujes' + 'lucujmo' + 'lucujte' + 'lucismo' + 'luciste' + 'luciti' + 'lucite' + 'lucise' + 'lucuje' + 'lucuju' + 'lucila' + 'lucile' + 'lucili' + 'lucilo' + 'lucena' + 'luceni' + 'lucene' + 'luceno' + 'lucimo' + 'lucim' + 'lucis' + 'lucih' + 'lucit' + 'lucio' + 'lucuj' + 'lucen' + 'luce' + 'luci' (R2 <-'luc') + 'snjijima' + 'snjijemu' + 'snjijega' + 'snjijim' + 'snjijih' + 'snjijeg' + 'snjijem' // de-accented counterpart of '{sx}njijem'; reduces to 'snj' like the rest of this group + 'snjijoj' + 'snjiji' + 'snjija' + 'snjije' + 'snjiju' + 'snjima' + 'snjemu' + 'snjomu' + 'snjome' + 'snjega' + 'snjoga' + 'snjih' + 'snjim' + 'snjem' + 'snjom' + 'snjeg' + 'snjog' + 'snjoj' + 'snja' + 'snje' + 'snji' + 'snjo' + 'snju' (R2 <-'snj') + 'osijima' + 'osijemu' + 'osijega' + 'osijih' + 'osijim' + 'osijem' + 'osijeg' + 'osijoj' + 'osima' + 'osemu' + 'osomu' + 'osome' + 'osega' + 'osoga' + 'osija' + 'osije' + 'osiji' + 'osiju' + 
'osih' + 'osim' + 'osem' + 'osom' + 'oseg' + 'osog' + 'osoj' + 'osa' + 'ose' + 'osi' + 'osu' (R2 <-'os') + 'acismo' + 'aciste' + 'acima' + 'acimo' + 'acome' + 'acomu' + 'acite' + 'aciti' + 'acise' + 'acila' + 'acile' + 'acili' + 'acilo' + 'acega' + 'acene' + 'aceci' + 'aceni' + 'acemu' + 'acena' + 'aceno' + 'acoga' + 'acoj' + 'acih' + 'acem' + 'acom' + 'acen' + 'acog' + 'acit' + 'acio' + 'aceg' + 'acim' + 'acuh' + 'acis' + 'ace' + 'aca' + 'aci' (R2 <-'ac') + 'ecome' + 'ecoga' + 'ecemu' + 'ecima' + 'ecega' + 'ecomu' + 'ecoj' + 'ecuh' + 'ecom' + 'ecog' + 'eceg' + 'ecih' + 'ecem' + 'ecim' + 'eca' + 'ece' (R2 <-'ec') + 'ucomu' + 'ucome' + 'ucima' + 'ucoga' + 'ucega' + 'ucemu' + 'ucih' + 'ucog' + 'uceg' + 'ucom' + 'ucem' + 'ucim' + 'ucuh' + 'ucoj' + 'uca' + 'uce' (R2 <-'uc') + 'rosismo' + 'rosivsi' + 'rosiste' + 'rositi' + 'rosili' + 'rosise' + 'rosite' + 'rosilo' + 'rosimo' + 'rosile' + 'rosila' + 'rosit' + 'rosis' + 'rosio' + 'rosim' + 'rosih' (R2 <-'rosi') + 'acavsi' + 'acaste' + 'acasmo' + 'acaju' + 'acane' + 'acate' + 'acali' + 'acani' + 'acati' + 'acale' + 'acahu' + 'acase' + 'acano' + 'acamo' + 'acalo' + 'acana' + 'acala' + 'acam' + 'acan' + 'acao' + 'acas' + 'acat' + 'acah' (R2 <-'aca') + 'jasima' + 'jasama' + 'jasem' + 'jasom' + 'jase' + 'jasi' + 'jasa' + 'jasu' (R2 <-'jas') + 'tasima' + 'tasama' + 'tasem' + 'tasom' + 'tase' + 'tasa' + 'tasu' + 'tasi' (R2 <-'tas') + 'gasima' + 'gasama' + 'gasem' + 'gasom' + 'gasi' + 'gasu' + 'gase' + 'gasa' (R2 <-'gas') + 'nasama' + 'nasima' + 'nasem' + 'nasom' + 'nasu' + 'nasi' + 'nase' + 'nasa' (R2 <-'nas') + 'kasama' + 'kasima' + 'kasom' + 'kasem' + 'kasi' + 'kasu' + 'kase' + 'kasa' (R2 <-'kas') + 'vasama' + 'vasima' + 'vasom' + 'vasem' + 'vasi' + 'vase' + 'vasa' + 'vasu' (R2 <-'vas') + 'basama' + 'basima' + 'basom' + 'basem' + 'basi' + 'base' + 'basu' + 'basa' (R2 <-'bas') + 'astuci' + 'astes' (R2 <-'as') + 'cinima' + 'cinome' + 'cinama' + 'cinomu' + 'cinoga' + 'cinom' + 'cinih' + 'cinim' + 'cinog' + 'cinoj' + 'cino' + 
'cini' + 'cinu' + 'cine' + 'cina' (R2 <-'cin') + 'astajase' + 'astajuci' + 'astajes' (R2 <-'astaj') + 'istajase' + 'istajuci' + 'istajes' (R2 <-'istaj') + 'ostajase' + 'ostajuci' + 'ostajes' (R2 <-'ostaj') + 'astadose' + 'astades' + 'astanes' + 'astavsi' (R2 <-'asta') + 'istadose' + 'istades' + 'istanes' + 'istavsi' (R2 <-'ista') + 'ostadose' + 'ostades' + 'ostanes' + 'ostavsi' (R2 <-'osta') + 'avajuci' + 'avase' + 'avas' (R2 <-'ava') + 'evajuci' + 'evase' + 'evas' (R2 <-'eva') + 'ivajuci' + 'ivase' + 'ivas' (R2 <-'iva') + 'uvajuci' + 'uvase' + 'uvas' (R2 <-'uva') + 'ovase' (R2 <-'ova') + 'jetise' + 'jetis' (R2 <-'jeti') + 'injase' + 'injes' (R2 <-'inj') + 'istem' (R2 <-'ist') + 'esama' + 'esem' + 'esi' (R2 <-'es') + 'etavsi' + 'etuci' + 'etes' (R2 <-'et') + 'isama' + 'isem' + 'isi' (R2 <-'is') + 'irajuci' + 'irujuci' + 'irujes' + 'iravsi' + 'irase' + 'iras' (R2 <-'ir') + 'urajuci' + 'urase' + 'uras' (R2 <-'ur') + 'ujuci' + 'ujes' (R2 <-'uj') + 'nivsi' + 'nis' (R2 <-'ni') + 'snega' + 'snemu' + 'snem' + 'sneg' (R2 <-'sn') + 'tavsi' + 'tas' (R2 <-'ta') + 'ajuci' + 'avsi' + 'ase' + 'as' (R2 <-'a') + 'ijes' + 'ivsi' + 'ieci' + 'is' (R2 <-'i') + 'es' (R2 <-'e') + 'nuvsi' + 'nuci' + 'nes' (R2 <-'n') + ) + ) + + define Step_3 as ( + [substring] R1 among ( + 'enom' + 'enoj' + 'enog' + 'enim' + 'enih' + 'anoj' + 'anog' + 'anim' + 'anih' + 'ost' + 'eno' + 'eni' + 'oga' + 'ima' + 'enu' + 'ena' + 'ama' + 'ano' + 'ani' + 'om' + 'og' + 'u' + 'o' + 'i' + 'e' + 'a' (<-'') + ) + ) +) + +define stem as ( + do cyr_to_lat + do prelude + do mark_regions + backwards ( + do Step_1 + do (Step_2 or Step_3) + ) +) diff --git a/contrib/snowball/algorithms/spanish.sbl b/contrib/snowball/algorithms/spanish.sbl new file mode 100644 index 0000000..6638f5f --- /dev/null +++ b/contrib/snowball/algorithms/spanish.sbl @@ -0,0 +1,230 @@ +routines ( + postlude mark_regions + RV R1 R2 + attached_pronoun + standard_suffix + y_verb_suffix + verb_suffix + residual_suffix +) + +externals ( stem ) + 
+integers ( pV p1 p2 ) + +groupings ( v ) + +stringescapes {} + +/* special characters */ + +stringdef a' '{U+00E1}' // a-acute +stringdef e' '{U+00E9}' // e-acute +stringdef i' '{U+00ED}' // i-acute +stringdef o' '{U+00F3}' // o-acute +stringdef u' '{U+00FA}' // u-acute +stringdef u" '{U+00FC}' // u-diaeresis +stringdef n~ '{U+00F1}' // n-tilde + +define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}' + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + [substring] among( + '{a'}' (<- 'a') + '{e'}' (<- 'e') + '{i'}' (<- 'i') + '{o'}' (<- 'o') + '{u'}' (<- 'u') + // and possibly {u"}->u here, or in prelude + '' (next) + ) //or next +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define attached_pronoun as ( + [substring] among( + 'me' 'se' 'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo' + 'las' 'les' 'los' 'nos' + ) + substring RV among( + 'i{e'}ndo' (] <- 'iendo') + '{a'}ndo' (] <- 'ando') + '{a'}r' (] <- 'ar') + '{e'}r' (] <- 'er') + '{i'}r' (] <- 'ir') + 'ando' + 'iendo' + 'ar' 'er' 'ir' + (delete) + 'yendo' ('u' delete) + ) + ) + + define standard_suffix as ( + [substring] among( + + 'anza' 'anzas' + 'ico' 'ica' 'icos' 'icas' + 'ismo' 'ismos' + 'able' 'ables' + 'ible' 'ibles' + 'ista' 'istas' + 'oso' 'osa' 'osos' 'osas' + 'amiento' 'amientos' + 'imiento' 'imientos' + ( + R2 delete + ) + 'adora' 'ador' 'aci{o'}n' + 'adoras' 'adores' 'aciones' + 'ante' 'antes' 'ancia' 'ancias'// Note 1 + ( + R2 delete + try ( ['ic'] R2 delete ) + ) + 'log{i'}a' + 'log{i'}as' + ( + R2 <- 'log' + ) + 'uci{o'}n' 'uciones' + ( + R2 <- 'u' + ) + 'encia' 'encias' + ( + R2 <- 'ente' + ) + 'amente' + ( + R1 delete + try ( + [substring] R2 delete among( + 'iv' (['at'] R2 delete) + 
'os' + 'ic' + 'ad' + ) + ) + ) + 'mente' + ( + R2 delete + try ( + [substring] among( + 'ante' // Note 1 + 'able' + 'ible' (R2 delete) + ) + ) + ) + 'idad' + 'idades' + ( + R2 delete + try ( + [substring] among( + 'abil' + 'ic' + 'iv' (R2 delete) + ) + ) + ) + 'iva' 'ivo' + 'ivas' 'ivos' + ( + R2 delete + try ( + ['at'] R2 delete // but not a further ['ic'] R2 delete + ) + ) + ) + ) + + define y_verb_suffix as ( + setlimit tomark pV for ([substring]) among( + 'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}' + 'yas' 'yes' 'yais' 'yamos' + ('u' delete) + ) + ) + + define verb_suffix as ( + setlimit tomark pV for ([substring]) among( + + 'en' 'es' '{e'}is' 'emos' + (try ('u' test 'g') ] delete) + + 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais' + 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}' + 'ar{e'}' + 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais' + 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}' + 'er{e'}' + 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais' + 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}' + 'ir{e'}' + + 'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed' + 'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an' + 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado' + 'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as' + 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases' + 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais' + 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados' + 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' + '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos' + (delete) + ) + ) + + define residual_suffix as ( + [substring] among( + 'os' + 'a' 'o' '{a'}' '{i'}' '{o'}' + ( RV delete ) + 'e' '{e'}' + ( RV delete try( ['u'] test 'g' RV delete ) ) + ) + ) +) + +define stem as ( + do mark_regions + backwards ( + do attached_pronoun + do ( standard_suffix or + y_verb_suffix or + verb_suffix + ) + do residual_suffix + ) + do postlude +) + +/* + Note 1: additions of 15 Jun 2005 +*/ diff --git 
a/contrib/snowball/algorithms/swedish.sbl b/contrib/snowball/algorithms/swedish.sbl new file mode 100644 index 0000000..2cbb885 --- /dev/null +++ b/contrib/snowball/algorithms/swedish.sbl @@ -0,0 +1,72 @@ +routines ( + mark_regions + main_suffix + consonant_pair + other_suffix +) + +externals ( stem ) + +integers ( p1 x ) + +groupings ( v s_ending ) + +stringescapes {} + +/* special characters */ + +stringdef a" '{U+00E4}' +stringdef ao '{U+00E5}' +stringdef o" '{U+00F6}' + +define v 'aeiouy{a"}{ao}{o"}' + +define s_ending 'bcdfghjklmnoprtvy' + +define mark_regions as ( + + $p1 = limit + test ( hop 3 setmark x ) + goto v gopast non-v setmark p1 + try ( $p1 < x $p1 = x ) +) + +backwardmode ( + + define main_suffix as ( + setlimit tomark p1 for ([substring]) + among( + + 'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne' + 'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 'ar' 'er' 'heter' + 'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens' + 'hetens' 'erns' 'at' 'andet' 'het' 'ast' + (delete) + 's' + (s_ending delete) + ) + ) + + define consonant_pair as setlimit tomark p1 for ( + among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt') + and ([next] delete) + ) + + define other_suffix as setlimit tomark p1 for ( + [substring] among( + 'lig' 'ig' 'els' (delete) + 'l{o"}st' (<-'l{o"}s') + 'fullt' (<-'full') + ) + ) +) + +define stem as ( + + do mark_regions + backwards ( + do main_suffix + do consonant_pair + do other_suffix + ) +) diff --git a/contrib/snowball/algorithms/tamil.sbl b/contrib/snowball/algorithms/tamil.sbl new file mode 100644 index 0000000..9635777 --- /dev/null +++ b/contrib/snowball/algorithms/tamil.sbl @@ -0,0 +1,405 @@ +/* +* Affix stripping stemming algorithm for Tamil +* By Damodharan Rajalingam +*/ + +stringescapes {} + +/* Aytham */ +stringdef aytham '{U+0B83}' + +/* Uyir - independent vowels */ +stringdef a '{U+0B85}' +stringdef aa '{U+0B86}' +stringdef i '{U+0B87}' +stringdef ii '{U+0B88}' +stringdef u '{U+0B89}' +stringdef uu 
'{U+0B8A}' +stringdef e '{U+0B8E}' +stringdef ee '{U+0B8F}' +stringdef ai '{U+0B90}' +stringdef o '{U+0B92}' +stringdef oo '{U+0B93}' +stringdef au '{U+0B94}' + +/* Consonants */ +stringdef ka '{U+0B95}' +stringdef nga '{U+0B99}' +stringdef ca '{U+0B9A}' +stringdef ja '{U+0B9C}' +stringdef nya '{U+0B9E}' +stringdef tta '{U+0B9F}' +stringdef nna '{U+0BA3}' +stringdef ta '{U+0BA4}' +stringdef tha '{U+0BA4}' /* same code point as ta */ +stringdef na '{U+0BA8}' +stringdef nnna '{U+0BA9}' +stringdef pa '{U+0BAA}' +stringdef ma '{U+0BAE}' +stringdef ya '{U+0BAF}' +stringdef ra '{U+0BB0}' +stringdef rra '{U+0BB1}' +stringdef la '{U+0BB2}' +stringdef lla '{U+0BB3}' +stringdef llla '{U+0BB4}' +stringdef zha '{U+0BB4}' /* same code point as llla */ +stringdef va '{U+0BB5}' + +/* Vatamozi - borrowed */ +stringdef sha '{U+0BB6}' +stringdef ssa '{U+0BB7}' +stringdef sa '{U+0BB8}' +stringdef ha '{U+0BB9}' + + +/* Dependent vowel signs (kombu etc.) */ +stringdef vs_aa '{U+0BBE}' +stringdef vs_i '{U+0BBF}' +stringdef vs_ii '{U+0BC0}' +stringdef vs_u '{U+0BC1}' +stringdef vs_uu '{U+0BC2}' +stringdef vs_e '{U+0BC6}' +stringdef vs_ee '{U+0BC7}' +stringdef vs_ai '{U+0BC8}' +stringdef vs_o '{U+0BCA}' +stringdef vs_oo '{U+0BCB}' +stringdef vs_au '{U+0BCC}' + +/* Pulli */ +stringdef pulli '{U+0BCD}' + +/* AU length mark */ +stringdef au_lmark '{U+0BD7}' + + +routines ( + remove_plural_suffix + remove_question_suffixes + remove_question_prefixes + remove_pronoun_prefixes + remove_command_suffixes + remove_um + remove_vetrumai_urupukal + fix_va_start + fix_ending + fix_endings + remove_tense_suffix + remove_tense_suffixes + remove_common_word_endings + has_min_length +) + +externals ( stem ) + +booleans ( + found_a_match + found_vetrumai_urupu +) + +define has_min_length as ( /* succeeds only if the current string is longer than 4 characters */ + $(len > 4) +) + +define fix_va_start as ( /* at the cursor, replace va + vowel sign (oo, o, u, uu) with the matching independent vowel; the first alternative that matches wins */ + (try '{va}{vs_oo}' and [ '{va}{vs_oo}' ] <- '{oo}' ) or + (try '{va}{vs_o}' and [ '{va}{vs_o}' ] <- '{o}' ) or + (try '{va}{vs_u}' and [ '{va}{vs_u}' ] <- '{u}' ) or + (try '{va}{vs_uu}' and [ '{va}{vs_uu}' ] <- '{uu}' ) +) + 
+define fix_endings as ( + do repeat fix_ending +) + +define remove_question_prefixes as ( + [ ('{e}' ) among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete + do fix_va_start +) + +// Gives signal t if an ending was fixed, signal f otherwise. +define fix_ending as ( + $(len > 3) + backwards ( + ( [among('{na}{pulli}' '{na}{pulli}{ta}' '{na}{pulli}{ta}{pulli}') ] delete ) + or + ( ['{ya}{pulli}' test among('{vs_ai}' '{vs_i}' '{vs_ii}') ] delete ) + or + ( [ '{tta}{pulli}{pa}{pulli}' or '{tta}{pulli}{ka}{pulli}' ] <- '{lla}{pulli}' ) + or + ( [ '{nnna}{pulli}{rra}{pulli}' ] <- '{la}{pulli}' ) + or +// ( [ '{rra}{pulli}{ka}{pulli}' or '{nnna}{pulli}{nnna}{pulli}' ] <- '{la}{pulli}' ) + ( [ '{rra}{pulli}{ka}{pulli}' ] <- '{la}{pulli}' ) + or + ( [ '{tta}{pulli}{tta}{pulli}' ] <- '{tta}{vs_u}' ) + or + ( found_vetrumai_urupu [ '{ta}{pulli}{ta}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' ] ) + or + ( [ '{vs_u}{ka}{pulli}' or '{vs_u}{ka}{pulli}{ka}{pulli}' ] <- '{pulli}' ) + or + ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete ) + or + ( [ '{vs_u}{ka}{pulli}' ] <- '{pulli}' ) + or + ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete ) + or + ( [ '{pulli}' (among('{ya}' '{ra}' '{la}' '{va}' '{zha}' '{lla}') or among('{nga}' '{nya}' '{nna}' '{na}' '{ma}' '{nnna}')) '{pulli}' ] <- '{pulli}' ) + or + ( [ among('{va}' '{ya}' '{va}{pulli}') ] delete ) + or + ( [ '{nnna}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')) ] delete ) + or + ( [ '{nga}{pulli}' (test not '{vs_ai}')] <- '{ma}{pulli}' ) + or + ( [ '{nga}{pulli}' ] delete ) + or + ( [ '{pulli}' (test (among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') or '{pulli}')) ] delete ) + ) +) + +define remove_pronoun_prefixes as ( + unset found_a_match + [ among('{a}' '{i}' '{u}') 
among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete + (set found_a_match) + do fix_va_start +) + +define remove_plural_suffix as ( + unset found_a_match + backwards ( + ( [ '{vs_u}{nga}{pulli}{ka}{lla}{pulli}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}')) ] <- '{pulli}' ) or + ( [ '{rra}{pulli}{ka}{lla}{pulli}' ] <- '{la}{pulli}' ) or + ( [ '{tta}{pulli}{ka}{lla}{pulli}' ] <- '{lla}{pulli}' ) or + ( [ '{ka}{lla}{pulli}' ] delete ) + (set found_a_match) + ) +) + +define remove_question_suffixes as ( + has_min_length + unset found_a_match + backwards ( + do ( + [ among('{vs_oo}' '{vs_ee}' '{vs_aa}') ] <- '{pulli}' + (set found_a_match) + ) + ) + do fix_endings +) + +define remove_command_suffixes as ( + has_min_length + unset found_a_match + backwards ( + [ among('{pa}{vs_i}' '{va}{vs_i}') ] delete + (set found_a_match) + ) +) + +define remove_um as ( + unset found_a_match + has_min_length + backwards ( [ '{vs_u}{ma}{pulli}' ] <- '{pulli}' + (set found_a_match) + ) + do fix_ending +) + +define remove_common_word_endings as ( + // These are not suffixes actually but are + // some words that are attached to other words + // but can be removed for stemming + unset found_a_match + has_min_length + backwards ( + test ( [ '{vs_u}{tta}{nnna}{pulli}' or + '{vs_i}{la}{pulli}{la}{vs_ai}' or + '{vs_i}{tta}{ma}{pulli}' or + '{vs_i}{nnna}{pulli}{rra}{vs_i}' or + '{vs_aa}{ka}{vs_i}' or + '{vs_aa}{ka}{vs_i}{ya}' or + '{vs_e}{nnna}{pulli}{rra}{vs_u}' or + '{vs_u}{lla}{pulli}{lla}' or + '{vs_u}{tta}{vs_ai}{ya}' or + '{vs_u}{tta}{vs_ai}' or + '{vs_e}{nnna}{vs_u}{ma}{pulli}' or + ('{la}{pulli}{la}' test (not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or + '{vs_e}{nnna}' or + '{vs_aa}{ka}{vs_i}' ] <- '{pulli}' + (set found_a_match) + ) + or + test ( [ among('{pa}{tta}{vs_u}' + '{pa}{tta}{pulli}{tta}' + '{pa}{tta}{pulli}{tta}{vs_u}' + '{pa}{tta}{pulli}{tta}{ta}{vs_u}' + 
'{pa}{tta}{pulli}{tta}{nna}' + '{ka}{vs_u}{ra}{vs_i}{ya}' + '{pa}{rra}{pulli}{rra}{vs_i}' + '{va}{vs_i}{tta}{vs_u}' + '{va}{vs_i}{tta}{pulli}{tta}{vs_u}' + '{pa}{tta}{vs_i}{ta}{vs_aa}{nnna}' + '{pa}{tta}{vs_i}' + '{ta}{vs_aa}{nnna}' + '{vs_e}{la}{pulli}{la}{vs_aa}{ma}{pulli}') + ] delete + (set found_a_match) + ) + ) + do fix_endings +) + +define remove_vetrumai_urupukal as ( + unset found_a_match + unset found_vetrumai_urupu + has_min_length + backwards ( + ( + test ( ['{nnna}{vs_ai}'] delete ) + or + test ([ ( '{vs_i}{nnna}{vs_ai}' or + '{vs_ai}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}'))) or + ( '{vs_ai}' (test (among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}'))) + ] <- '{pulli}' + ) + or + test ( [ + '{vs_o}{tta}{vs_u}' or + '{vs_oo}{tta}{vs_u}' or + '{vs_i}{la}{pulli}' or + '{vs_i}{rra}{pulli}' or + ('{vs_i}{nnna}{pulli}' (test not '{ma}')) or + '{vs_i}{nnna}{pulli}{rra}{vs_u}' or + '{vs_i}{ra}{vs_u}{na}{pulli}{ta}{vs_u}' or + '{va}{vs_i}{tta}' or + ($(len >= 7) '{vs_i}{tta}{ma}{pulli}') or + '{vs_aa}{la}{pulli}' or + '{vs_u}{tta}{vs_ai}' or + '{vs_aa}{ma}{la}{pulli}' or + ('{la}{pulli}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or + '{vs_u}{lla}{pulli}' + ] <- '{pulli}' + ) + or + test ( [ + '{ka}{nna}{pulli}' or + '{ma}{vs_u}{nnna}{pulli}' or + '{ma}{vs_ee}{la}{pulli}' or + '{ma}{vs_ee}{rra}{pulli}' or + '{ka}{vs_ii}{llla}{pulli}' or + '{pa}{vs_i}{nnna}{pulli}' or + ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) + ] delete + ) + or + test ([ '{vs_ii}' ] <- '{vs_i}') + ) + (set found_a_match) + (set found_vetrumai_urupu) + do ( [ '{vs_i}{nnna}{pulli}' ] <- '{pulli}' ) + ) + do fix_endings +) + +define remove_tense_suffixes as ( + set found_a_match + repeat ( found_a_match (do remove_tense_suffix) ) +) + +define remove_tense_suffix as ( + unset found_a_match + has_min_length + backwards ( + do ( + test ( [among( 
+ '{ka}{vs_o}{nna}{pulli}{tta}{vs_i}{ra}{pulli}' + '{pa}{tta}{vs_u}' + )] delete + (set found_a_match) + ) + or + test ( [ + '{ma}{vs_aa}{ra}{pulli}' or + '{ma}{vs_i}{nnna}{pulli}' or + '{nnna}{nnna}{pulli}' or + '{nnna}{vs_aa}{nnna}{pulli}' or + '{nnna}{vs_aa}{lla}{pulli}' or + '{nnna}{vs_aa}{ra}{pulli}' or + ('{va}{nnna}{pulli}' test (not among('{a}' '{aa}' '{i}' '{ii}' '{u}' '{uu}' '{e}' '{ee}' '{ai}' '{o}' '{oo}' '{au}')) ) or + '{nnna}{lla}{pulli}' or + '{va}{lla}{pulli}' or + '{nnna}{ra}{pulli}' or + '{va}{ra}{pulli}' or + '{nnna}' or '{pa}' or '{ka}' or '{ta}' or '{ya}' or + '{pa}{nnna}{pulli}' or + '{pa}{lla}{pulli}' or + '{pa}{ra}{pulli}' or + ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or + '{vs_i}{rra}{pulli}{rra}{vs_u}' or + '{pa}{ma}{pulli}' or + '{nnna}{ma}{pulli}' or + '{ta}{vs_u}{ma}{pulli}' or + '{rra}{vs_u}{ma}{pulli}' or + '{ka}{vs_u}{ma}{pulli}' or + '{nnna}{vs_e}{nnna}{pulli}' or + '{nnna}{vs_ai}' or + '{va}{vs_ai}' + ] delete + (set found_a_match) + ) + or + test ( [ + ('{vs_aa}{nnna}{pulli}' test (not '{ca}')) or + '{vs_aa}{lla}{pulli}' or + '{vs_aa}{ra}{pulli}' or + '{vs_ee}{nnna}{pulli}' or + '{vs_aa}' or + '{vs_aa}{ma}{pulli}' or + '{vs_e}{ma}{pulli}' or + '{vs_ee}{ma}{pulli}' or + '{vs_oo}{ma}{pulli}' or + '{ka}{vs_u}{ma}{pulli}' or + '{ta}{vs_u}{ma}{pulli}' or + '{tta}{vs_u}{ma}{pulli}' or + '{rra}{vs_u}{ma}{pulli}' or + '{vs_aa}{ya}{pulli}' or + '{nnna}{vs_e}{nnna}{pulli}' or + '{nnna}{vs_i}{ra}{pulli}' or + '{vs_ii}{ra}{pulli}' or + '{vs_ii}{ya}{ra}{pulli}' + ] <- '{pulli}' + (set found_a_match) + ) + or + test ( ([ '{ka}{vs_u}' or '{ta}{vs_u}' ) (test '{pulli}') ] delete + (set found_a_match) + ) + ) + do ([among( + '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}' + '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}{pulli}' + '{ka}{vs_i}{nnna}{pulli}{rra}' + '{ka}{vs_i}{nnna}{pulli}{rra}{pulli}' + '{ka}{vs_i}{rra}' + '{ka}{vs_i}{rra}{pulli}' + )] delete + (set found_a_match) + ) + ) + do 
fix_endings +) + +define stem as ( /* entry point: one fix_ending pass, then has_min_length must hold before the prefix/suffix removal routines below are tried in the order listed */ + unset found_vetrumai_urupu + do fix_ending + has_min_length + do remove_question_prefixes + do remove_pronoun_prefixes + do remove_question_suffixes + do remove_um + do remove_common_word_endings + do remove_vetrumai_urupukal + do remove_plural_suffix + do remove_command_suffixes + do remove_tense_suffixes +) diff --git a/contrib/snowball/algorithms/turkish.sbl b/contrib/snowball/algorithms/turkish.sbl new file mode 100644 index 0000000..eadd61d --- /dev/null +++ b/contrib/snowball/algorithms/turkish.sbl @@ -0,0 +1,470 @@ +/* Stemmer for Turkish + * author: Evren (Kapusuz) Çilden + * email: evren.kapusuz at gmail.com + * version: 1.0 (15.01.2007) + + + * stems nominal verb suffixes + * stems nominal inflections + * more than one syllable word check + * (y,n,s,U) context check + * vowel harmony check + * last consonant check and conversion (b, c, d, ğ to p, ç, t, k) + + * The stemming algorithm is based on the paper "An Affix Stripping + * Morphological Analyzer for Turkish" by Gülşen Eryiğit and + * Eşref Adalı (Proceedings of the IASTED International Conference + * ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18, 2004, + * Innsbruck, Austria). + + * Turkish is an agglutinative language and has a very rich morphological + * structure. In Turkish, you can form many different words from a single stem + * by appending a sequence of suffixes. E.g. the word "doktoruymuşsunuz" means + * "You had been the doctor of him". The stem of the word is "doktor" and it + * takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about + * the append order of suffixes can be clearly described as FSMs. + * The paper referenced above defines some FSMs for right to left + * morphological analysis. I generated a method for constructing snowball + * expressions from right to left FSMs for stemming suffixes. 
+*/ + +routines ( + append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings + check_vowel_harmony // tests vowel harmony for suffixes + is_reserved_word // tests whether current string is a reserved word ('ad','soyad') + mark_cAsInA // nominal verb suffix + mark_DA // noun suffix + mark_DAn // noun suffix + mark_DUr // nominal verb suffix + mark_ki // noun suffix + mark_lAr // noun suffix, nominal verb suffix + mark_lArI // noun suffix + mark_nA // noun suffix + mark_ncA // noun suffix + mark_ndA // noun suffix + mark_ndAn // noun suffix + mark_nU // noun suffix + mark_nUn // noun suffix + mark_nUz // nominal verb suffix + mark_sU // noun suffix + mark_sUn // nominal verb suffix + mark_sUnUz // nominal verb suffix + mark_possessives // -(U)m,-(U)n,-(U)mUz,-(U)nUz, + mark_yA // noun suffix + mark_ylA // noun suffix + mark_yU // noun suffix + mark_yUm // nominal verb suffix + mark_yUz // nominal verb suffix + mark_yDU // nominal verb suffix + mark_yken // nominal verb suffix + mark_ymUs_ // nominal verb suffix + mark_ysA // nominal verb suffix + + mark_suffix_with_optional_y_consonant + mark_suffix_with_optional_U_vowel + mark_suffix_with_optional_n_consonant + mark_suffix_with_optional_s_consonant + + more_than_one_syllable_word + + post_process_last_consonants + postlude + + stem_nominal_verb_suffixes + stem_noun_suffixes + stem_suffix_chain_before_ki +) + +stringescapes { } + +/* Special characters in Unicode Latin-1 and Latin Extended-A */ +stringdef c, '{U+00E7}' // LATIN SMALL LETTER C WITH CEDILLA +stringdef g~ '{U+011F}' // LATIN SMALL LETTER G WITH BREVE +stringdef i' '{U+0131}' // LATIN SMALL LETTER I WITHOUT DOT +stringdef o" '{U+00F6}' // LATIN SMALL LETTER O WITH DIAERESIS +stringdef s, '{U+015F}' // LATIN SMALL LETTER S WITH CEDILLA +stringdef u" '{U+00FC}' // LATIN SMALL LETTER U WITH DIAERESIS + +booleans ( continue_stemming_noun_suffixes ) + +groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6) + +define vowel 
'ae{i'}io{o"}u{u"}' +define U '{i'}iu{u"}' + +// the vowel grouping definitions below are used for checking vowel harmony +define vowel1 'a{i'}ou' // vowels that can end with suffixes containing 'a' +define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e' +define vowel3 'a{i'}' // vowels that can end with suffixes containing 'i'' +define vowel4 'ei' // vowels that can end with suffixes containing 'i' +define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u' +define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing 'o"' or 'u"' + +externals ( stem ) + +backwardmode ( + // checks vowel harmony for possible suffixes, + // helps to detect whether the candidate for suffix applies to vowel harmony + // this rule is added to prevent over stemming + define check_vowel_harmony as ( + test + ( + (goto vowel) // if there is a vowel + ( + ('a' goto vowel1) or + ('e' goto vowel2) or + ('{i'}' goto vowel3) or + ('i' goto vowel4) or + ('o' goto vowel5) or + ('{o"}' goto vowel6) or + ('u' goto vowel5) or + ('{u"}' goto vowel6) + ) + ) + ) + + // if the last consonant before suffix is vowel and n then advance and delete + // if the last consonant before suffix is non vowel and n do nothing + // if the last consonant before suffix is not n then only delete the suffix + // assumption: slice beginning is set correctly + define mark_suffix_with_optional_n_consonant as ( + ('n' (test vowel)) + or + ((not(test 'n')) test(next vowel)) + + ) + + // if the last consonant before suffix is vowel and s then advance and delete + // if the last consonant before suffix is non vowel and s do nothing + // if the last consonant before suffix is not s then only delete the suffix + // assumption: slice beginning is set correctly + define mark_suffix_with_optional_s_consonant as ( + ('s' (test vowel)) + or + ((not(test 's')) test(next vowel)) + ) + + // if the last consonant before suffix is vowel and y then advance and delete + // if the last 
consonant before suffix is non vowel and y do nothing + // if the last consonant before suffix is not y then only delete the suffix + // assumption: slice beginning is set correctly + define mark_suffix_with_optional_y_consonant as ( + ('y' (test vowel)) + or + ((not(test 'y')) test(next vowel)) + ) + + define mark_suffix_with_optional_U_vowel as ( + (U (test non-vowel)) + or + ((not(test U)) test(next non-vowel)) + + ) + + define mark_possessives as ( + among ('m{i'}z' 'miz' 'muz' 'm{u"}z' + 'n{i'}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n') + (mark_suffix_with_optional_U_vowel) + ) + + define mark_sU as ( + check_vowel_harmony + U + (mark_suffix_with_optional_s_consonant) + ) + + define mark_lArI as ( + among ('leri' 'lar{i'}') + ) + + define mark_yU as ( + check_vowel_harmony + U + (mark_suffix_with_optional_y_consonant) + ) + + define mark_nU as ( + check_vowel_harmony + among ('n{i'}' 'ni' 'nu' 'n{u"}') + ) + + define mark_nUn as ( + check_vowel_harmony + among ('{i'}n' 'in' 'un' '{u"}n') + (mark_suffix_with_optional_n_consonant) + ) + + define mark_yA as ( + check_vowel_harmony + among('a' 'e') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_nA as ( + check_vowel_harmony + among('na' 'ne') + ) + + define mark_DA as ( + check_vowel_harmony + among('da' 'de' 'ta' 'te') + ) + + define mark_ndA as ( + check_vowel_harmony + among('nda' 'nde') + ) + + define mark_DAn as ( + check_vowel_harmony + among('dan' 'den' 'tan' 'ten') + ) + + define mark_ndAn as ( + check_vowel_harmony + among('ndan' 'nden') + ) + + define mark_ylA as ( + check_vowel_harmony + among('la' 'le') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_ki as ( + 'ki' + ) + + define mark_ncA as ( + check_vowel_harmony + among('ca' 'ce') + (mark_suffix_with_optional_n_consonant) + ) + + define mark_yUm as ( + check_vowel_harmony + among ('{i'}m' 'im' 'um' '{u"}m') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_sUn as ( + check_vowel_harmony + among ('s{i'}n' 'sin' 'sun' 
's{u"}n' ) + ) + + define mark_yUz as ( + check_vowel_harmony + among ('{i'}z' 'iz' 'uz' '{u"}z') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_sUnUz as ( + among ('s{i'}n{i'}z' 'siniz' 'sunuz' 's{u"}n{u"}z') + ) + + define mark_lAr as ( + check_vowel_harmony + among ('ler' 'lar') + ) + + define mark_nUz as ( + check_vowel_harmony + among ('n{i'}z' 'niz' 'nuz' 'n{u"}z') + ) + + define mark_DUr as ( + check_vowel_harmony + among ('t{i'}r' 'tir' 'tur' 't{u"}r' 'd{i'}r' 'dir' 'dur' 'd{u"}r') + ) + + define mark_cAsInA as ( + among ('cas{i'}na' 'cesine') + ) + + define mark_yDU as ( + check_vowel_harmony + among ('t{i'}m' 'tim' 'tum' 't{u"}m' 'd{i'}m' 'dim' 'dum' 'd{u"}m' + 't{i'}n' 'tin' 'tun' 't{u"}n' 'd{i'}n' 'din' 'dun' 'd{u"}n' + 't{i'}k' 'tik' 'tuk' 't{u"}k' 'd{i'}k' 'dik' 'duk' 'd{u"}k' + 't{i'}' 'ti' 'tu' 't{u"}' 'd{i'}' 'di' 'du' 'd{u"}') + (mark_suffix_with_optional_y_consonant) + ) + + // does not fully obey vowel harmony + define mark_ysA as ( + among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_ymUs_ as ( + check_vowel_harmony + among ('m{i'}{s,}' 'mi{s,}' 'mu{s,}' 'm{u"}{s,}') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_yken as ( + 'ken' (mark_suffix_with_optional_y_consonant) + ) + + define stem_nominal_verb_suffixes as ( + [ + set continue_stemming_noun_suffixes + (mark_ymUs_ or mark_yDU or mark_ysA or mark_yken) + or + (mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_) + or + ( + mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_)) + unset continue_stemming_noun_suffixes + ) + or + (mark_nUz (mark_yDU or mark_ysA)) + or + ((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_)) + or + (mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)) + ]delete + ) + + // stems noun suffix chains ending with -ki + define 
stem_suffix_chain_before_ki as ( + [ + mark_ki + ( + (mark_DA] delete try([ + (mark_lAr] delete try(stem_suffix_chain_before_ki)) + or + (mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + + )) + or + (mark_nUn] delete try([ + (mark_lArI] delete) + or + ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + (stem_suffix_chain_before_ki) + )) + or + (mark_ndA ( + (mark_lArI] delete) + or + ((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki))) + or + (stem_suffix_chain_before_ki) + )) + ) + ) + + define stem_noun_suffixes as ( + ([mark_lAr] delete try(stem_suffix_chain_before_ki)) + or + ([mark_ncA] delete + try( + ([mark_lArI] delete) + or + ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + ([mark_lAr] delete stem_suffix_chain_before_ki) + ) + ) + or + ([(mark_ndA or mark_nA) + ( + (mark_lArI] delete) + or + (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + (stem_suffix_chain_before_ki) + ) + ) + or + ([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI))) + or + ( [mark_DAn] delete try ([ + ( + (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + (mark_lAr] delete try(stem_suffix_chain_before_ki)) + or + (stem_suffix_chain_before_ki) + )) + ) + or + ([mark_nUn or mark_ylA] delete + try( + ([mark_lAr] delete stem_suffix_chain_before_ki) + or + ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + stem_suffix_chain_before_ki + ) + ) + or + ([mark_lArI] delete) + or + (stem_suffix_chain_before_ki) + or + ([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki)) + or + ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + ) + + define post_process_last_consonants 
as ( + [substring] among ( + 'b' (<- 'p') + 'c' (<- '{c,}') + 'd' (<- 't') + '{g~}' (<- 'k') + ) + ) + + // after stemming if the word ends with 'd' or 'g' most probably last U is overstemmed + // like in 'kedim' -> 'ked' + // Turkish words don't usually end with 'd' or 'g' + // some very well known words are ignored (like 'ad' and 'soyad') + // appends U to stems ending with d or g, decides which vowel to add + // based on the last vowel in the stem + define append_U_to_stems_ending_with_d_or_g as ( + test('d' or 'g') + (test((goto vowel) 'a' or '{i'}') <+ '{i'}') + or + (test((goto vowel) 'e' or 'i') <+ 'i') + or + (test((goto vowel) 'o' or 'u') <+ 'u') + or + (test((goto vowel) '{o"}' or '{u"}') <+ '{u"}') + ) + + define is_reserved_word as ( /* backward mode: succeeds when the whole word is 'ad' or 'soyad' */ + 'ad' try 'soy' atlimit + ) +) + +// Tests whether the word has more than one syllable +// In Turkish each vowel indicates a distinct syllable +define more_than_one_syllable_word as ( + test (atleast 2 (gopast vowel)) +) + +define postlude as ( + backwards ( + not(is_reserved_word) + do append_U_to_stems_ending_with_d_or_g + do post_process_last_consonants + + ) +) + +define stem as ( /* entry point: only words containing at least two vowels are stemmed */ + (more_than_one_syllable_word) + ( + backwards ( + do stem_nominal_verb_suffixes + continue_stemming_noun_suffixes + do stem_noun_suffixes + ) + + postlude + ) +) diff --git a/contrib/snowball/charsets/ISO-8859-2.sbl b/contrib/snowball/charsets/ISO-8859-2.sbl new file mode 100644 index 0000000..5829ea8 --- /dev/null +++ b/contrib/snowball/charsets/ISO-8859-2.sbl @@ -0,0 +1,98 @@ +// ISO-8859-2 character mappings. 
+ +stringdef U+00A0 hex 'A0' +stringdef U+0104 hex 'A1' +stringdef U+02D8 hex 'A2' +stringdef U+0141 hex 'A3' +stringdef U+00A4 hex 'A4' +stringdef U+013D hex 'A5' +stringdef U+015A hex 'A6' +stringdef U+00A7 hex 'A7' +stringdef U+00A8 hex 'A8' +stringdef U+0160 hex 'A9' +stringdef U+015E hex 'AA' +stringdef U+0164 hex 'AB' +stringdef U+0179 hex 'AC' +stringdef U+00AD hex 'AD' +stringdef U+017D hex 'AE' +stringdef U+017B hex 'AF' +stringdef U+00B0 hex 'B0' +stringdef U+0105 hex 'B1' +stringdef U+02DB hex 'B2' +stringdef U+0142 hex 'B3' +stringdef U+00B4 hex 'B4' +stringdef U+013E hex 'B5' +stringdef U+015B hex 'B6' +stringdef U+02C7 hex 'B7' +stringdef U+00B8 hex 'B8' +stringdef U+0161 hex 'B9' +stringdef U+015F hex 'BA' +stringdef U+0165 hex 'BB' +stringdef U+017A hex 'BC' +stringdef U+02DD hex 'BD' +stringdef U+017E hex 'BE' +stringdef U+017C hex 'BF' +stringdef U+0154 hex 'C0' +stringdef U+00C1 hex 'C1' +stringdef U+00C2 hex 'C2' +stringdef U+0102 hex 'C3' +stringdef U+00C4 hex 'C4' +stringdef U+0139 hex 'C5' +stringdef U+0106 hex 'C6' +stringdef U+00C7 hex 'C7' +stringdef U+010C hex 'C8' +stringdef U+00C9 hex 'C9' +stringdef U+0118 hex 'CA' +stringdef U+00CB hex 'CB' +stringdef U+011A hex 'CC' +stringdef U+00CD hex 'CD' +stringdef U+00CE hex 'CE' +stringdef U+010E hex 'CF' +stringdef U+0110 hex 'D0' +stringdef U+0143 hex 'D1' +stringdef U+0147 hex 'D2' +stringdef U+00D3 hex 'D3' +stringdef U+00D4 hex 'D4' +stringdef U+0150 hex 'D5' +stringdef U+00D6 hex 'D6' +stringdef U+00D7 hex 'D7' +stringdef U+0158 hex 'D8' +stringdef U+016E hex 'D9' +stringdef U+00DA hex 'DA' +stringdef U+0170 hex 'DB' +stringdef U+00DC hex 'DC' +stringdef U+00DD hex 'DD' +stringdef U+0162 hex 'DE' +stringdef U+00DF hex 'DF' +stringdef U+0155 hex 'E0' +stringdef U+00E1 hex 'E1' +stringdef U+00E2 hex 'E2' +stringdef U+0103 hex 'E3' +stringdef U+00E4 hex 'E4' +stringdef U+013A hex 'E5' +stringdef U+0107 hex 'E6' +stringdef U+00E7 hex 'E7' +stringdef U+010D hex 'E8' +stringdef U+00E9 hex 'E9' 
+stringdef U+0119 hex 'EA' +stringdef U+00EB hex 'EB' +stringdef U+011B hex 'EC' +stringdef U+00ED hex 'ED' +stringdef U+00EE hex 'EE' +stringdef U+010F hex 'EF' +stringdef U+0111 hex 'F0' +stringdef U+0144 hex 'F1' +stringdef U+0148 hex 'F2' +stringdef U+00F3 hex 'F3' +stringdef U+00F4 hex 'F4' +stringdef U+0151 hex 'F5' +stringdef U+00F6 hex 'F6' +stringdef U+00F7 hex 'F7' +stringdef U+0159 hex 'F8' +stringdef U+016F hex 'F9' +stringdef U+00FA hex 'FA' +stringdef U+0171 hex 'FB' +stringdef U+00FC hex 'FC' +stringdef U+00FD hex 'FD' +stringdef U+0163 hex 'FE' +stringdef U+02D9 hex 'FF' diff --git a/contrib/snowball/charsets/KOI8-R.sbl b/contrib/snowball/charsets/KOI8-R.sbl new file mode 100644 index 0000000..46854e8 --- /dev/null +++ b/contrib/snowball/charsets/KOI8-R.sbl @@ -0,0 +1,74 @@ +// KOI8-R character mappings. + +stringdef U+00A0 hex '9A' +stringdef U+00A9 hex 'BF' +stringdef U+00B0 hex '9C' +stringdef U+00B2 hex '9D' +stringdef U+00B7 hex '9E' +stringdef U+00F7 hex '9F' +stringdef U+0401 hex 'B3' +stringdef U+0410 hex 'E1' +stringdef U+0411 hex 'E2' +stringdef U+0412 hex 'F7' +stringdef U+0413 hex 'E7' +stringdef U+0414 hex 'E4' +stringdef U+0415 hex 'E5' +stringdef U+0416 hex 'F6' +stringdef U+0417 hex 'FA' +stringdef U+0418 hex 'E9' +stringdef U+0419 hex 'EA' +stringdef U+041A hex 'EB' +stringdef U+041B hex 'EC' +stringdef U+041C hex 'ED' +stringdef U+041D hex 'EE' +stringdef U+041E hex 'EF' +stringdef U+041F hex 'F0' +stringdef U+0420 hex 'F2' +stringdef U+0421 hex 'F3' +stringdef U+0422 hex 'F4' +stringdef U+0423 hex 'F5' +stringdef U+0424 hex 'E6' +stringdef U+0425 hex 'E8' +stringdef U+0426 hex 'E3' +stringdef U+0427 hex 'FE' +stringdef U+0428 hex 'FB' +stringdef U+0429 hex 'FD' +stringdef U+042A hex 'FF' +stringdef U+042B hex 'F9' +stringdef U+042C hex 'F8' +stringdef U+042D hex 'FC' +stringdef U+042E hex 'E0' +stringdef U+042F hex 'F1' +stringdef U+0430 hex 'C1' +stringdef U+0431 hex 'C2' +stringdef U+0432 hex 'D7' +stringdef U+0433 hex 'C7' 
+stringdef U+0434 hex 'C4' +stringdef U+0435 hex 'C5' +stringdef U+0436 hex 'D6' +stringdef U+0437 hex 'DA' +stringdef U+0438 hex 'C9' +stringdef U+0439 hex 'CA' +stringdef U+043A hex 'CB' +stringdef U+043B hex 'CC' +stringdef U+043C hex 'CD' +stringdef U+043D hex 'CE' +stringdef U+043E hex 'CF' +stringdef U+043F hex 'D0' +stringdef U+0440 hex 'D2' +stringdef U+0441 hex 'D3' +stringdef U+0442 hex 'D4' +stringdef U+0443 hex 'D5' +stringdef U+0444 hex 'C6' +stringdef U+0445 hex 'C8' +stringdef U+0446 hex 'C3' +stringdef U+0447 hex 'DE' +stringdef U+0448 hex 'DB' +stringdef U+0449 hex 'DD' +stringdef U+044A hex 'DF' +stringdef U+044B hex 'D9' +stringdef U+044C hex 'D8' +stringdef U+044D hex 'DC' +stringdef U+044E hex 'C0' +stringdef U+044F hex 'D1' +stringdef U+0451 hex 'A3' diff --git a/contrib/snowball/charsets/cp850.sbl b/contrib/snowball/charsets/cp850.sbl new file mode 100644 index 0000000..b780220 --- /dev/null +++ b/contrib/snowball/charsets/cp850.sbl @@ -0,0 +1,130 @@ +// Code page 850 (MSDOS Latin 1) character mappings. 
+ +stringdef U+00A0 hex 'FF' +stringdef U+00A1 hex 'AD' +stringdef U+00A2 hex 'BD' +stringdef U+00A3 hex '9C' +stringdef U+00A4 hex 'CF' +stringdef U+00A5 hex 'BE' +stringdef U+00A6 hex 'DD' +stringdef U+00A7 hex 'F5' +stringdef U+00A8 hex 'F9' +stringdef U+00A9 hex 'B8' +stringdef U+00AA hex 'A6' +stringdef U+00AB hex 'AE' +stringdef U+00AC hex 'AA' +stringdef U+00AD hex 'F0' +stringdef U+00AE hex 'A9' +stringdef U+00AF hex 'EE' +stringdef U+00B0 hex 'F8' +stringdef U+00B1 hex 'F1' +stringdef U+00B2 hex 'FD' +stringdef U+00B3 hex 'FC' +stringdef U+00B4 hex 'EF' +stringdef U+00B5 hex 'E6' +stringdef U+00B6 hex 'F4' +stringdef U+00B7 hex 'FA' +stringdef U+00B8 hex 'F7' +stringdef U+00B9 hex 'FB' +stringdef U+00BA hex 'A7' +stringdef U+00BB hex 'AF' +stringdef U+00BC hex 'AC' +stringdef U+00BD hex 'AB' +stringdef U+00BE hex 'F3' +stringdef U+00BF hex 'A8' +stringdef U+00C0 hex 'B7' +stringdef U+00C1 hex 'B5' +stringdef U+00C2 hex 'B6' +stringdef U+00C3 hex 'C7' +stringdef U+00C4 hex '8E' +stringdef U+00C5 hex '8F' +stringdef U+00C6 hex '92' +stringdef U+00C7 hex '80' +stringdef U+00C8 hex 'D4' +stringdef U+00C9 hex '90' +stringdef U+00CA hex 'D2' +stringdef U+00CB hex 'D3' +stringdef U+00CC hex 'DE' +stringdef U+00CD hex 'D6' +stringdef U+00CE hex 'D7' +stringdef U+00CF hex 'D8' +stringdef U+00D0 hex 'D1' +stringdef U+00D1 hex 'A5' +stringdef U+00D2 hex 'E3' +stringdef U+00D3 hex 'E0' +stringdef U+00D4 hex 'E2' +stringdef U+00D5 hex 'E5' +stringdef U+00D6 hex '99' +stringdef U+00D7 hex '9E' +stringdef U+00D8 hex '9D' +stringdef U+00D9 hex 'EB' +stringdef U+00DA hex 'E9' +stringdef U+00DB hex 'EA' +stringdef U+00DC hex '9A' +stringdef U+00DD hex 'ED' +stringdef U+00DE hex 'E8' +stringdef U+00DF hex 'E1' +stringdef U+00E0 hex '85' +stringdef U+00E1 hex 'A0' +stringdef U+00E2 hex '83' +stringdef U+00E3 hex 'C6' +stringdef U+00E4 hex '84' +stringdef U+00E5 hex '86' +stringdef U+00E6 hex '91' +stringdef U+00E7 hex '87' +stringdef U+00E8 hex '8A' +stringdef U+00E9 hex '82' 
+stringdef U+00EA hex '88' +stringdef U+00EB hex '89' +stringdef U+00EC hex '8D' +stringdef U+00ED hex 'A1' +stringdef U+00EE hex '8C' +stringdef U+00EF hex '8B' +stringdef U+00F0 hex 'D0' +stringdef U+00F1 hex 'A4' +stringdef U+00F2 hex '95' +stringdef U+00F3 hex 'A2' +stringdef U+00F4 hex '93' +stringdef U+00F5 hex 'E4' +stringdef U+00F6 hex '94' +stringdef U+00F7 hex 'F6' +stringdef U+00F8 hex '9B' +stringdef U+00F9 hex '97' +stringdef U+00FA hex 'A3' +stringdef U+00FB hex '96' +stringdef U+00FC hex '81' +stringdef U+00FD hex 'EC' +stringdef U+00FE hex 'E7' +stringdef U+00FF hex '98' +stringdef U+0131 hex 'D5' +stringdef U+0192 hex '9F' +stringdef U+2017 hex 'F2' +stringdef U+2500 hex 'C4' +stringdef U+2502 hex 'B3' +stringdef U+250C hex 'DA' +stringdef U+2510 hex 'BF' +stringdef U+2514 hex 'C0' +stringdef U+2518 hex 'D9' +stringdef U+251C hex 'C3' +stringdef U+2524 hex 'B4' +stringdef U+252C hex 'C2' +stringdef U+2534 hex 'C1' +stringdef U+253C hex 'C5' +stringdef U+2550 hex 'CD' +stringdef U+2551 hex 'BA' +stringdef U+2554 hex 'C9' +stringdef U+2557 hex 'BB' +stringdef U+255A hex 'C8' +stringdef U+255D hex 'BC' +stringdef U+2560 hex 'CC' +stringdef U+2563 hex 'B9' +stringdef U+2566 hex 'CB' +stringdef U+2569 hex 'CA' +stringdef U+256C hex 'CE' +stringdef U+2580 hex 'DF' +stringdef U+2584 hex 'DC' +stringdef U+2588 hex 'DB' +stringdef U+2591 hex 'B0' +stringdef U+2592 hex 'B1' +stringdef U+2593 hex 'B2' +stringdef U+25A0 hex 'FE' diff --git a/contrib/snowball/compiler/analyser.c b/contrib/snowball/compiler/analyser.c new file mode 100644 index 0000000..dffa555 --- /dev/null +++ b/contrib/snowball/compiler/analyser.c @@ -0,0 +1,1380 @@ + +#include <stdio.h> /* printf etc */ +#include <stdlib.h> /* exit */ +#include <string.h> /* memmove */ +#include "header.h" + +typedef enum { + e_token_omitted = 0, + e_unexpected_token = 1, + e_string_omitted = 2, + e_unexpected_token_in_among = 3, + /* For codes above here, report "after " t->previous_token after the error. 
*/ + e_unresolved_substring = 14, + e_not_allowed_inside_reverse = 15, + e_empty_grouping = 16, + e_already_backwards = 17, + e_empty_among = 18, + e_adjacent_bracketed_in_among = 19, + e_substring_preceded_by_substring = 20, + /* For codes below here, tokeniser->b is printed before the error. */ + e_redeclared = 30, + e_undeclared = 31, + e_declared_as_different_mode = 32, + e_not_of_type_x = 33, + e_not_of_type_string_or_integer = 34, + e_misplaced = 35, + e_redefined = 36, + e_misused = 37 +} error_code; + +/* recursive usage: */ + +static void read_program_(struct analyser * a, int terminator); +static struct node * read_C(struct analyser * a); +static struct node * C_style(struct analyser * a, const char * s, int token); + + +static void print_node_(struct node * p, int n, const char * s) { + + int i; + for (i = 0; i < n; i++) fputs(i == n - 1 ? s : " ", stdout); + printf("%s ", name_of_token(p->type)); + if (p->name) report_b(stdout, p->name->b); + if (p->literalstring) { + printf("'"); + report_b(stdout, p->literalstring); + printf("'"); + } + printf("\n"); + if (p->AE) print_node_(p->AE, n+1, "# "); + if (p->left) print_node_(p->left, n+1, " "); + if (p->aux) print_node_(p->aux, n+1, "@ "); + if (p->right) print_node_(p->right, n, " "); +} + +extern void print_program(struct analyser * a) { + print_node_(a->program, 0, " "); +} + +static struct node * new_node(struct analyser * a, int type) { + NEW(node, p); + p->next = a->nodes; a->nodes = p; + p->left = 0; + p->right = 0; + p->aux = 0; + p->AE = 0; + p->name = 0; + p->literalstring = 0; + p->mode = a->mode; + p->line_number = a->tokeniser->line_number; + p->type = type; + return p; +} + +static const char * name_of_mode(int n) { + switch (n) { + case m_backward: return "string backward"; + case m_forward: return "string forward"; + /* case m_integer: return "integer"; */ + } + fprintf(stderr, "Invalid mode %d in name_of_mode()\n", n); + exit(1); +} + +static const char * name_of_type(int n) { + switch (n) 
{ + case 's': return "string"; + case 'i': return "integer"; + case 'r': return "routine"; + case 'R': return "routine or grouping"; + case 'g': return "grouping"; + } + fprintf(stderr, "Invalid type %d in name_of_type()\n", n); + exit(1); +} + +static const char * name_of_name_type(int code) { + switch (code) { + case t_string: return "string"; + case t_boolean: return "boolean"; + case t_integer: return "integer"; + case t_routine: return "routine"; + case t_external: return "external"; + case t_grouping: return "grouping"; + } + fprintf(stderr, "Invalid type code %d in name_of_name_type()\n", code); + exit(1); +} + +static void count_error(struct analyser * a) { + struct tokeniser * t = a->tokeniser; + if (t->error_count >= 20) { fprintf(stderr, "... etc\n"); exit(1); } + t->error_count++; +} + +static void error2(struct analyser * a, error_code n, int x) { + struct tokeniser * t = a->tokeniser; + count_error(a); + fprintf(stderr, "%s:%d: ", t->file, t->line_number); + if ((int)n >= (int)e_redeclared) report_b(stderr, t->b); + switch (n) { + case e_token_omitted: + fprintf(stderr, "%s omitted", name_of_token(t->omission)); break; + case e_unexpected_token_in_among: + fprintf(stderr, "in among(...), "); + /* fall through */ + case e_unexpected_token: + fprintf(stderr, "unexpected %s", name_of_token(t->token)); + if (t->token == c_number) fprintf(stderr, " %d", t->number); + if (t->token == c_name) { + fprintf(stderr, " "); + report_b(stderr, t->b); + } break; + case e_string_omitted: + fprintf(stderr, "string omitted"); break; + + case e_unresolved_substring: + fprintf(stderr, "unresolved substring on line %d", x); break; + case e_not_allowed_inside_reverse: + fprintf(stderr, "%s not allowed inside reverse(...)", name_of_token(t->token)); break; + case e_empty_grouping: + fprintf(stderr, "empty grouping"); break; + case e_already_backwards: + fprintf(stderr, "backwards used when already in this mode"); break; + case e_empty_among: + fprintf(stderr, "empty 
among(...)"); break; + case e_adjacent_bracketed_in_among: + fprintf(stderr, "two adjacent bracketed expressions in among(...)"); break; + case e_substring_preceded_by_substring: + fprintf(stderr, "substring preceded by another substring on line %d", x); break; + + case e_redeclared: + fprintf(stderr, " re-declared"); break; + case e_undeclared: + fprintf(stderr, " undeclared"); break; + case e_declared_as_different_mode: + fprintf(stderr, " declared as %s mode; used as %s mode", + name_of_mode(a->mode), name_of_mode(x)); break; + case e_not_of_type_x: + fprintf(stderr, " not of type %s", name_of_type(x)); break; + case e_not_of_type_string_or_integer: + fprintf(stderr, " not of type string or integer"); break; + case e_misplaced: + fprintf(stderr, " misplaced"); break; + case e_redefined: + fprintf(stderr, " redefined"); break; + case e_misused: + fprintf(stderr, " mis-used as %s mode", + name_of_mode(x)); break; + } + if ((int)n < (int)e_unresolved_substring && t->previous_token > 0) + fprintf(stderr, " after %s", name_of_token(t->previous_token)); + fprintf(stderr, "\n"); +} + +static void error(struct analyser * a, error_code n) { error2(a, n, 0); } + +static void error4(struct analyser * a, struct name * q) { + count_error(a); + fprintf(stderr, "%s:%d: ", a->tokeniser->file, q->used->line_number); + report_b(stderr, q->b); + fprintf(stderr, " undefined\n"); +} + +static void omission_error(struct analyser * a, int n) { + a->tokeniser->omission = n; + error(a, e_token_omitted); +} + +static int check_token(struct analyser * a, int code) { + struct tokeniser * t = a->tokeniser; + if (t->token != code) { omission_error(a, code); return false; } + return true; +} + +static int get_token(struct analyser * a, int code) { + struct tokeniser * t = a->tokeniser; + read_token(t); + { + int x = check_token(a, code); + if (!x) t->token_held = true; + return x; + } +} + +static struct name * look_for_name(struct analyser * a) { + symbol * q = a->tokeniser->b; + struct name 
* p; + for (p = a->names; p; p = p->next) { + symbol * b = p->b; + int n = SIZE(b); + if (n == SIZE(q) && memcmp(q, b, n * sizeof(symbol)) == 0) { + p->referenced = true; + return p; + } + } + return 0; +} + +static struct name * find_name(struct analyser * a) { + struct name * p = look_for_name(a); + if (p == 0) error(a, e_undeclared); + return p; +} + +static void check_routine_mode(struct analyser * a, struct name * p, int mode) { + if (p->mode < 0) p->mode = mode; else + if (p->mode != mode) error2(a, e_misused, mode); +} + +static void check_name_type(struct analyser * a, struct name * p, int type) { + switch (type) { + case 's': + if (p->type == t_string) return; + break; + case 'i': + if (p->type == t_integer) return; + break; + case 'b': + if (p->type == t_boolean) return; + break; + case 'R': + if (p->type == t_grouping) return; + /* FALLTHRU */ + case 'r': + if (p->type == t_routine || p->type == t_external) return; + break; + case 'g': + if (p->type == t_grouping) return; + break; + } + error2(a, e_not_of_type_x, type); +} + +static void read_names(struct analyser * a, int type) { + struct tokeniser * t = a->tokeniser; + if (!get_token(a, c_bra)) return; + while (true) { + int token = read_token(t); + switch (token) { + case c_len: { + /* Context-sensitive token - once declared as a name, it loses + * its special meaning, for compatibility with older versions + * of snowball. + */ + static const symbol c_len_lit[] = { + 'l', 'e', 'n' + }; + MOVE_TO_B(t->b, c_len_lit); + goto handle_as_name; + } + case c_lenof: { + /* Context-sensitive token - once declared as a name, it loses + * its special meaning, for compatibility with older versions + * of snowball. 
+ */ + static const symbol c_lenof_lit[] = { + 'l', 'e', 'n', 'o', 'f' + }; + MOVE_TO_B(t->b, c_lenof_lit); + goto handle_as_name; + } + case c_name: +handle_as_name: + if (look_for_name(a) != 0) error(a, e_redeclared); else { + NEW(name, p); + p->b = copy_b(t->b); + p->type = type; + p->mode = -1; /* routines, externals */ + /* We defer assigning counts until after we've eliminated + * variables whose values are never used. */ + p->count = -1; + p->referenced = false; + p->used_in_among = false; + p->used = 0; + p->value_used = false; + p->initialised = false; + p->used_in_definition = false; + p->local_to = 0; + p->grouping = 0; + p->definition = 0; + p->declaration_line_number = t->line_number; + p->next = a->names; + a->names = p; + if (token != c_name) { + disable_token(t, token); + } + } + break; + default: + if (!check_token(a, c_ket)) t->token_held = true; + return; + } + } +} + +static symbol * new_literalstring(struct analyser * a) { + NEW(literalstring, p); + p->b = copy_b(a->tokeniser->b); + p->next = a->literalstrings; + a->literalstrings = p; + return p->b; +} + +static int read_AE_test(struct analyser * a) { + + struct tokeniser * t = a->tokeniser; + switch (read_token(t)) { + case c_assign: return c_mathassign; + case c_plusassign: + case c_minusassign: + case c_multiplyassign: + case c_divideassign: + case c_eq: + case c_ne: + case c_gr: + case c_ge: + case c_ls: + case c_le: return t->token; + default: error(a, e_unexpected_token); t->token_held = true; return c_eq; + } +} + +static int binding(int t) { + switch (t) { + case c_plus: case c_minus: return 1; + case c_multiply: case c_divide: return 2; + default: return -2; + } +} + +static void mark_used_in(struct analyser * a, struct name * q, struct node * p) { + if (!q->used) { + q->used = p; + q->local_to = a->program_end->name; + } else if (q->local_to) { + if (q->local_to != a->program_end->name) { + /* Used in more than one routine/external. 
*/ + q->local_to = NULL; + } + } +} + +static void name_to_node(struct analyser * a, struct node * p, int type) { + struct name * q = find_name(a); + if (q) { + check_name_type(a, q, type); + mark_used_in(a, q, p); + } + p->name = q; +} + +static struct node * read_AE(struct analyser * a, int B) { + struct tokeniser * t = a->tokeniser; + struct node * p; + struct node * q; + switch (read_token(t)) { + case c_minus: /* monadic */ + q = read_AE(a, 100); + if (q->type == c_neg) { + /* Optimise away double negation, which avoids generators + * having to worry about generating "--" (decrement operator + * in many languages). + */ + p = q->right; + /* Don't free q, it's in the linked list a->nodes. */ + break; + } + p = new_node(a, c_neg); + p->right = q; + break; + case c_bra: + p = read_AE(a, 0); + get_token(a, c_ket); + break; + case c_name: + p = new_node(a, c_name); + name_to_node(a, p, 'i'); + if (p->name) p->name->value_used = true; + break; + case c_maxint: + case c_minint: + a->int_limits_used = true; + /* fall through */ + case c_cursor: + case c_limit: + case c_len: + case c_size: + p = new_node(a, t->token); + break; + case c_number: + p = new_node(a, c_number); + p->number = t->number; + break; + case c_lenof: + case c_sizeof: + p = C_style(a, "s", t->token); + break; + default: + error(a, e_unexpected_token); + t->token_held = true; + return 0; + } + while (true) { + int token = read_token(t); + int b = binding(token); + if (binding(token) <= B) { + t->token_held = true; + return p; + } + q = new_node(a, token); + q->left = p; + q->right = read_AE(a, b); + p = q; + } +} + +static struct node * read_C_connection(struct analyser * a, struct node * q, int op) { + struct tokeniser * t = a->tokeniser; + struct node * p = new_node(a, op); + struct node * p_end = q; + p->left = q; + do { + q = read_C(a); + p_end->right = q; p_end = q; + } while (read_token(t) == op); + t->token_held = true; + return p; +} + +static struct node * read_C_list(struct analyser * a) { 
+ struct tokeniser * t = a->tokeniser; + struct node * p = new_node(a, c_bra); + struct node * p_end = 0; + while (true) { + int token = read_token(t); + if (token == c_ket) return p; + if (token < 0) { omission_error(a, c_ket); return p; } + t->token_held = true; + { + struct node * q = read_C(a); + while (true) { + token = read_token(t); + if (token != c_and && token != c_or) { + t->token_held = true; + break; + } + q = read_C_connection(a, q, token); + } + if (p_end == 0) p->left = q; else p_end->right = q; + p_end = q; + } + } +} + +static struct node * C_style(struct analyser * a, const char * s, int token) { + int i; + struct node * p = new_node(a, token); + for (i = 0; s[i] != 0; i++) switch (s[i]) { + case 'C': + p->left = read_C(a); continue; + case 'D': + p->aux = read_C(a); continue; + case 'A': + p->AE = read_AE(a, 0); continue; + case 'f': + get_token(a, c_for); continue; + case 'S': + { + int str_token = read_token(a->tokeniser); + if (str_token == c_name) name_to_node(a, p, 's'); else + if (str_token == c_literalstring) p->literalstring = new_literalstring(a); + else error(a, e_string_omitted); + } + continue; + case 'b': + case 's': + case 'i': + if (get_token(a, c_name)) name_to_node(a, p, s[i]); + continue; + } + return p; +} + +static struct node * read_literalstring(struct analyser * a) { + struct node * p = new_node(a, c_literalstring); + p->literalstring = new_literalstring(a); + return p; +} + +static void reverse_b(symbol * b) { + int i = 0; int j = SIZE(b) - 1; + while (i < j) { + int ch1 = b[i]; int ch2 = b[j]; + b[i++] = ch2; b[j--] = ch1; + } +} + +static int compare_amongvec(const void *pv, const void *qv) { + const struct amongvec * p = (const struct amongvec*)pv; + const struct amongvec * q = (const struct amongvec*)qv; + symbol * b_p = p->b; int p_size = p->size; + symbol * b_q = q->b; int q_size = q->size; + int smaller_size = p_size < q_size ? 
p_size : q_size; + int i; + for (i = 0; i < smaller_size; i++) + if (b_p[i] != b_q[i]) return b_p[i] - b_q[i]; + if (p_size - q_size) + return p_size - q_size; + return p->line_number - q->line_number; +} + +#define PTR_NULL_CHECK(P, Q) do {\ + if ((Q) == NULL) {\ + if ((P) != NULL) return 1;\ + } else {\ + if ((P) == NULL) return -1;\ + }\ + } while (0) + +static int compare_node(const struct node *p, const struct node *q) { + PTR_NULL_CHECK(p, q); + if (q == NULL) { + /* p must be NULL too. */ + return 0; + } + + if (p->type != q->type) return p->type > q->type ? 1 : -1; + if (p->mode != q->mode) return p->mode > q->mode ? 1 : -1; + if (p->type == c_number) { + if (p->number != q->number) + return p->number > q->number ? 1 : -1; + } + + PTR_NULL_CHECK(p->left, q->left); + if (p->left) { + int r = compare_node(p->left, q->left); + if (r != 0) return r; + } + + PTR_NULL_CHECK(p->AE, q->AE); + if (p->AE) { + int r = compare_node(p->AE, q->AE); + if (r != 0) return r; + } + + PTR_NULL_CHECK(p->aux, q->aux); + if (p->aux) { + int r = compare_node(p->aux, q->aux); + if (r != 0) return r; + } + + PTR_NULL_CHECK(p->name, q->name); + if (p->name) { + int r; + if (SIZE(p->name->b) != SIZE(q->name->b)) { + return SIZE(p->name->b) - SIZE(q->name->b); + } + r = memcmp(p->name->b, q->name->b, + SIZE(p->name->b) * sizeof(symbol)); + if (r != 0) return r; + } + + PTR_NULL_CHECK(p->literalstring, q->literalstring); + if (p->literalstring) { + int r; + if (SIZE(p->literalstring) != SIZE(q->literalstring)) { + return SIZE(p->literalstring) - SIZE(q->literalstring); + } + r = memcmp(p->literalstring, q->literalstring, + SIZE(p->literalstring) * sizeof(symbol)); + if (r != 0) return r; + } + + return compare_node(p->right, q->right); +} + +static void make_among(struct analyser * a, struct node * p, struct node * substring) { + + NEW(among, x); + NEWVEC(amongvec, v, p->number); + struct node * q = p->left; + struct amongvec * w0 = v; + struct amongvec * w1 = v; + int result = 1; + + 
int direction = substring != 0 ? substring->mode : p->mode; + int backward = direction == m_backward; + + if (a->amongs == 0) a->amongs = x; else a->amongs_end->next = x; + a->amongs_end = x; + x->next = 0; + x->b = v; + x->number = a->among_count++; + x->function_count = 0; + x->starter = 0; + x->nocommand_count = 0; + x->amongvar_needed = false; + + if (q->type == c_bra) { x->starter = q; q = q->right; } + + while (q) { + if (q->type == c_literalstring) { + symbol * b = q->literalstring; + w1->b = b; /* pointer to case string */ + w1->action = NULL; /* action gets filled in below */ + w1->line_number = q->line_number; + w1->size = SIZE(b); /* number of characters in string */ + w1->i = -1; /* index of longest substring */ + w1->result = -1; /* number of corresponding case expression */ + if (q->left) { + struct name * function = q->left->name; + w1->function = function; + function->used_in_among = true; + check_routine_mode(a, function, direction); + x->function_count++; + } else { + w1->function = 0; + } + w1++; + } else if (q->left == 0) { + /* empty command: () */ + w0 = w1; + } else { + /* Check for previous action which is the same as this one and use + * the same action code if we find one. + */ + int among_result = -1; + struct amongvec * w; + for (w = v; w < w0; ++w) { + if (w->action && compare_node(w->action->left, q->left) == 0) { + if (w->result <= 0) { + printf("Among code %d isn't positive\n", w->result); + exit(1); + } + among_result = w->result; + break; + } + } + if (among_result < 0) { + among_result = result++; + } + + while (w0 != w1) { + w0->action = q; + w0->result = among_result; + w0++; + } + } + q = q->right; + } + if (w1-v != p->number) { fprintf(stderr, "oh! 
%d %d\n", (int)(w1-v), p->number); exit(1); } + x->command_count = result - 1; + { + NEWVEC(node*, commands, x->command_count); + memset(commands, 0, x->command_count * sizeof(struct node*)); + for (w0 = v; w0 < w1; w0++) { + if (w0->result > 0) { + /* result == -1 when there's no command. */ + if (w0->result > x->command_count) { + fprintf(stderr, "More among codes than expected\n"); + exit(1); + } + if (!commands[w0->result - 1]) + commands[w0->result - 1] = w0->action; + } else { + ++x->nocommand_count; + } + if (backward) reverse_b(w0->b); + } + x->commands = commands; + } + qsort(v, w1 - v, sizeof(struct amongvec), compare_amongvec); + + /* the following loop is O(n squared) */ + for (w0 = w1 - 1; w0 >= v; w0--) { + symbol * b = w0->b; + int size = w0->size; + struct amongvec * w; + + for (w = w0 - 1; w >= v; w--) { + if (w->size < size && memcmp(w->b, b, w->size * sizeof(symbol)) == 0) { + w0->i = w - v; /* fill in index of longest substring */ + break; + } + } + } + if (backward) for (w0 = v; w0 < w1; w0++) reverse_b(w0->b); + + for (w0 = v; w0 < w1 - 1; w0++) + if (w0->size == (w0 + 1)->size && + memcmp(w0->b, (w0 + 1)->b, w0->size * sizeof(symbol)) == 0) { + count_error(a); + fprintf(stderr, "%s:%d: among(...) has repeated string '", + a->tokeniser->file, (w0 + 1)->line_number); + report_b(stderr, (w0 + 1)->b); + fprintf(stderr, "'\n"); + count_error(a); + fprintf(stderr, "%s:%d: previously seen here\n", + a->tokeniser->file, w0->line_number); + } + + x->literalstring_count = p->number; + p->among = x; + + x->substring = substring; + if (substring != 0) substring->among = x; + if (x->command_count > 1 || + (x->command_count == 1 && x->nocommand_count > 0) || + x->starter != 0) { + /* We need to set among_var rather than just checking if find_among*() + * returns zero or not. 
+ */ + x->amongvar_needed = a->amongvar_needed = true; + } +} + +static struct node * read_among(struct analyser * a) { + struct tokeniser * t = a->tokeniser; + struct node * p = new_node(a, c_among); + struct node * p_end = 0; + int previous_token = -1; + struct node * substring = a->substring; + + a->substring = 0; + p->number = 0; /* counts the number of literals */ + if (!get_token(a, c_bra)) return p; + while (true) { + struct node * q; + int token = read_token(t); + switch (token) { + case c_literalstring: + q = read_literalstring(a); + if (read_token(t) == c_name) { + struct node * r = new_node(a, c_name); + name_to_node(a, r, 'r'); + q->left = r; + } + else t->token_held = true; + p->number++; break; + case c_bra: + if (previous_token == c_bra) error(a, e_adjacent_bracketed_in_among); + q = read_C_list(a); break; + default: + error(a, e_unexpected_token_in_among); + previous_token = token; + continue; + case c_ket: + if (p->number == 0) error(a, e_empty_among); + if (t->error_count == 0) make_among(a, p, substring); + return p; + } + previous_token = token; + if (p_end == 0) p->left = q; else p_end->right = q; + p_end = q; + } +} + +static struct node * read_substring(struct analyser * a) { + + struct node * p = new_node(a, c_substring); + if (a->substring != 0) error2(a, e_substring_preceded_by_substring, a->substring->line_number); + a->substring = p; + return p; +} + +static void check_modifyable(struct analyser * a) { + if (!a->modifyable) error(a, e_not_allowed_inside_reverse); +} + +static struct node * read_C(struct analyser * a) { + struct tokeniser * t = a->tokeniser; + int token = read_token(t); + switch (token) { + case c_bra: + return read_C_list(a); + case c_backwards: + { + int mode = a->mode; + if (a->mode == m_backward) error(a, e_already_backwards); else a->mode = m_backward; + { struct node * p = C_style(a, "C", token); + a->mode = mode; + return p; + } + } + case c_reverse: + { + int mode = a->mode; + int modifyable = a->modifyable; + 
a->modifyable = false; + a->mode = mode == m_forward ? m_backward : m_forward; + { + struct node * p = C_style(a, "C", token); + a->mode = mode; + a->modifyable = modifyable; + return p; + } + } + case c_not: + case c_try: + case c_fail: + case c_test: + case c_do: + case c_goto: + case c_gopast: + case c_repeat: + return C_style(a, "C", token); + case c_loop: + case c_atleast: + return C_style(a, "AC", token); + case c_setmark: { + struct node * n = C_style(a, "i", token); + if (n->name) n->name->initialised = true; + return n; + } + case c_tomark: + case c_atmark: + case c_hop: + return C_style(a, "A", token); + case c_delete: + check_modifyable(a); + /* fall through */ + case c_next: + case c_tolimit: + case c_atlimit: + case c_leftslice: + case c_rightslice: + case c_true: + case c_false: + case c_debug: + return new_node(a, token); + case c_assignto: + case c_sliceto: { + struct node *n; + check_modifyable(a); + n = C_style(a, "s", token); + if (n->name) n->name->initialised = true; + return n; + } + case c_assign: + case c_insert: + case c_attach: + case c_slicefrom: { + struct node *n; + check_modifyable(a); + n = C_style(a, "S", token); + if (n->name) n->name->value_used = true; + return n; + } + case c_setlimit: + return C_style(a, "CfD", token); + case c_set: + case c_unset: { + struct node * n = C_style(a, "b", token); + if (n->name) n->name->initialised = true; + return n; + } + case c_dollar: { + struct tokeniser * t = a->tokeniser; + read_token(t); + if (t->token == c_bra) { + /* Handle newer $(AE REL_OP AE) syntax. 
*/ + struct node * n = read_AE(a, 0); + read_token(t); + switch (t->token) { + case c_eq: + case c_ne: + case c_gr: + case c_ge: + case c_ls: + case c_le: { + struct node * lhs = n; + n = new_node(a, t->token); + n->left = lhs; + n->AE = read_AE(a, 0); + get_token(a, c_ket); + break; + } + default: + error(a, e_unexpected_token); + t->token_held = true; + break; + } + return n; + } + + if (t->token == c_name) { + struct node * p; + struct name * q = find_name(a); + int mode = a->mode; + int modifyable = a->modifyable; + if (q && q->type == t_string) { + /* Assume for now that $ on string both initialises and + * uses the string variable. FIXME: Can we do better? + */ + q->initialised = true; + q->value_used = true; + a->mode = m_forward; + a->modifyable = true; + p = new_node(a, c_dollar); + p->left = read_C(a); + p->name = q; + } else { + if (q && q->type != t_integer) { + /* If $ is used on an unknown name or a name which + * isn't a string or an integer then we assume the + * unknown name is an integer as $ is used more often + * on integers than strings, so hopefully this it less + * likely to cause an error avalanche. + * + * For an unknown name, we'll already have reported an + * error. + */ + error(a, e_not_of_type_string_or_integer); + q = NULL; + } + p = new_node(a, read_AE_test(a)); + p->AE = read_AE(a, 0); + + if (q) { + switch (p->type) { + case c_mathassign: + q->initialised = true; + p->name = q; + break; + default: + /* +=, etc don't "initialise" as they only + * amend an existing value. Similarly, they + * don't count as using the value. 
+ */ + p->name = q; + break; + case c_eq: + case c_ne: + case c_gr: + case c_ge: + case c_ls: + case c_le: + p->left = new_node(a, c_name); + p->left->name = q; + q->value_used = true; + break; + } + } + } + if (q) mark_used_in(a, q, p); + a->mode = mode; + a->modifyable = modifyable; + return p; + } + + error(a, e_unexpected_token); + t->token_held = true; + return new_node(a, c_dollar); + } + case c_name: + { + struct name * q = find_name(a); + struct node * p = new_node(a, c_name); + if (q) { + mark_used_in(a, q, p); + switch (q->type) { + case t_boolean: + p->type = c_booltest; + q->value_used = true; + break; + case t_integer: + error(a, e_misplaced); /* integer name misplaced */ + break; + case t_string: + q->value_used = true; + break; + case t_routine: + case t_external: + p->type = c_call; + check_routine_mode(a, q, a->mode); + break; + case t_grouping: + p->type = c_grouping; break; + } + } + p->name = q; + return p; + } + case c_non: + { + struct node * p = new_node(a, token); + read_token(t); + if (t->token == c_minus) read_token(t); + if (!check_token(a, c_name)) { omission_error(a, c_name); return p; } + name_to_node(a, p, 'g'); + return p; + } + case c_literalstring: + return read_literalstring(a); + case c_among: return read_among(a); + case c_substring: return read_substring(a); + default: error(a, e_unexpected_token); return 0; + } +} + +static int next_symbol(symbol * p, symbol * W, int utf8) { + if (utf8) { + int ch; + int j = get_utf8(p, & ch); + W[0] = ch; return j; + } else { + W[0] = p[0]; return 1; + } +} + +static symbol * alter_grouping(symbol * p, symbol * q, int style, int utf8) { + int j = 0; + symbol W[1]; + int width; + if (style == c_plus) { + while (j < SIZE(q)) { + width = next_symbol(q + j, W, utf8); + p = add_to_b(p, 1, W); + j += width; + } + } else { + while (j < SIZE(q)) { + int i; + width = next_symbol(q + j, W, utf8); + for (i = 0; i < SIZE(p); i++) { + if (p[i] == W[0]) { + memmove(p + i, p + i + 1, (SIZE(p) - i - 1) * 
sizeof(symbol)); + SIZE(p)--; + } + } + j += width; + } + } + return p; +} + +static void read_define_grouping(struct analyser * a, struct name * q) { + struct tokeniser * t = a->tokeniser; + int style = c_plus; + { + NEW(grouping, p); + if (a->groupings == 0) a->groupings = p; else a->groupings_end->next = p; + a->groupings_end = p; + if (q) q->grouping = p; + p->next = 0; + p->name = q; + p->line_number = a->tokeniser->line_number; + p->b = create_b(0); + while (true) { + switch (read_token(t)) { + case c_name: + { + struct name * r = find_name(a); + if (r) { + check_name_type(a, r, 'g'); + p->b = alter_grouping(p->b, r->grouping->b, style, false); + r->used_in_definition = true; + } + } + break; + case c_literalstring: + p->b = alter_grouping(p->b, t->b, style, (a->encoding == ENC_UTF8)); + break; + default: error(a, e_unexpected_token); return; + } + switch (read_token(t)) { + case c_plus: + case c_minus: style = t->token; break; + default: goto label0; + } + } + label0: + { + int i; + int max = 0; + int min = 1<<16; + for (i = 0; i < SIZE(p->b); i++) { + if (p->b[i] > max) max = p->b[i]; + if (p->b[i] < min) min = p->b[i]; + } + p->largest_ch = max; + p->smallest_ch = min; + if (min == 1<<16) error(a, e_empty_grouping); + } + t->token_held = true; return; + } +} + +static void read_define_routine(struct analyser * a, struct name * q) { + struct node * p = new_node(a, c_define); + a->amongvar_needed = false; + if (q) { + check_name_type(a, q, 'R'); + if (q->definition != 0) error(a, e_redefined); + if (q->mode < 0) q->mode = a->mode; else + if (q->mode != a->mode) error2(a, e_declared_as_different_mode, q->mode); + } + p->name = q; + if (a->program == 0) a->program = p; else a->program_end->right = p; + a->program_end = p; + get_token(a, c_as); + p->left = read_C(a); + if (q) q->definition = p->left; + + if (a->substring != 0) { + error2(a, e_unresolved_substring, a->substring->line_number); + a->substring = 0; + } + p->amongvar_needed = a->amongvar_needed; +} 
+ +static void read_define(struct analyser * a) { + if (get_token(a, c_name)) { + struct name * q = find_name(a); + int type; + if (q) { + type = q->type; + } else { + /* No declaration, so sniff next token - if it is 'as' then parse + * as a routine, otherwise as a grouping. + */ + if (read_token(a->tokeniser) == c_as) { + type = t_routine; + } else { + type = t_grouping; + } + a->tokeniser->token_held = true; + } + + if (type == t_grouping) { + read_define_grouping(a, q); + } else { + read_define_routine(a, q); + } + } +} + +static void read_backwardmode(struct analyser * a) { + int mode = a->mode; + a->mode = m_backward; + if (get_token(a, c_bra)) { + read_program_(a, c_ket); + check_token(a, c_ket); + } + a->mode = mode; +} + +static void read_program_(struct analyser * a, int terminator) { + struct tokeniser * t = a->tokeniser; + while (true) { + switch (read_token(t)) { + case c_strings: read_names(a, t_string); break; + case c_booleans: read_names(a, t_boolean); break; + case c_integers: read_names(a, t_integer); break; + case c_routines: read_names(a, t_routine); break; + case c_externals: read_names(a, t_external); break; + case c_groupings: read_names(a, t_grouping); break; + case c_define: read_define(a); break; + case c_backwardmode:read_backwardmode(a); break; + case c_ket: + if (terminator == c_ket) return; + /* fall through */ + default: + error(a, e_unexpected_token); break; + case -1: + if (terminator >= 0) omission_error(a, c_ket); + return; + } + } +} + +static void remove_dead_assignments(struct node * p, struct name * q) { + if (p->name == q) { + switch (p->type) { + case c_assignto: + case c_sliceto: + case c_mathassign: + case c_plusassign: + case c_minusassign: + case c_multiplyassign: + case c_divideassign: + case c_setmark: + case c_set: + case c_unset: + case c_dollar: + /* c_true is a no-op. */ + p->type = c_true; + break; + default: + /* There are no read accesses to this variable, so any + * references must be assignments. 
+ */ + fprintf(stderr, "Unhandled type of dead assignment via %s\n", + name_of_token(p->type)); + exit(1); + } + } + if (p->AE) remove_dead_assignments(p->AE, q); + if (p->left) remove_dead_assignments(p->left, q); + if (p->aux) remove_dead_assignments(p->aux, q); + if (p->right) remove_dead_assignments(p->right, q); +} + +extern void read_program(struct analyser * a) { + read_program_(a, -1); + { + struct name * q = a->names; + while (q) { + switch (q->type) { + case t_external: case t_routine: + if (q->used && q->definition == 0) error4(a, q); + break; + case t_grouping: + if (q->used && q->grouping == 0) error4(a, q); + break; + } + q = q->next; + } + } + + if (a->tokeniser->error_count == 0) { + struct name * q = a->names; + struct name ** ptr = &(a->names); + while (q) { + if (!q->referenced) { + fprintf(stderr, "%s:%d: warning: %s '", + a->tokeniser->file, + q->declaration_line_number, + name_of_name_type(q->type)); + report_b(stderr, q->b); + if (q->type == t_routine || + q->type == t_external || + q->type == t_grouping) { + fprintf(stderr, "' declared but not defined\n"); + } else { + fprintf(stderr, "' defined but not used\n"); + q = q->next; + *ptr = q; + continue; + } + } else if (q->type == t_routine || q->type == t_grouping) { + /* It's OK to define a grouping but only use it to define other + * groupings. + */ + if (!q->used && !q->used_in_definition) { + int line_num; + if (q->type == t_routine) { + line_num = q->definition->line_number; + } else { + line_num = q->grouping->line_number; + } + fprintf(stderr, "%s:%d: warning: %s '", + a->tokeniser->file, + line_num, + name_of_name_type(q->type)); + report_b(stderr, q->b); + fprintf(stderr, "' defined but not used\n"); + } + } else if (q->type == t_external) { + /* Unused is OK. 
*/ + } else if (!q->initialised) { + fprintf(stderr, "%s:%d: warning: %s '", + a->tokeniser->file, + q->declaration_line_number, + name_of_name_type(q->type)); + report_b(stderr, q->b); + fprintf(stderr, "' is never initialised\n"); + } else if (!q->value_used) { + fprintf(stderr, "%s:%d: warning: %s '", + a->tokeniser->file, + q->declaration_line_number, + name_of_name_type(q->type)); + report_b(stderr, q->b); + fprintf(stderr, "' is set but never used\n"); + remove_dead_assignments(a->program, q); + q = q->next; + *ptr = q; + continue; + } + ptr = &(q->next); + q = q->next; + } + + { + /* Now we've eliminated variables whose values are never used we + * can number the variables, which is used by some generators. + */ + int * name_count = a->name_count; + struct name * n; + for (n = a->names; n; n = n->next) { + n->count = name_count[n->type]++; + } + } + } +} + +extern struct analyser * create_analyser(struct tokeniser * t) { + NEW(analyser, a); + a->tokeniser = t; + a->nodes = 0; + a->names = 0; + a->literalstrings = 0; + a->program = 0; + a->amongs = 0; + a->among_count = 0; + a->groupings = 0; + a->mode = m_forward; + a->modifyable = true; + { int i; for (i = 0; i < t_size; i++) a->name_count[i] = 0; } + a->substring = 0; + a->int_limits_used = false; + return a; +} + +extern void close_analyser(struct analyser * a) { + { + struct node * q = a->nodes; + while (q) { + struct node * q_next = q->next; + FREE(q); + q = q_next; + } + } + { + struct name * q = a->names; + while (q) { + struct name * q_next = q->next; + lose_b(q->b); FREE(q); + q = q_next; + } + } + { + struct literalstring * q = a->literalstrings; + while (q) { + struct literalstring * q_next = q->next; + lose_b(q->b); FREE(q); + q = q_next; + } + } + { + struct among * q = a->amongs; + while (q) { + struct among * q_next = q->next; + FREE(q->b); + FREE(q->commands); + FREE(q); + q = q_next; + } + } + { + struct grouping * q = a->groupings; + while (q) { + struct grouping * q_next = q->next; + 
lose_b(q->b); FREE(q); + q = q_next; + } + } + FREE(a); +} diff --git a/contrib/snowball/compiler/driver.c b/contrib/snowball/compiler/driver.c new file mode 100644 index 0000000..587028f --- /dev/null +++ b/contrib/snowball/compiler/driver.c @@ -0,0 +1,574 @@ +#include <ctype.h> /* for toupper etc */ +#include <stdio.h> /* for fprintf etc */ +#include <stdlib.h> /* for free etc */ +#include <string.h> /* for strcmp */ +#include "header.h" + +#define DEFAULT_JAVA_PACKAGE "org.tartarus.snowball.ext" +#define DEFAULT_JAVA_BASE_CLASS "org.tartarus.snowball.SnowballProgram" +#define DEFAULT_JAVA_AMONG_CLASS "org.tartarus.snowball.Among" +#define DEFAULT_JAVA_STRING_CLASS "java.lang.StringBuilder" + +#define DEFAULT_GO_PACKAGE "snowball" +#define DEFAULT_GO_SNOWBALL_RUNTIME "github.com/snowballstem/snowball/go" + +#define DEFAULT_CS_NAMESPACE "Snowball" +#define DEFAULT_CS_BASE_CLASS "Stemmer" +#define DEFAULT_CS_AMONG_CLASS "Among" +#define DEFAULT_CS_STRING_CLASS "StringBuilder" + +#define DEFAULT_JS_BASE_CLASS "BaseStemmer" + +#define DEFAULT_PYTHON_BASE_CLASS "BaseStemmer" + +static int eq(const char * s1, const char * s2) { + return strcmp(s1, s2) == 0; +} + +__attribute__((noreturn)) +static void print_arglist(int exit_code) { + FILE * f = exit_code ? stderr : stdout; + fprintf(f, "Usage: snowball SOURCE_FILE... 
[OPTIONS]\n\n" + "Supported options:\n" + " -o[utput] file\n" + " -s[yntax]\n" + " -comments\n" +#ifndef DISABLE_JAVA + " -j[ava]\n" +#endif +#ifndef DISABLE_CSHARP + " -cs[harp]\n" +#endif + " -c++\n" +#ifndef DISABLE_PASCAL + " -pascal\n" +#endif +#ifndef DISABLE_PYTHON + " -py[thon]\n" +#endif +#ifndef DISABLE_JS + " -js\n" +#endif +#ifndef DISABLE_RUST + " -rust\n" +#endif +#ifndef DISABLE_GO + " -go\n" +#endif + " -w[idechars]\n" + " -u[tf8]\n" + " -n[ame] class name\n" + " -ep[refix] string\n" + " -vp[refix] string\n" + " -i[nclude] directory\n" + " -r[untime] path to runtime headers\n" + " -p[arentclassname] fully qualified parent class name\n" +#if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP) + " -P[ackage] package name for stemmers\n" + " -S[tringclass] StringBuffer-compatible class\n" + " -a[mongclass] fully qualified name of the Among class\n" +#endif +#ifndef DISABLE_GO + " -gop[ackage] Go package name for stemmers\n" + " -gor[untime] Go snowball runtime package\n" +#endif + " --help display this help and exit\n" + " --version output version information and exit\n" + ); + exit(exit_code); +} + +static void check_lim(int i, int argc) { + if (i >= argc) { + fprintf(stderr, "argument list is one short\n"); + print_arglist(1); + } +} + +static FILE * get_output(symbol * b) { + char * s = b_to_s(b); + FILE * output = fopen(s, "w"); + if (output == 0) { + fprintf(stderr, "Can't open output %s\n", s); + exit(1); + } + free(s); + return output; +} + +static int read_options(struct options * o, int argc, char * argv[]) { + char * s; + int i = 1; + int new_argc = 1; + /* Note down the last option used to specify an explicit encoding so + * we can warn we ignored it for languages with a fixed encoding. 
+ */ + const char * encoding_opt = NULL; + + /* set defaults: */ + + o->output_file = 0; + o->syntax_tree = false; + o->comments = false; + o->externals_prefix = NULL; + o->variables_prefix = 0; + o->runtime_path = 0; + o->parent_class_name = NULL; + o->string_class = NULL; + o->among_class = NULL; + o->package = NULL; + o->go_snowball_runtime = DEFAULT_GO_SNOWBALL_RUNTIME; + o->name = NULL; + o->make_lang = LANG_C; + o->includes = 0; + o->includes_end = 0; + o->encoding = ENC_SINGLEBYTE; + + /* read options: */ + + while (i < argc) { + s = argv[i++]; + if (s[0] != '-') { + /* Non-option argument - shuffle down. */ + argv[new_argc++] = s; + continue; + } + + { + if (eq(s, "-o") || eq(s, "-output")) { + check_lim(i, argc); + o->output_file = argv[i++]; + continue; + } + if (eq(s, "-n") || eq(s, "-name")) { + check_lim(i, argc); + o->name = argv[i++]; + continue; + } +#ifndef DISABLE_JS + if (eq(s, "-js")) { + o->make_lang = LANG_JAVASCRIPT; + continue; + } +#endif +#ifndef DISABLE_RUST + if (eq(s, "-rust")) { + o->make_lang = LANG_RUST; + continue; + } +#endif +#ifndef DISABLE_GO + if (eq(s, "-go")) { + o->make_lang = LANG_GO; + continue; + } +#endif +#ifndef DISABLE_JAVA + if (eq(s, "-j") || eq(s, "-java")) { + o->make_lang = LANG_JAVA; + continue; + } +#endif +#ifndef DISABLE_CSHARP + if (eq(s, "-cs") || eq(s, "-csharp")) { + o->make_lang = LANG_CSHARP; + continue; + } +#endif + if (eq(s, "-c++")) { + o->make_lang = LANG_CPLUSPLUS; + continue; + } +#ifndef DISABLE_PASCAL + if (eq(s, "-pascal")) { + o->make_lang = LANG_PASCAL; + continue; + } +#endif +#ifndef DISABLE_PYTHON + if (eq(s, "-py") || eq(s, "-python")) { + o->make_lang = LANG_PYTHON; + continue; + } +#endif + if (eq(s, "-w") || eq(s, "-widechars")) { + encoding_opt = s; + o->encoding = ENC_WIDECHARS; + continue; + } + if (eq(s, "-s") || eq(s, "-syntax")) { + o->syntax_tree = true; + continue; + } + if (eq(s, "-comments")) { + o->comments = true; + continue; + } + if (eq(s, "-ep") || eq(s, "-eprefix")) { 
+ check_lim(i, argc); + o->externals_prefix = argv[i++]; + continue; + } + if (eq(s, "-vp") || eq(s, "-vprefix")) { + check_lim(i, argc); + o->variables_prefix = argv[i++]; + continue; + } + if (eq(s, "-i") || eq(s, "-include")) { + check_lim(i, argc); + + { + NEW(include, p); + symbol * b = add_s_to_b(0, argv[i++]); + b = add_s_to_b(b, "/"); + p->next = 0; p->b = b; + + if (o->includes == 0) o->includes = p; else + o->includes_end->next = p; + o->includes_end = p; + } + continue; + } + if (eq(s, "-r") || eq(s, "-runtime")) { + check_lim(i, argc); + o->runtime_path = argv[i++]; + continue; + } + if (eq(s, "-u") || eq(s, "-utf8")) { + encoding_opt = s; + o->encoding = ENC_UTF8; + continue; + } + if (eq(s, "-p") || eq(s, "-parentclassname")) { + check_lim(i, argc); + o->parent_class_name = argv[i++]; + continue; + } +#if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP) + if (eq(s, "-P") || eq(s, "-Package")) { + check_lim(i, argc); + o->package = argv[i++]; + continue; + } + if (eq(s, "-S") || eq(s, "-stringclass")) { + check_lim(i, argc); + o->string_class = argv[i++]; + continue; + } + if (eq(s, "-a") || eq(s, "-amongclass")) { + check_lim(i, argc); + o->among_class = argv[i++]; + continue; + } +#endif +#ifndef DISABLE_GO + if (eq(s, "-gop") || eq(s, "-gopackage")) { + check_lim(i, argc); + o->package = argv[i++]; + continue; + } + if (eq(s, "-gor") || eq(s, "-goruntime")) { + check_lim(i, argc); + o->go_snowball_runtime = argv[i++]; + continue; + } +#endif + if (eq(s, "--help")) { + print_arglist(0); + } + + if (eq(s, "--version")) { + printf("Snowball compiler version " SNOWBALL_VERSION "\n"); + exit(0); + } + + fprintf(stderr, "'%s' misplaced\n", s); + print_arglist(1); + } + } + if (new_argc == 1) { + fprintf(stderr, "no source files specified\n"); + print_arglist(1); + } + argv[new_argc] = NULL; + + /* Set language-dependent defaults. 
*/ + switch (o->make_lang) { + case LANG_C: + case LANG_CPLUSPLUS: + encoding_opt = NULL; + break; + case LANG_CSHARP: + o->encoding = ENC_WIDECHARS; + if (!o->parent_class_name) + o->parent_class_name = DEFAULT_CS_BASE_CLASS; + if (!o->string_class) + o->string_class = DEFAULT_CS_STRING_CLASS; + if (!o->among_class) + o->among_class = DEFAULT_CS_AMONG_CLASS; + if (!o->package) + o->package = DEFAULT_CS_NAMESPACE; + break; + case LANG_GO: + o->encoding = ENC_UTF8; + if (!o->package) + o->package = DEFAULT_GO_PACKAGE; + break; + case LANG_JAVA: + o->encoding = ENC_WIDECHARS; + if (!o->parent_class_name) + o->parent_class_name = DEFAULT_JAVA_BASE_CLASS; + if (!o->string_class) + o->string_class = DEFAULT_JAVA_STRING_CLASS; + if (!o->among_class) + o->among_class = DEFAULT_JAVA_AMONG_CLASS; + if (!o->package) + o->package = DEFAULT_JAVA_PACKAGE; + break; + case LANG_JAVASCRIPT: + o->encoding = ENC_WIDECHARS; + if (!o->parent_class_name) + o->parent_class_name = DEFAULT_JS_BASE_CLASS; + break; + case LANG_PYTHON: + o->encoding = ENC_WIDECHARS; + if (!o->parent_class_name) + o->parent_class_name = DEFAULT_PYTHON_BASE_CLASS; + break; + case LANG_RUST: + o->encoding = ENC_UTF8; + break; + default: + break; + } + + if (encoding_opt) { + fprintf(stderr, "warning: %s only meaningful for C and C++\n", + encoding_opt); + } + + if (o->make_lang != LANG_C && o->make_lang != LANG_CPLUSPLUS) { + if (o->runtime_path) { + fprintf(stderr, "warning: -r/-runtime only meaningful for C and C++\n"); + } + if (o->externals_prefix) { + fprintf(stderr, "warning: -ep/-eprefix only meaningful for C and C++\n"); + } + } + if (!o->externals_prefix) o->externals_prefix = ""; + + if (!o->name && o->output_file) { + /* Default class name to basename of output_file - this is the standard + * convention for at least Java and C#. + */ + const char * slash = strrchr(o->output_file, '/'); + size_t len; + const char * leaf = (slash == NULL) ? 
o->output_file : slash + 1; + + slash = strrchr(leaf, '\\'); + if (slash != NULL) leaf = slash + 1; + + { + const char * dot = strchr(leaf, '.'); + len = (dot == NULL) ? strlen(leaf) : (size_t)(dot - leaf); + } + + { + char * new_name = malloc(len + 1); + switch (o->make_lang) { + case LANG_CSHARP: + case LANG_PASCAL: + /* Upper case initial letter. */ + memcpy(new_name, leaf, len); + new_name[0] = toupper(new_name[0]); + break; + case LANG_JAVASCRIPT: + case LANG_PYTHON: { + /* Upper case initial letter and change each + * underscore+letter or hyphen+letter to an upper case + * letter. + */ + size_t i, j = 0; + int uc_next = true; + for (i = 0; i != len; ++i) { + unsigned char ch = leaf[i]; + if (ch == '_' || ch == '-') { + uc_next = true; + } else { + if (uc_next) { + new_name[j] = toupper(ch); + uc_next = false; + } else { + new_name[j] = ch; + } + ++j; + } + } + len = j; + break; + } + default: + /* Just copy. */ + memcpy(new_name, leaf, len); + break; + } + new_name[len] = '\0'; + o->name = new_name; + } + } + + return new_argc; +} + +extern int main(int argc, char * argv[]) { + + int i; + NEW(options, o); + argc = read_options(o, argc, argv); + { + char * file = argv[1]; + symbol * u = get_input(file); + if (u == 0) { + fprintf(stderr, "Can't open input %s\n", file); + exit(1); + } + { + struct tokeniser * t = create_tokeniser(u, file); + struct analyser * a = create_analyser(t); + struct input ** next_input_ptr = &(t->next); + a->encoding = t->encoding = o->encoding; + t->includes = o->includes; + /* If multiple source files are specified, set up the others to be + * read after the first in order, using the same mechanism as + * 'get' uses. 
*/ + for (i = 2; i != argc; ++i) { + NEW(input, q); + file = argv[i]; + u = get_input(file); + if (u == 0) { + fprintf(stderr, "Can't open input %s\n", file); + exit(1); + } + q->p = u; + q->c = 0; + q->file = file; + q->file_needs_freeing = false; + q->line_number = 1; + *next_input_ptr = q; + next_input_ptr = &(q->next); + } + *next_input_ptr = NULL; + read_program(a); + if (t->error_count > 0) exit(1); + if (o->syntax_tree) print_program(a); + close_tokeniser(t); + if (!o->syntax_tree) { + struct generator * g; + + const char * s = o->output_file; + if (!s) { + fprintf(stderr, "Please include the -o option\n"); + print_arglist(1); + } + g = create_generator(a, o); + if (o->make_lang == LANG_C || o->make_lang == LANG_CPLUSPLUS) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".h"); + o->output_h = get_output(b); + b[SIZE(b) - 1] = 'c'; + if (o->make_lang == LANG_CPLUSPLUS) { + b = add_s_to_b(b, "c"); + } + o->output_src = get_output(b); + lose_b(b); + + generate_program_c(g); + fclose(o->output_src); + fclose(o->output_h); + } +#ifndef DISABLE_JAVA + if (o->make_lang == LANG_JAVA) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".java"); + o->output_src = get_output(b); + lose_b(b); + generate_program_java(g); + fclose(o->output_src); + } +#endif +#ifndef DISABLE_PASCAL + if (o->make_lang == LANG_PASCAL) { + symbol *b = add_s_to_b(0, s); + b = add_s_to_b(b, ".pas"); + o->output_src = get_output(b); + lose_b(b); + generate_program_pascal(g); + fclose(o->output_src); + } +#endif +#ifndef DISABLE_PYTHON + if (o->make_lang == LANG_PYTHON) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".py"); + o->output_src = get_output(b); + lose_b(b); + generate_program_python(g); + fclose(o->output_src); + } +#endif +#ifndef DISABLE_JS + if (o->make_lang == LANG_JAVASCRIPT) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".js"); + o->output_src = get_output(b); + lose_b(b); + generate_program_js(g); + fclose(o->output_src); + } +#endif +#ifndef 
DISABLE_CSHARP + if (o->make_lang == LANG_CSHARP) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".cs"); + o->output_src = get_output(b); + lose_b(b); + generate_program_csharp(g); + fclose(o->output_src); + } +#endif +#ifndef DISABLE_RUST + if (o->make_lang == LANG_RUST) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".rs"); + o->output_src = get_output(b); + lose_b(b); + generate_program_rust(g); + fclose(o->output_src); + } +#endif +#ifndef DISABLE_GO + if (o->make_lang == LANG_GO) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".go"); + o->output_src = get_output(b); + lose_b(b); + generate_program_go(g); + fclose(o->output_src); + } +#endif + close_generator(g); + } + close_analyser(a); + } + lose_b(u); + } + { struct include * p = o->includes; + while (p) { + struct include * q = p->next; + lose_b(p->b); FREE(p); p = q; + } + } + FREE(o); + if (space_count) fprintf(stderr, "%d blocks unfreed\n", space_count); + return 0; +} diff --git a/contrib/snowball/compiler/generator.c b/contrib/snowball/compiler/generator.c new file mode 100644 index 0000000..eed86c1 --- /dev/null +++ b/contrib/snowball/compiler/generator.c @@ -0,0 +1,1725 @@ + +#include <limits.h> /* for INT_MAX */ +#include <stdio.h> /* for fprintf etc */ +#include <stdlib.h> /* for free etc */ +#include <string.h> /* for strlen */ +#include "header.h" + +/* Define this to get warning messages when optimisations can't be used. 
*/ +/* #define OPTIMISATION_WARNINGS */ + +/* recursive use: */ + +static void generate(struct generator * g, struct node * p); + +static int new_label(struct generator * g) { + return g->next_label++; +} + +/* Write routines for simple entities */ + +/* Write a space if the preceding character was not whitespace */ +static void ws_opt_space(struct generator * g, const char * s) { + int ch = str_back(g->outbuf); + if (ch != ' ' && ch != '\n' && ch != '\t' && ch != -1) + write_char(g, ' '); + write_string(g, s); +} + +static void wi3(struct generator * g, int i) { + if (i < 100) write_char(g, ' '); + if (i < 10) write_char(g, ' '); + write_int(g, i); /* integer (width 3) */ +} + + +/* Write routines for items from the syntax tree */ + +static void write_varname(struct generator * g, struct name * p) { + + int ch = "SIIrxg"[p->type]; + switch (p->type) { + case t_external: + write_string(g, g->options->externals_prefix); break; + case t_string: + case t_boolean: + case t_integer: { + int count = p->count; + if (count < 0) { + fprintf(stderr, "Reference to optimised out variable "); + report_b(stderr, p->b); + fprintf(stderr, " attempted\n"); + exit(1); + } + if (p->type == t_boolean) { + /* We use a single array for booleans and integers, with the + * integers first. 
+ */ + count += g->analyser->name_count[t_integer]; + } + write_char(g, ch); + write_char(g, '['); + write_int(g, count); + write_char(g, ']'); + return; + } + default: + write_char(g, ch); write_char(g, '_'); + } + write_b(g, p->b); +} + +static void write_varref(struct generator * g, struct name * p) { /* reference to variable */ + if (p->type < t_routine) write_string(g, "z->"); + write_varname(g, p); +} + +static void write_hexdigit(struct generator * g, int i) { + str_append_ch(g->outbuf, "0123456789ABCDEF"[i & 0xF]); /* hexchar */ +} + +static void write_hex(struct generator * g, int i) { + if (i >> 4) write_hex(g, i >> 4); + write_hexdigit(g, i); /* hex integer */ +} + +/* write character literal */ +static void wlitch(struct generator * g, int ch) { + if (32 <= ch && ch < 127) { + write_char(g, '\''); + if (ch == '\'' || ch == '\\') { + write_char(g, '\\'); + } + write_char(g, ch); + write_char(g, '\''); + } else { + write_string(g, "0x"); write_hex(g, ch); + } +} + +static void wlitarray(struct generator * g, symbol * p) { /* write literal array */ + + write_string(g, "{ "); + { + int i; + for (i = 0; i < SIZE(p); i++) { + wlitch(g, p[i]); + if (i < SIZE(p) - 1) write_string(g, ", "); + } + } + write_string(g, " }"); +} + +static void wlitref(struct generator * g, symbol * p) { /* write ref to literal array */ + + if (SIZE(p) == 0) { + write_char(g, '0'); + } else { + struct str * s = g->outbuf; + g->outbuf = g->declarations; + write_string(g, "static const symbol s_"); write_int(g, g->literalstring_count); write_string(g, "[] = "); + wlitarray(g, p); + write_string(g, ";\n"); + g->outbuf = s; + write_string(g, "s_"); write_int(g, g->literalstring_count); + g->literalstring_count++; + } +} + +static void write_margin(struct generator * g) { + int i; + for (i = 0; i < g->margin; i++) write_string(g, " "); +} + +void write_comment_content(struct generator * g, struct node * p) { + switch (p->type) { + case c_mathassign: + case c_plusassign: + case 
c_minusassign: + case c_multiplyassign: + case c_divideassign: + if (p->name) { + write_char(g, '$'); + write_b(g, p->name->b); + write_char(g, ' '); + } + write_string(g, name_of_token(p->type)); + write_string(g, " <integer expression>"); + break; + case c_eq: + case c_ne: + case c_gr: + case c_ge: + case c_ls: + case c_le: + write_string(g, "$(<integer expression> "); + write_string(g, name_of_token(p->type)); + write_string(g, " <integer expression>)"); + break; + default: + write_string(g, name_of_token(p->type)); + if (p->name) { + write_char(g, ' '); + write_b(g, p->name->b); + } + } + write_string(g, ", line "); + write_int(g, p->line_number); +} + +static void write_comment(struct generator * g, struct node * p) { + if (g->options->comments) { + ws_opt_space(g, "/* "); + write_comment_content(g, p); + write_string(g, " */"); + } + write_newline(g); +} + +static void wms(struct generator * g, const char * s) { + write_margin(g); write_string(g, s); } /* margin + string */ + +static void write_block_start(struct generator * g) { /* block start */ + wms(g, "{ "); + g->margin++; +} + +static void write_block_end(struct generator * g) { /* block end */ + + if (g->line_labelled == g->line_count) { wms(g, ";"); write_newline(g); } + g->margin--; + wms(g, "}"); write_newline(g); +} + +static void w(struct generator * g, const char * s); + +/* keep c */ +static void wk(struct generator * g, struct node * p, int keep_limit) { + ++g->keep_count; + if (p->mode == m_forward) { + write_string(g, "int c"); + write_int(g, g->keep_count); + write_string(g, " = z->c"); + if (keep_limit) { + write_string(g, ", mlimit"); + write_int(g, g->keep_count); + } + write_char(g, ';'); + } else { + write_string(g, "int m"); + write_int(g, g->keep_count); + write_string(g, " = z->l - z->c"); + if (keep_limit) { + write_string(g, ", mlimit"); + write_int(g, g->keep_count); + } + write_string(g, "; (void)m"); + write_int(g, g->keep_count); + write_char(g, ';'); + } +} + +static void 
wrestore(struct generator * g, struct node * p, int keep_token) { /* restore c */ + if (p->mode == m_forward) { + write_string(g, "z->c = c"); + } else { + write_string(g, "z->c = z->l - m"); + } + write_int(g, keep_token); write_char(g, ';'); +} + +static void wrestorelimit(struct generator * g, struct node * p, int keep_token) { /* restore limit */ + if (p->mode == m_forward) { + w(g, "z->l += mlimit"); + } else { + w(g, "z->lb = mlimit"); + } + write_int(g, keep_token); write_string(g, ";"); +} + +static void winc(struct generator * g, struct node * p) { /* increment c */ + write_string(g, p->mode == m_forward ? "z->c++;" : + "z->c--;"); +} + +static void wsetl(struct generator * g, int n) { + + g->margin--; + wms(g, "lab"); write_int(g, n); write_char(g, ':'); write_newline(g); + g->line_labelled = g->line_count; + g->margin++; +} + +static void wgotol(struct generator * g, int n) { + wms(g, "goto lab"); write_int(g, n); write_char(g, ';'); write_newline(g); +} + +static void write_failure(struct generator * g, struct node * p) { /* fail */ + if (g->failure_keep_count != 0) { + write_string(g, "{ "); + if (g->failure_keep_count > 0) { + wrestore(g, p, g->failure_keep_count); + } else { + wrestorelimit(g, p, -g->failure_keep_count); + } + write_char(g, ' '); + } + switch (g->failure_label) { + case x_return: + write_string(g, "return 0;"); + break; + default: + write_string(g, "goto lab"); + write_int(g, g->failure_label); + write_char(g, ';'); + g->label_used = 1; + } + if (g->failure_keep_count != 0) write_string(g, " }"); +} + + +/* if at limit fail */ +static void write_check_limit(struct generator * g, struct node * p) { + + write_string(g, p->mode == m_forward ? 
"if (z->c >= z->l) " : + "if (z->c <= z->lb) "); + write_failure(g, p); +} + +static void write_data_address(struct generator * g, struct node * p) { + symbol * b = p->literalstring; + if (b != 0) { + write_int(g, SIZE(b)); w(g, ", "); + wlitref(g, b); + } else { + write_varref(g, p->name); + } +} + +/* Formatted write. */ +static void writef(struct generator * g, const char * input, struct node * p) { + int i = 0; + int l = strlen(input); + + while (i < l) { + int ch = input[i++]; + if (ch != '~') { + write_char(g, ch); + continue; + } + switch (input[i++]) { + default: write_char(g, input[i - 1]); continue; + case 'C': write_comment(g, p); continue; + case 'k': wk(g, p, false); continue; + case 'K': wk(g, p, true); continue; + case 'i': winc(g, p); continue; + case 'l': write_check_limit(g, p); continue; + case 'f': write_failure(g, p); continue; + case 'M': write_margin(g); continue; + case 'N': write_newline(g); continue; + case '{': write_block_start(g); continue; + case '}': write_block_end(g); continue; + case 'S': write_string(g, g->S[input[i++] - '0']); continue; + case 'I': write_int(g, g->I[input[i++] - '0']); continue; + case 'J': wi3(g, g->I[input[i++] - '0']); continue; + case 'V': write_varref(g, g->V[input[i++] - '0']); continue; + case 'W': write_varname(g, g->V[input[i++] - '0']); continue; + case 'L': wlitref(g, g->L[input[i++] - '0']); continue; + case 'A': wlitarray(g, g->L[input[i++] - '0']); continue; + case 'c': wlitch(g, g->I[input[i++] - '0']); continue; + case 'a': write_data_address(g, p); continue; + case '+': g->margin++; continue; + case '-': g->margin--; continue; + case '$': /* insert_s, insert_v etc */ + write_char(g, p->literalstring == 0 ? 
'v' : 's'); + continue; + case 'p': write_string(g, g->options->externals_prefix); continue; + } + } +} + +static void w(struct generator * g, const char * s) { + writef(g, s, 0); +} + +static void generate_AE(struct generator * g, struct node * p) { + const char * s; + switch (p->type) { + case c_name: + write_varref(g, p->name); break; + case c_number: + write_int(g, p->number); break; + case c_maxint: + write_string(g, "MAXINT"); break; + case c_minint: + write_string(g, "MININT"); break; + case c_neg: + write_char(g, '-'); generate_AE(g, p->right); break; + case c_multiply: + s = " * "; goto label0; + case c_plus: + s = " + "; goto label0; + case c_minus: + s = " - "; goto label0; + case c_divide: + s = " / "; + label0: + write_char(g, '('); generate_AE(g, p->left); + write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; + case c_cursor: + w(g, "z->c"); break; + case c_limit: + w(g, p->mode == m_forward ? "z->l" : "z->lb"); break; + case c_len: + if (g->options->encoding == ENC_UTF8) { + w(g, "len_utf8(z->p)"); + break; + } + /* FALLTHRU */ + case c_size: + w(g, "SIZE(z->p)"); + break; + case c_lenof: + if (g->options->encoding == ENC_UTF8) { + g->V[0] = p->name; + w(g, "len_utf8(~V0)"); + break; + } + /* FALLTHRU */ + case c_sizeof: + g->V[0] = p->name; + w(g, "SIZE(~V0)"); + break; + } +} + +/* K_needed() tests to see if we really need to keep c. Not true when the + command does not touch the cursor. This and repeat_score() could be + elaborated almost indefinitely. 
+*/ + +static int K_needed_(struct generator * g, struct node * p, int call_depth) { + while (p) { + switch (p->type) { + case c_atlimit: + case c_do: + case c_dollar: + case c_leftslice: + case c_rightslice: + case c_mathassign: + case c_plusassign: + case c_minusassign: + case c_multiplyassign: + case c_divideassign: + case c_eq: + case c_ne: + case c_gr: + case c_ge: + case c_ls: + case c_le: + case c_sliceto: + case c_booltest: + case c_set: + case c_unset: + case c_true: + case c_false: + case c_debug: + break; + + case c_call: + /* Recursive functions aren't typical in snowball programs, so + * make the pessimistic assumption that keep is needed if we + * hit a generous limit on recursion. It's not likely to make + * a difference to any real world program, but means we won't + * recurse until we run out of stack for pathological cases. + */ + if (call_depth >= 100) return true; + if (K_needed_(g, p->name->definition, call_depth + 1)) + return true; + break; + + case c_bra: + if (K_needed_(g, p->left, call_depth)) return true; + break; + + default: return true; + } + p = p->right; + } + return false; +} + +extern int K_needed(struct generator * g, struct node * p) { + return K_needed_(g, p, 0); +} + +static int repeat_score(struct generator * g, struct node * p, int call_depth) { + int score = 0; + while (p) { + switch (p->type) { + case c_dollar: + case c_leftslice: + case c_rightslice: + case c_mathassign: + case c_plusassign: + case c_minusassign: + case c_multiplyassign: + case c_divideassign: + case c_eq: + case c_ne: + case c_gr: + case c_ge: + case c_ls: + case c_le: + case c_sliceto: /* case c_not: must not be included here! */ + case c_debug: + break; + + case c_call: + /* Recursive functions aren't typical in snowball programs, so + * make the pessimistic assumption that repeat requires cursor + * reinstatement if we hit a generous limit on recursion. 
It's + * not likely to make a difference to any real world program, + * but means we won't recurse until we run out of stack for + * pathological cases. + */ + if (call_depth >= 100) { + return 2; + } + score += repeat_score(g, p->name->definition, call_depth + 1); + if (score >= 2) + return score; + break; + + case c_bra: + score += repeat_score(g, p->left, call_depth); + if (score >= 2) + return score; + break; + + case c_name: + case c_literalstring: + case c_next: + case c_grouping: + case c_non: + case c_hop: + if (++score >= 2) + return score; + break; + + default: + return 2; + } + p = p->right; + } + return score; +} + +/* tests if an expression requires cursor reinstatement in a repeat */ + +extern int repeat_restore(struct generator * g, struct node * p) { + return repeat_score(g, p, 0) >= 2; +} + +static void generate_bra(struct generator * g, struct node * p) { + p = p->left; + while (p) { + generate(g, p); + p = p->right; + } +} + +static void generate_and(struct generator * g, struct node * p) { + int keep_c = 0; + if (K_needed(g, p->left)) { + writef(g, "~{~k~C", p); + keep_c = g->keep_count; + } else { + writef(g, "~M~C", p); + } + p = p->left; + while (p) { + generate(g, p); + if (keep_c && p->right != 0) { + w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); + } + p = p->right; + } + if (keep_c) w(g, "~}"); +} + +static void generate_or(struct generator * g, struct node * p) { + int keep_c = 0; + + int used = g->label_used; + int a0 = g->failure_label; + int a1 = g->failure_keep_count; + + int out_lab = new_label(g); + + if (K_needed(g, p->left)) { + writef(g, "~{~k~C", p); + keep_c = g->keep_count; + } else { + writef(g, "~M~C", p); + } + p = p->left; + g->failure_keep_count = 0; + while (p->right) { + g->failure_label = new_label(g); + g->label_used = 0; + generate(g, p); + wgotol(g, out_lab); + if (g->label_used) + wsetl(g, g->failure_label); + if (keep_c) { + w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); + } + p = p->right; + } + g->label_used 
= used; + g->failure_label = a0; + g->failure_keep_count = a1; + + generate(g, p); + if (keep_c) w(g, "~}"); + wsetl(g, out_lab); +} + +static void generate_backwards(struct generator * g, struct node * p) { + + writef(g, "~Mz->lb = z->c; z->c = z->l;~C~N", p); + generate(g, p->left); + w(g, "~Mz->c = z->lb;~N"); +} + + +static void generate_not(struct generator * g, struct node * p) { + int keep_c = 0; + + int used = g->label_used; + int a0 = g->failure_label; + int a1 = g->failure_keep_count; + + if (K_needed(g, p->left)) { + writef(g, "~{~k~C", p); + keep_c = g->keep_count; + } else { + writef(g, "~M~C", p); + } + + g->failure_label = new_label(g); + g->label_used = 0; + g->failure_keep_count = 0; + generate(g, p->left); + + { + int l = g->failure_label; + int u = g->label_used; + + g->label_used = used; + g->failure_label = a0; + g->failure_keep_count = a1; + + writef(g, "~M~f~N", p); + if (u) + wsetl(g, l); + } + if (keep_c) { + w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N~}"); + } +} + + +static void generate_try(struct generator * g, struct node * p) { + int keep_c = 0; + if (K_needed(g, p->left)) { + writef(g, "~{~k~C", p); + keep_c = g->keep_count; + } else { + writef(g, "~M~C", p); + } + g->failure_keep_count = keep_c; + + g->failure_label = new_label(g); + g->label_used = 0; + generate(g, p->left); + + if (g->label_used) + wsetl(g, g->failure_label); + + if (keep_c) w(g, "~}"); +} + +static void generate_set(struct generator * g, struct node * p) { + g->V[0] = p->name; writef(g, "~M~V0 = 1;~C", p); +} + +static void generate_unset(struct generator * g, struct node * p) { + g->V[0] = p->name; writef(g, "~M~V0 = 0;~C", p); +} + +static void generate_fail(struct generator * g, struct node * p) { + generate(g, p->left); + writef(g, "~M~f~C", p); +} + +/* generate_test() also implements 'reverse' */ + +static void generate_test(struct generator * g, struct node * p) { + int keep_c = 0; + if (K_needed(g, p->left)) { + keep_c = ++g->keep_count; + w(g, p->mode 
== m_forward ? "~{int c_test" : + "~{int m_test"); + write_int(g, keep_c); + w(g, p->mode == m_forward ? " = z->c;" : + " = z->l - z->c;"); + writef(g, "~C", p); + } else writef(g, "~M~C", p); + + generate(g, p->left); + + if (keep_c) { + w(g, p->mode == m_forward ? "~Mz->c = c_test" : + "~Mz->c = z->l - m_test"); + write_int(g, keep_c); + writef(g, ";~N~}", p); + } +} + +static void generate_do(struct generator * g, struct node * p) { + int keep_c = 0; + if (K_needed(g, p->left)) { + writef(g, "~{~k~C", p); + keep_c = g->keep_count; + } else { + writef(g, "~M~C", p); + } + + if (p->left->type == c_call) { + /* Optimise do <call> */ + g->V[0] = p->left->name; + writef(g, "~{int ret = ~V0(z);~C", p->left); + w(g, "~Mif (ret < 0) return ret;~N~}"); + } else { + g->failure_label = new_label(g); + g->label_used = 0; + g->failure_keep_count = 0; + generate(g, p->left); + + if (g->label_used) + wsetl(g, g->failure_label); + } + if (keep_c) { + w(g, "~M"); wrestore(g, p, keep_c); + w(g, "~N~}"); + } +} + +static void generate_next(struct generator * g, struct node * p) { + if (g->options->encoding == ENC_UTF8) { + if (p->mode == m_forward) + w(g, "~{int ret = skip_utf8(z->p, z->c, 0, z->l, 1"); + else + w(g, "~{int ret = skip_utf8(z->p, z->c, z->lb, 0, -1"); + writef(g, ");~N" + "~Mif (ret < 0) ~f~N" + "~Mz->c = ret;~C" + "~}", p); + } else + writef(g, "~M~l~N" + "~M~i~C", p); +} + +static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) { + + struct grouping * q = p->name->grouping; + g->S[0] = p->mode == m_forward ? "" : "_b"; + g->S[1] = complement ? "in" : "out"; + g->S[2] = g->options->encoding == ENC_UTF8 ? 
"_U" : ""; + g->V[0] = p->name; + g->I[0] = q->smallest_ch; + g->I[1] = q->largest_ch; + if (is_goto) { + writef(g, "~Mif (~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 1) < 0) ~f~C", p); + } else { + writef(g, "~{~C" + "~Mint ret = ~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 1);~N" + "~Mif (ret < 0) ~f~N", p); + if (p->mode == m_forward) + w(g, "~Mz->c += ret;~N"); + else + w(g, "~Mz->c -= ret;~N"); + w(g, "~}"); + } +} + +static void generate_GO(struct generator * g, struct node * p, int style) { + int keep_c = 0; + + int used = g->label_used; + int a0 = g->failure_label; + int a1 = g->failure_keep_count; + + if (p->left->type == c_grouping || p->left->type == c_non) { + /* Special case for "goto" or "gopast" when used on a grouping or an + * inverted grouping - the movement of c by the matching action is + * exactly what we want! */ +#ifdef OPTIMISATION_WARNINGS + printf("Optimising %s %s\n", style ? "goto" : "gopast", p->left->type == c_non ? "non" : "grouping"); +#endif + if (g->options->comments) { + writef(g, "~M~C", p); + } + generate_GO_grouping(g, p->left, style, p->left->type == c_non); + return; + } + + w(g, "~Mwhile(1) {"); writef(g, "~C~+", p); + + if (style == 1 || repeat_restore(g, p->left)) { + writef(g, "~M~k~N", p); + keep_c = g->keep_count; + } + + g->failure_label = new_label(g); + g->label_used = 0; + generate(g, p->left); + + if (style == 1) { + /* include for goto; omit for gopast */ + w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); + } + w(g, "~Mbreak;~N"); + if (g->label_used) + wsetl(g, g->failure_label); + if (keep_c) { + w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); + } + + g->label_used = used; + g->failure_label = a0; + g->failure_keep_count = a1; + +/* writef(g, "~M~l~N" + "~M~i~N", p); */ + generate_next(g, p); + w(g, "~}"); +} + +static void generate_loop(struct generator * g, struct node * p) { + w(g, "~{int i; for (i = "); generate_AE(g, p->AE); writef(g, "; i > 0; i--)~C" + "~{", p); + + generate(g, p->left); + + w(g, "~}" + "~}"); +} + 
+static void generate_repeat_or_atleast(struct generator * g, struct node * p, int atleast_case) { + int keep_c = 0; + if (atleast_case) { + writef(g, "~Mwhile(1) {~+~N", p); + } else { + writef(g, "~Mwhile(1) {~+~C", p); + } + + if (repeat_restore(g, p->left)) { + writef(g, "~M~k~N", p); + keep_c = g->keep_count; + } + + g->failure_label = new_label(g); + g->label_used = 0; + g->failure_keep_count = 0; + generate(g, p->left); + + if (atleast_case) w(g, "~Mi--;~N"); + + w(g, "~Mcontinue;~N"); + if (g->label_used) + wsetl(g, g->failure_label); + + if (keep_c) { + w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); + } + + w(g, "~Mbreak;~N" + "~}"); +} + +static void generate_repeat(struct generator * g, struct node * p) { + generate_repeat_or_atleast(g, p, false); +} + +static void generate_atleast(struct generator * g, struct node * p) { + w(g, "~{int i = "); generate_AE(g, p->AE); w(g, ";~C"); + { + int used = g->label_used; + int a0 = g->failure_label; + int a1 = g->failure_keep_count; + + generate_repeat_or_atleast(g, p, true); + + g->label_used = used; + g->failure_label = a0; + g->failure_keep_count = a1; + } + writef(g, "~Mif (i > 0) ~f~N" + "~}", p); +} + +static void generate_setmark(struct generator * g, struct node * p) { + g->V[0] = p->name; + writef(g, "~M~V0 = z->c;~C", p); +} + +static void generate_tomark(struct generator * g, struct node * p) { + g->S[0] = p->mode == m_forward ? ">" : "<"; + + w(g, "~Mif (z->c ~S0 "); generate_AE(g, p->AE); writef(g, ") ~f~N", p); + w(g, "~Mz->c = "); generate_AE(g, p->AE); writef(g, ";~C", p); +} + +static void generate_atmark(struct generator * g, struct node * p) { + + w(g, "~Mif (z->c != "); generate_AE(g, p->AE); writef(g, ") ~f~C", p); +} + +static void generate_hop(struct generator * g, struct node * p) { + g->S[0] = p->mode == m_forward ? "+" : "-"; + g->S[1] = p->mode == m_forward ? 
"0" : "z->lb"; + if (g->options->encoding == ENC_UTF8) { + w(g, "~{int ret = skip_utf8(z->p, z->c, ~S1, z->l, ~S0 "); + generate_AE(g, p->AE); writef(g, ");~C", p); + writef(g, "~Mif (ret < 0) ~f~N", p); + } else { + w(g, "~{int ret = z->c ~S0 "); + generate_AE(g, p->AE); writef(g, ";~C", p); + writef(g, "~Mif (~S1 > ret || ret > z->l) ~f~N", p); + } + writef(g, "~Mz->c = ret;~N" + "~}", p); +} + +static void generate_delete(struct generator * g, struct node * p) { + writef(g, "~{int ret = slice_del(z);~C", p); + writef(g, "~Mif (ret < 0) return ret;~N" + "~}", p); +} + +static void generate_tolimit(struct generator * g, struct node * p) { + g->S[0] = p->mode == m_forward ? "" : "b"; + writef(g, "~Mz->c = z->l~S0;~C", p); +} + +static void generate_atlimit(struct generator * g, struct node * p) { + g->S[0] = p->mode == m_forward ? "" : "b"; + g->S[1] = p->mode == m_forward ? "<" : ">"; + writef(g, "~Mif (z->c ~S1 z->l~S0) ~f~C", p); +} + +static void generate_leftslice(struct generator * g, struct node * p) { + g->S[0] = p->mode == m_forward ? "bra" : "ket"; + writef(g, "~Mz->~S0 = z->c;~C", p); +} + +static void generate_rightslice(struct generator * g, struct node * p) { + g->S[0] = p->mode == m_forward ? 
"ket" : "bra"; + writef(g, "~Mz->~S0 = z->c;~C", p); +} + +static void generate_assignto(struct generator * g, struct node * p) { + g->V[0] = p->name; + writef(g, "~M~V0 = assign_to(z, ~V0);~C" + "~Mif (~V0 == 0) return -1;~C", p); +} + +static void generate_sliceto(struct generator * g, struct node * p) { + g->V[0] = p->name; + writef(g, "~M~V0 = slice_to(z, ~V0);~C" + "~Mif (~V0 == 0) return -1;~N", p); +} + +static void generate_insert(struct generator * g, struct node * p, int style) { + + int keep_c = style == c_attach; + if (p->mode == m_backward) keep_c = !keep_c; + writef(g, "~{int ret;~N", p); + if (keep_c) w(g, "~{int saved_c = z->c;~N"); + writef(g, "~Mret = insert_~$(z, z->c, z->c, ~a);~C", p); + if (keep_c) w(g, "~Mz->c = saved_c;~N~}"); + writef(g, "~Mif (ret < 0) return ret;~N" + "~}", p); +} + +static void generate_assignfrom(struct generator * g, struct node * p) { + + int keep_c = p->mode == m_forward; /* like 'attach' */ + writef(g, "~{int ret;~N", p); + if (keep_c) writef(g, "~{int saved_c = z->c;~N", p); + w(g, "~Mret = "); + writef(g, keep_c ? "insert_~$(z, z->c, z->l, ~a);~C" : "insert_~$(z, z->lb, z->c, ~a);~C", p); + if (keep_c) w(g, "~Mz->c = saved_c;~N~}"); + writef(g, "~Mif (ret < 0) return ret;~N" + "~}", p); +} + +/* bugs marked <======= fixed 22/7/02. Similar fixes required for Java */ + +static void generate_slicefrom(struct generator * g, struct node * p) { + +/* w(g, "~Mslice_from_s(z, "); <============= bug! should be: */ + writef(g, "~{int ret = slice_from_~$(z, ~a);~C", p); + writef(g, "~Mif (ret < 0) return ret;~N" + "~}", p); +} + +static void generate_setlimit(struct generator * g, struct node * p) { + int keep_c; + if (p->left && p->left->type == c_tomark) { + /* Special case for: + * + * setlimit tomark AE for C + * + * All uses of setlimit in the current stemmers we ship follow this + * pattern, and by special-casing we can avoid having to save and + * restore c. 
+ */ + struct node * q = p->left; + + ++g->keep_count; + writef(g, "~N~{int mlimit", p); + write_int(g, g->keep_count); + writef(g, ";~C", p); + keep_c = g->keep_count; + + g->S[0] = q->mode == m_forward ? ">" : "<"; + + w(g, "~Mif (z->c ~S0 "); generate_AE(g, q->AE); writef(g, ") ~f~N", q); + w(g, "~Mmlimit"); + write_int(g, keep_c); + if (p->mode == m_forward) { + w(g, " = z->l - z->c; z->l = "); + } else { + w(g, " = z->lb; z->lb = "); + } + generate_AE(g, q->AE); + w(g, ";~N"); + } else { + writef(g, "~{~K~C", p); + keep_c = g->keep_count; + generate(g, p->left); + + w(g, "~Mmlimit"); + write_int(g, keep_c); + if (p->mode == m_forward) + w(g, " = z->l - z->c; z->l = z->c;~N"); + else + w(g, " = z->lb; z->lb = z->c;~N"); + w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); + } + + g->failure_keep_count = -keep_c; + generate(g, p->aux); + w(g, "~M"); + wrestorelimit(g, p, -g->failure_keep_count); + w(g, "~N" + "~}"); +} + +/* dollar sets snowball up to operate on a string variable as if it were the + * current string */ +static void generate_dollar(struct generator * g, struct node * p) { + + int used = g->label_used; + int a0 = g->failure_label; + int a1 = g->failure_keep_count; + int keep_token; + g->failure_label = new_label(g); + g->label_used = 0; + g->failure_keep_count = 0; + + keep_token = ++g->keep_count; + g->I[0] = keep_token; + writef(g, "~{struct SN_env env~I0 = * z;~C", p); + g->V[0] = p->name; + /* Assume failure. */ + writef(g, "~Mint failure = 1;~N" + "~Mz->p = ~V0;~N" + "~Mz->lb = z->c = 0;~N" + "~Mz->l = SIZE(z->p);~N", p); + generate(g, p->left); + /* Mark success. 
*/ + w(g, "~Mfailure = 0;~N"); + if (g->label_used) + wsetl(g, g->failure_label); + g->V[0] = p->name; /* necessary */ + + g->label_used = used; + g->failure_label = a0; + g->failure_keep_count = a1; + + g->I[0] = keep_token; + writef(g, "~M~V0 = z->p;~N" + "~M* z = env~I0;~N" + "~Mif (failure) ~f~N~}", p); +} + +static void generate_integer_assign(struct generator * g, struct node * p, char * s) { + + g->V[0] = p->name; + g->S[0] = s; + w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); writef(g, ";~C", p); +} + +static void generate_integer_test(struct generator * g, struct node * p, char * s) { + + w(g, "~Mif (!("); + generate_AE(g, p->left); + write_char(g, ' '); + write_string(g, s); + write_char(g, ' '); + generate_AE(g, p->AE); + writef(g, ")) ~f~C", p); +} + +static void generate_call(struct generator * g, struct node * p) { + + g->V[0] = p->name; + writef(g, "~{int ret = ~V0(z);~C", p); + if (g->failure_keep_count == 0 && g->failure_label == x_return) { + /* Combine the two tests in this special case for better optimisation + * and clearer generated code. */ + writef(g, "~Mif (ret <= 0) return ret;~N~}", p); + } else { + writef(g, "~Mif (ret == 0) ~f~N" + "~Mif (ret < 0) return ret;~N~}", p); + } +} + +static void generate_grouping(struct generator * g, struct node * p, int complement) { + + struct grouping * q = p->name->grouping; + g->S[0] = p->mode == m_forward ? "" : "_b"; + g->S[1] = complement ? "out" : "in"; + g->S[2] = g->options->encoding == ENC_UTF8 ? "_U" : ""; + g->V[0] = p->name; + g->I[0] = q->smallest_ch; + g->I[1] = q->largest_ch; + writef(g, "~Mif (~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 0)) ~f~C", p); +} + +static void generate_namedstring(struct generator * g, struct node * p) { + + g->S[0] = p->mode == m_forward ? 
"" : "_b"; + g->V[0] = p->name; + writef(g, "~Mif (!(eq_v~S0(z, ~V0))) ~f~C", p); +} + +static void generate_literalstring(struct generator * g, struct node * p) { + symbol * b = p->literalstring; + if (SIZE(b) == 1) { + /* It's quite common to compare with a single character literal string, + * so just inline the simpler code for this case rather than making a + * function call. In UTF-8 mode, only do this for the ASCII subset, + * since multi-byte characters are more complex to test against. + */ + if (g->options->encoding == ENC_UTF8 && *b >= 128) { + printf("single byte %d\n", *b); + exit(1); + } + g->I[0] = *b; + if (p->mode == m_forward) { + writef(g, "~Mif (z->c == z->l || z->p[z->c] != ~c0) ~f~C" + "~Mz->c++;~N", p); + } else { + writef(g, "~Mif (z->c <= z->lb || z->p[z->c - 1] != ~c0) ~f~C" + "~Mz->c--;~N", p); + } + } else { + g->S[0] = p->mode == m_forward ? "" : "_b"; + g->I[0] = SIZE(b); + g->L[0] = b; + + writef(g, "~Mif (!(eq_s~S0(z, ~I0, ~L0))) ~f~C", p); + } +} + +static void generate_define(struct generator * g, struct node * p) { + struct name * q = p->name; + g->next_label = 0; + + g->S[0] = q->type == t_routine ? "static" : "extern"; + g->V[0] = q; + + w(g, "~N~S0 int ~V0(struct SN_env * z) {"); + if (g->options->comments) { + write_string(g, p->mode == m_forward ? " /* forwardmode */" : " /* backwardmode */"); + } + w(g, "~N~+"); + if (p->amongvar_needed) w(g, "~Mint among_var;~N"); + g->failure_keep_count = 0; + g->failure_label = x_return; + g->label_used = 0; + g->keep_count = 0; + generate(g, p->left); + w(g, "~Mreturn 1;~N~}"); +} + +static void generate_substring(struct generator * g, struct node * p) { + + struct among * x = p->among; + int block = -1; + unsigned int bitmap = 0; + struct amongvec * among_cases = x->b; + int c; + int empty_case = -1; + int n_cases = 0; + symbol cases[2]; + int shortest_size = INT_MAX; + int shown_comment = 0; + + g->S[0] = p->mode == m_forward ? 
"" : "_b"; + g->I[0] = x->number; + g->I[1] = x->literalstring_count; + + /* In forward mode with non-ASCII UTF-8 characters, the first character + * of the string will often be the same, so instead look at the last + * common character position. + * + * In backward mode, we can't match if there are fewer characters before + * the current position than the minimum length. + */ + for (c = 0; c < x->literalstring_count; ++c) { + int size = among_cases[c].size; + if (size != 0 && size < shortest_size) { + shortest_size = size; + } + } + + for (c = 0; c < x->literalstring_count; ++c) { + symbol ch; + if (among_cases[c].size == 0) { + empty_case = c; + continue; + } + if (p->mode == m_forward) { + ch = among_cases[c].b[shortest_size - 1]; + } else { + ch = among_cases[c].b[among_cases[c].size - 1]; + } + if (n_cases == 0) { + block = ch >> 5; + } else if (ch >> 5 != block) { + block = -1; + if (n_cases > 2) break; + } + if (block == -1) { + if (n_cases > 0 && ch == cases[0]) continue; + if (n_cases < 2) { + cases[n_cases++] = ch; + } else if (ch != cases[1]) { + ++n_cases; + break; + } + } else { + if ((bitmap & (1u << (ch & 0x1f))) == 0) { + bitmap |= 1u << (ch & 0x1f); + if (n_cases < 2) + cases[n_cases] = ch; + ++n_cases; + } + } + } + + if (block != -1 || n_cases <= 2) { + char buf[64]; + g->I[2] = block; + g->I[3] = bitmap; + g->I[4] = shortest_size - 1; + if (p->mode == m_forward) { + sprintf(buf, "z->p[z->c + %d]", shortest_size - 1); + g->S[1] = buf; + if (shortest_size == 1) { + writef(g, "~Mif (z->c >= z->l", p); + } else { + writef(g, "~Mif (z->c + ~I4 >= z->l", p); + } + } else { + g->S[1] = "z->p[z->c - 1]"; + if (shortest_size == 1) { + writef(g, "~Mif (z->c <= z->lb", p); + } else { + writef(g, "~Mif (z->c - ~I4 <= z->lb", p); + } + } + if (n_cases == 0) { + /* We get this for the degenerate case: among { '' } + * This doesn't seem to be a useful construct, but it is + * syntactically valid. 
+ */ + } else if (n_cases == 1) { + g->I[4] = cases[0]; + writef(g, " || ~S1 != ~I4", p); + } else if (n_cases == 2) { + g->I[4] = cases[0]; + g->I[5] = cases[1]; + writef(g, " || (~S1 != ~I4 && ~S1 != ~I5)", p); + } else { + writef(g, " || ~S1 >> 5 != ~I2 || !((~I3 >> (~S1 & 0x1f)) & 1)", p); + } + write_string(g, ") "); + if (empty_case != -1) { + /* If the among includes the empty string, it can never fail + * so not matching the bitmap means we match the empty string. + */ + g->I[4] = among_cases[empty_case].result; + writef(g, "among_var = ~I4; else~C", p); + } else { + writef(g, "~f~C", p); + } + shown_comment = 1; + } else { +#ifdef OPTIMISATION_WARNINGS + printf("Couldn't shortcut among %d\n", x->number); +#endif + } + + if (!x->amongvar_needed) { + writef(g, "~Mif (!(find_among~S0(z, a_~I0, ~I1))) ~f", p); + writef(g, shown_comment ? "~N" : "~C", p); + } else { + writef(g, "~Mamong_var = find_among~S0(z, a_~I0, ~I1);", p); + writef(g, shown_comment ? "~N" : "~C", p); + writef(g, "~Mif (!(among_var)) ~f~N", p); + } +} + +static void generate_among(struct generator * g, struct node * p) { + + struct among * x = p->among; + + if (x->substring == 0) generate_substring(g, p); + + if (x->starter != 0) generate(g, x->starter); + + if (x->command_count == 1 && x->nocommand_count == 0) { + /* Only one outcome ("no match" already handled). 
*/ + generate(g, x->commands[0]); + } else if (x->command_count > 0) { + int i; + writef(g, "~Mswitch (among_var) {~C~+", p); + for (i = 1; i <= x->command_count; i++) { + g->I[0] = i; + w(g, "~Mcase ~I0:~N~+"); + generate(g, x->commands[i - 1]); + w(g, "~Mbreak;~N~-"); + } + w(g, "~}"); + } +} + +static void generate_booltest(struct generator * g, struct node * p) { + + g->V[0] = p->name; + writef(g, "~Mif (!(~V0)) ~f~C", p); +} + +static void generate_false(struct generator * g, struct node * p) { + + writef(g, "~M~f~C", p); +} + +static void generate_debug(struct generator * g, struct node * p) { + + g->I[0] = g->debug_count++; + g->I[1] = p->line_number; + writef(g, "~Mdebug(z, ~I0, ~I1);~C", p); + +} + +static void generate(struct generator * g, struct node * p) { + + int used = g->label_used; + int a0 = g->failure_label; + int a1 = g->failure_keep_count; + + switch (p->type) { + case c_define: generate_define(g, p); break; + case c_bra: generate_bra(g, p); break; + case c_and: generate_and(g, p); break; + case c_or: generate_or(g, p); break; + case c_backwards: generate_backwards(g, p); break; + case c_not: generate_not(g, p); break; + case c_set: generate_set(g, p); break; + case c_unset: generate_unset(g, p); break; + case c_try: generate_try(g, p); break; + case c_fail: generate_fail(g, p); break; + case c_reverse: + case c_test: generate_test(g, p); break; + case c_do: generate_do(g, p); break; + case c_goto: generate_GO(g, p, 1); break; + case c_gopast: generate_GO(g, p, 0); break; + case c_repeat: generate_repeat(g, p); break; + case c_loop: generate_loop(g, p); break; + case c_atleast: generate_atleast(g, p); break; + case c_setmark: generate_setmark(g, p); break; + case c_tomark: generate_tomark(g, p); break; + case c_atmark: generate_atmark(g, p); break; + case c_hop: generate_hop(g, p); break; + case c_delete: generate_delete(g, p); break; + case c_next: generate_next(g, p); break; + case c_tolimit: generate_tolimit(g, p); break; + case c_atlimit: 
generate_atlimit(g, p); break; + case c_leftslice: generate_leftslice(g, p); break; + case c_rightslice: generate_rightslice(g, p); break; + case c_assignto: generate_assignto(g, p); break; + case c_sliceto: generate_sliceto(g, p); break; + case c_assign: generate_assignfrom(g, p); break; + case c_insert: + case c_attach: generate_insert(g, p, p->type); break; + case c_slicefrom: generate_slicefrom(g, p); break; + case c_setlimit: generate_setlimit(g, p); break; + case c_dollar: generate_dollar(g, p); break; + case c_mathassign: generate_integer_assign(g, p, "="); break; + case c_plusassign: generate_integer_assign(g, p, "+="); break; + case c_minusassign: generate_integer_assign(g, p, "-="); break; + case c_multiplyassign:generate_integer_assign(g, p, "*="); break; + case c_divideassign: generate_integer_assign(g, p, "/="); break; + case c_eq: generate_integer_test(g, p, "=="); break; + case c_ne: generate_integer_test(g, p, "!="); break; + case c_gr: generate_integer_test(g, p, ">"); break; + case c_ge: generate_integer_test(g, p, ">="); break; + case c_ls: generate_integer_test(g, p, "<"); break; + case c_le: generate_integer_test(g, p, "<="); break; + case c_call: generate_call(g, p); break; + case c_grouping: generate_grouping(g, p, false); break; + case c_non: generate_grouping(g, p, true); break; + case c_name: generate_namedstring(g, p); break; + case c_literalstring: generate_literalstring(g, p); break; + case c_among: generate_among(g, p); break; + case c_substring: generate_substring(g, p); break; + case c_booltest: generate_booltest(g, p); break; + case c_false: generate_false(g, p); break; + case c_true: break; + case c_debug: generate_debug(g, p); break; + default: fprintf(stderr, "%d encountered\n", p->type); + exit(1); + } + + if (g->failure_label != a0) + g->label_used = used; + g->failure_label = a0; + g->failure_keep_count = a1; +} + +void write_generated_comment_content(struct generator * g) { + w(g, "Generated by Snowball " SNOWBALL_VERSION + " 
- https://snowballstem.org/"); +} + +void write_start_comment(struct generator * g, + const char * comment_start, + const char * comment_end) { + write_margin(g); + w(g, comment_start); + write_generated_comment_content(g); + if (comment_end) { + w(g, comment_end); + } + w(g, "~N~N"); +} + +static void generate_head(struct generator * g) { + + w(g, "#include \""); + if (g->options->runtime_path) { + write_string(g, g->options->runtime_path); + if (g->options->runtime_path[strlen(g->options->runtime_path) - 1] != '/') + write_char(g, '/'); + } + w(g, "header.h\"~N~N"); +} + +static void generate_routine_headers(struct generator * g) { + struct name * q; + for (q = g->analyser->names; q; q = q->next) { + g->V[0] = q; + switch (q->type) { + case t_routine: + w(g, "static int ~W0(struct SN_env * z);~N"); + break; + case t_external: + w(g, + "#ifdef __cplusplus~N" + "extern \"C\" {~N" + "#endif~N" + "extern int ~W0(struct SN_env * z);~N" + "#ifdef __cplusplus~N" + "}~N" + "#endif~N" + ); + break; + } + } +} + +static void generate_among_table(struct generator * g, struct among * x) { + + struct amongvec * v = x->b; + + g->I[0] = x->number; + { + int i; + for (i = 0; i < x->literalstring_count; i++) { + g->I[1] = i; + g->I[2] = v->size; + g->L[0] = v->b; + if (v->size) + w(g, "static const symbol s_~I0_~I1[~I2] = ~A0;~N"); + v++; + } + } + + g->I[1] = x->literalstring_count; + w(g, "~N~Mstatic const struct among a_~I0[~I1] =~N{~N"); + + v = x->b; + { + int i; + for (i = 0; i < x->literalstring_count; i++) { + g->I[1] = i; + g->I[2] = v->size; + g->I[3] = v->i; + g->I[4] = v->result; + g->S[0] = i < x->literalstring_count - 1 ? 
"," : ""; + + if (g->options->comments) { + w(g, "/*~J1 */ "); + } + w(g, "{ ~I2, "); + if (v->size == 0) { + w(g, "0,"); + } else { + w(g, "s_~I0_~I1,"); + } + w(g, " ~I3, ~I4, "); + if (v->function == 0) { + write_char(g, '0'); + } else { + write_varname(g, v->function); + } + w(g, "}~S0~N"); + v++; + } + } + w(g, "};~N~N"); +} + +static void generate_amongs(struct generator * g) { + struct among * x; + for (x = g->analyser->amongs; x; x = x->next) { + generate_among_table(g, x); + } +} + +static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; } + +static void generate_grouping_table(struct generator * g, struct grouping * q) { + + int range = q->largest_ch - q->smallest_ch + 1; + int size = (range + 7)/ 8; /* assume 8 bits per symbol */ + symbol * b = q->b; + symbol * map = create_b(size); + int i; + for (i = 0; i < size; i++) map[i] = 0; + + for (i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); + + g->V[0] = q->name; + + w(g, "static const unsigned char ~V0[] = { "); + for (i = 0; i < size; i++) { + write_int(g, map[i]); + if (i < size - 1) w(g, ", "); + } + w(g, " };~N~N"); + lose_b(map); +} + +static void generate_groupings(struct generator * g) { + struct grouping * q; + for (q = g->analyser->groupings; q; q = q->next) { + if (q->name->used) + generate_grouping_table(g, q); + } +} + +static void generate_create(struct generator * g) { + + int * p = g->analyser->name_count; + g->I[0] = p[t_string]; + g->I[1] = p[t_integer] + p[t_boolean]; + w(g, "~N" + "extern struct SN_env * ~pcreate_env(void) { return SN_create_env(~I0, ~I1); }" + "~N"); +} + +static void generate_close(struct generator * g) { + + int * p = g->analyser->name_count; + g->I[0] = p[t_string]; + w(g, "~Nextern void ~pclose_env(struct SN_env * z) { SN_close_env(z, ~I0); }~N~N"); +} + +static void generate_create_and_close_templates(struct generator * g) { + w(g, "~N" + "extern struct SN_env * ~pcreate_env(void);~N" + "extern void ~pclose_env(struct SN_env * z);~N" + "~N"); +} 
+ +static void generate_header_file(struct generator * g) { + + struct name * q; + const char * vp = g->options->variables_prefix; + g->S[0] = vp; + + w(g, "#ifdef __cplusplus~N" + "extern \"C\" {~N" + "#endif~N"); /* for C++ */ + + generate_create_and_close_templates(g); + for (q = g->analyser->names; q; q = q->next) { + g->V[0] = q; + switch (q->type) { + case t_external: + w(g, "extern int ~W0(struct SN_env * z);~N"); + break; + case t_string: + case t_integer: + case t_boolean: + if (vp) { + int count = q->count; + if (count < 0) { + /* Unused variables should get removed from `names`. */ + fprintf(stderr, "Optimised out variable "); + report_b(stderr, q->b); + fprintf(stderr, " still in names list\n"); + exit(1); + } + if (q->type == t_boolean) { + /* We use a single array for booleans and integers, + * with the integers first. + */ + count += g->analyser->name_count[t_integer]; + } + g->I[0] = count; + g->I[1] = "SIIrxg"[q->type]; + w(g, "#define ~S0"); + write_b(g, q->b); + w(g, " (~c1[~I0])~N"); + } + break; + } + } + + w(g, "~N" + "#ifdef __cplusplus~N" + "}~N" + "#endif~N"); /* for C++ */ + + w(g, "~N"); +} + +extern void generate_program_c(struct generator * g) { + + g->outbuf = str_new(); + write_start_comment(g, "/* ", " */"); + generate_head(g); + generate_routine_headers(g); + w(g, "#ifdef __cplusplus~N" + "extern \"C\" {~N" + "#endif~N" + "~N"); + generate_create_and_close_templates(g); + w(g, "~N" + "#ifdef __cplusplus~N" + "}~N" + "#endif~N"); + generate_amongs(g); + generate_groupings(g); + g->declarations = g->outbuf; + g->outbuf = str_new(); + g->literalstring_count = 0; + { + struct node * p = g->analyser->program; + while (p) { generate(g, p); p = p->right; } + } + generate_create(g); + generate_close(g); + output_str(g->options->output_src, g->declarations); + str_delete(g->declarations); + output_str(g->options->output_src, g->outbuf); + str_clear(g->outbuf); + + write_start_comment(g, "/* ", " */"); + generate_header_file(g); + 
output_str(g->options->output_h, g->outbuf); + str_delete(g->outbuf); +} + +/* Generator functions common to multiple languages. */ + +extern struct generator * create_generator(struct analyser * a, struct options * o) { + NEW(generator, g); + g->analyser = a; + g->options = o; + g->margin = 0; + g->debug_count = 0; + g->copy_from_count = 0; + g->line_count = 0; + g->line_labelled = 0; + g->failure_label = -1; + g->unreachable = false; +#ifndef DISABLE_PYTHON + g->max_label = 0; +#endif + return g; +} + +extern void close_generator(struct generator * g) { + FREE(g); +} + +/* Write routines for simple entities */ + +extern void write_char(struct generator * g, int ch) { + str_append_ch(g->outbuf, ch); /* character */ +} + +extern void write_newline(struct generator * g) { + str_append_ch(g->outbuf, '\n'); /* newline */ + g->line_count++; +} + +extern void write_string(struct generator * g, const char * s) { + str_append_string(g->outbuf, s); +} + +extern void write_int(struct generator * g, int i) { + str_append_int(g->outbuf, i); +} + +extern void write_b(struct generator * g, symbol * b) { + + str_append_b(g->outbuf, b); +} + +extern void write_str(struct generator * g, struct str * str) { + + str_append(g->outbuf, str); +} diff --git a/contrib/snowball/compiler/header.h b/contrib/snowball/compiler/header.h new file mode 100644 index 0000000..dadbee3 --- /dev/null +++ b/contrib/snowball/compiler/header.h @@ -0,0 +1,411 @@ +#include <stdio.h> + +#define SNOWBALL_VERSION "2.0.0" + +typedef unsigned char byte; +typedef unsigned short symbol; + +#define true 1 +#define false 0 + +#define MALLOC check_malloc +#define FREE check_free + +#define NEW(type, p) struct type * p = (struct type *) MALLOC(sizeof(struct type)) +#define NEWVEC(type, p, n) struct type * p = (struct type *) MALLOC(sizeof(struct type) * (n)) + +#define SIZE(p) ((int *)(p))[-1] +#define CAPACITY(p) ((int *)(p))[-2] + +extern symbol * create_b(int n); +extern void report_b(FILE * out, const symbol * 
p); +extern void lose_b(symbol * p); +extern symbol * increase_capacity(symbol * p, int n); +extern symbol * move_to_b(symbol * p, int n, const symbol * q); +extern symbol * add_to_b(symbol * p, int n, const symbol * q); +extern symbol * copy_b(const symbol * p); +extern char * b_to_s(const symbol * p); +extern symbol * add_s_to_b(symbol * p, const char * s); + +#define MOVE_TO_B(B, LIT) \ + move_to_b(B, sizeof(LIT) / sizeof(LIT[0]), LIT) + +struct str; /* defined in space.c */ + +extern struct str * str_new(void); +extern void str_delete(struct str * str); +extern void str_append(struct str * str, const struct str * add); +extern void str_append_ch(struct str * str, char add); +extern void str_append_b(struct str * str, const symbol * q); +extern void str_append_b_tail(struct str * str, const symbol * q, int skip); +extern void str_append_string(struct str * str, const char * s); +extern void str_append_int(struct str * str, int i); +extern void str_clear(struct str * str); +extern void str_assign(struct str * str, const char * s); +extern struct str * str_copy(const struct str * old); +extern symbol * str_data(const struct str * str); +extern int str_len(const struct str * str); +extern int str_back(const struct str *str); +extern int get_utf8(const symbol * p, int * slot); +extern int put_utf8(int ch, symbol * p); +extern void output_str(FILE * outfile, struct str * str); + +typedef enum { ENC_SINGLEBYTE, ENC_UTF8, ENC_WIDECHARS } enc; + +struct m_pair { + + struct m_pair * next; + symbol * name; + symbol * value; + +}; + +/* struct input must be a prefix of struct tokeniser. 
*/ +struct input { + + struct input * next; + symbol * p; + int c; + char * file; + int file_needs_freeing; + int line_number; + +}; + +struct include { + + struct include * next; + symbol * b; + +}; + +enum token_codes { + +#include "syswords2.h" + + c_mathassign, + c_name, + c_number, + c_literalstring, + c_neg, + c_call, + c_grouping, + c_booltest, + + NUM_TOKEN_CODES +}; + +enum uplus_modes { + UPLUS_NONE, + UPLUS_DEFINED, + UPLUS_UNICODE +}; + +/* struct input must be a prefix of struct tokeniser. */ +struct tokeniser { + + struct input * next; + symbol * p; + int c; + char * file; + int file_needs_freeing; + int line_number; + symbol * b; + symbol * b2; + int number; + int m_start; + int m_end; + struct m_pair * m_pairs; + int get_depth; + int error_count; + int token; + int previous_token; + byte token_held; + enc encoding; + + int omission; + struct include * includes; + + /* Mode in which U+ has been used: + * UPLUS_NONE - not used yet + * UPLUS_DEFINED - stringdef U+xxxx .... + * UPLUS_UNICODE - {U+xxxx} used with implicit meaning + */ + int uplusmode; + + char token_disabled[NUM_TOKEN_CODES]; +}; + +extern symbol * get_input(const char * filename); +extern struct tokeniser * create_tokeniser(symbol * b, char * file); +extern int read_token(struct tokeniser * t); +extern const char * name_of_token(int code); +extern void disable_token(struct tokeniser * t, int code); +extern void close_tokeniser(struct tokeniser * t); + +extern int space_count; +extern void * check_malloc(int n); +extern void check_free(void * p); + +struct node; + +struct name { + + struct name * next; + symbol * b; + int type; /* t_string etc */ + int mode; /* )_ for routines, externals */ + struct node * definition; /* ) */ + int count; /* 0, 1, 2 for each type */ + struct grouping * grouping; /* for grouping names */ + byte referenced; + byte used_in_among; /* Function used in among? */ + byte value_used; /* (For variables) is its value ever used? 
*/ + byte initialised; /* (For variables) is it ever initialised? */ + byte used_in_definition; /* (grouping) used in grouping definition? */ + struct node * used; /* First use, or NULL if not used */ + struct name * local_to; /* Local to one routine/external */ + int declaration_line_number;/* Line number of declaration */ + +}; + +struct literalstring { + + struct literalstring * next; + symbol * b; + +}; + +struct amongvec { + + symbol * b; /* the string giving the case */ + int size; /* - and its size */ + struct node * action; /* the corresponding action */ + int i; /* the amongvec index of the longest substring of b */ + int result; /* the numeric result for the case */ + int line_number; /* for diagnostics and stable sorting */ + struct name * function; + +}; + +struct among { + + struct among * next; + struct amongvec * b; /* pointer to the amongvec */ + int number; /* amongs are numbered 0, 1, 2 ... */ + int literalstring_count; /* in this among */ + int command_count; /* in this among */ + int nocommand_count; /* number of "no command" entries in this among */ + int function_count; /* in this among */ + int amongvar_needed; /* do we need to set among_var? */ + struct node * starter; /* i.e. among( (starter) 'string' ... ) */ + struct node * substring; /* i.e. substring ... among ( ... 
) */ + struct node ** commands; /* array with command_count entries */ +}; + +struct grouping { + + struct grouping * next; + symbol * b; /* the characters of this group */ + int largest_ch; /* character with max code */ + int smallest_ch; /* character with min code */ + struct name * name; /* so g->name->grouping == g */ + int line_number; +}; + +struct node { + + struct node * next; + struct node * left; + struct node * aux; /* used in setlimit */ + struct among * among; /* used in among */ + struct node * right; + int type; + int mode; + struct node * AE; + struct name * name; + symbol * literalstring; + int number; + int line_number; + int amongvar_needed; /* used in routine definitions */ +}; + +enum name_types { + + t_size = 6, + + t_string = 0, t_boolean = 1, t_integer = 2, t_routine = 3, t_external = 4, + t_grouping = 5 + +/* If this list is extended, adjust wvn in generator.c */ +}; + +/* In name_count[i] below, remember that + type is + ----+---- + 0 | string + 1 | boolean + 2 | integer + 3 | routine + 4 | external + 5 | grouping +*/ + +struct analyser { + + struct tokeniser * tokeniser; + struct node * nodes; + struct name * names; + struct literalstring * literalstrings; + int mode; + byte modifyable; /* false inside reverse(...) */ + struct node * program; + struct node * program_end; + int name_count[t_size]; /* name_count[i] counts the number of names of type i */ + struct among * amongs; + struct among * amongs_end; + int among_count; + int amongvar_needed; /* used in reading routine definitions */ + struct grouping * groupings; + struct grouping * groupings_end; + struct node * substring; /* pending 'substring' in current routine definition */ + enc encoding; + byte int_limits_used; /* are maxint or minint used? 
*/ +}; + +enum analyser_modes { + + m_forward = 0, m_backward /*, m_integer */ + +}; + +extern void print_program(struct analyser * a); +extern struct analyser * create_analyser(struct tokeniser * t); +extern void close_analyser(struct analyser * a); + +extern void read_program(struct analyser * a); + +struct generator { + + struct analyser * analyser; + struct options * options; + int unreachable; /* 0 if code can be reached, 1 if current code + * is unreachable. */ + int var_number; /* Number of next variable to use. */ + struct str * outbuf; /* temporary str to store output */ + struct str * declarations; /* str storing variable declarations */ + int next_label; +#ifndef DISABLE_PYTHON + int max_label; +#endif + int margin; + + /* if > 0, keep_count to restore in case of a failure; + * if < 0, the negated keep_count for the limit to restore in case of + * failure. */ + int failure_keep_count; +#if !defined(DISABLE_JAVA) && !defined(DISABLE_JS) && !defined(DISABLE_PYTHON) && !defined(DISABLE_CSHARP) + struct str * failure_str; /* This is used by some generators instead of failure_keep_count */ +#endif + + int label_used; /* Keep track of whether the failure label is used. */ + int failure_label; + int debug_count; + int copy_from_count; /* count of calls to copy_from() */ + + const char * S[10]; /* strings */ + symbol * B[10]; /* blocks */ + int I[10]; /* integers */ + struct name * V[5]; /* variables */ + symbol * L[5]; /* literals, used in formatted write */ + + int line_count; /* counts number of lines output */ + int line_labelled; /* in ISO C, will need extra ';' if it is a block end */ + int literalstring_count; + int keep_count; /* used to number keep/restore pairs to avoid compiler warnings + about shadowed variables */ +}; + +/* Special values for failure_label in struct generator. 
*/ +enum special_labels { + x_return = -1 +}; + +struct options { + + /* for the command line: */ + + const char * output_file; + const char * name; + FILE * output_src; + FILE * output_h; + byte syntax_tree; + byte comments; + enc encoding; + enum { LANG_JAVA, LANG_C, LANG_CPLUSPLUS, LANG_CSHARP, LANG_PASCAL, LANG_PYTHON, LANG_JAVASCRIPT, LANG_RUST, LANG_GO } make_lang; + const char * externals_prefix; + const char * variables_prefix; + const char * runtime_path; + const char * parent_class_name; + const char * package; + const char * go_snowball_runtime; + const char * string_class; + const char * among_class; + struct include * includes; + struct include * includes_end; +}; + +/* Generator functions common to several backends. */ + +extern struct generator * create_generator(struct analyser * a, struct options * o); +extern void close_generator(struct generator * g); + +extern void write_char(struct generator * g, int ch); +extern void write_newline(struct generator * g); +extern void write_string(struct generator * g, const char * s); +extern void write_int(struct generator * g, int i); +extern void write_b(struct generator * g, symbol * b); +extern void write_str(struct generator * g, struct str * str); + +extern void write_comment_content(struct generator * g, struct node * p); +extern void write_generated_comment_content(struct generator * g); +extern void write_start_comment(struct generator * g, + const char * comment_start, + const char * comment_end); + +extern int K_needed(struct generator * g, struct node * p); +extern int repeat_restore(struct generator * g, struct node * p); + +/* Generator for C code. */ +extern void generate_program_c(struct generator * g); + +#ifndef DISABLE_JAVA +/* Generator for Java code. */ +extern void generate_program_java(struct generator * g); +#endif + +#ifndef DISABLE_CSHARP +/* Generator for C# code. 
*/ +extern void generate_program_csharp(struct generator * g); +#endif + +#ifndef DISABLE_PASCAL +extern void generate_program_pascal(struct generator * g); +#endif + +#ifndef DISABLE_PYTHON +/* Generator for Python code. */ +extern void generate_program_python(struct generator * g); +#endif + +#ifndef DISABLE_JS +extern void generate_program_js(struct generator * g); +#endif + +#ifndef DISABLE_RUST +extern void generate_program_rust(struct generator * g); +#endif + +#ifndef DISABLE_GO +extern void generate_program_go(struct generator * g); +#endif diff --git a/contrib/snowball/compiler/space.c b/contrib/snowball/compiler/space.c new file mode 100644 index 0000000..5b05876 --- /dev/null +++ b/contrib/snowball/compiler/space.c @@ -0,0 +1,287 @@ + +#include <stdio.h> /* for printf */ +#include <stdlib.h> /* malloc, free */ +#include <string.h> /* memmove */ + +#include "header.h" + +#define HEAD 2*sizeof(int) +#define EXTENDER 40 + + +/* This modules provides a simple mechanism for arbitrary length writable + strings, called 'blocks'. They are 'symbol *' items rather than 'char *' + items however. + + The calls are: + + symbol * b = create_b(n); + - create an empty block b with room for n symbols + b = increase_capacity(b, n); + - increase the capacity of block b by n symbols (b may change) + b2 = copy_b(b) + - copy block b into b2 + lose_b(b); + - lose block b + b = move_to_b(b, n, p); + - set the data in b to be the n symbols at address p + b = add_to_b(b, n, p); + - add the n symbols at address p to the end of the data in b + SIZE(b) + - is the number of symbols in b + For example: + + symbol * b = create_b(0); + { int i; + char p[10]; + for (i = 0; i < 100; i++) { + sprintf(p, " %d", i); + add_s_to_b(b, p); + } + } + + and b contains " 0 1 2 ... 99" spaced out as symbols. +*/ + +/* For a block b, SIZE(b) is the number of symbols so far written into it, + CAPACITY(b) the total number it can contain, so SIZE(b) <= CAPACITY(b). 
+ In fact blocks have 1 extra character over the promised capacity so + they can be zero terminated by 'b[SIZE(b)] = 0;' without fear of + overwriting. +*/ + +extern symbol * create_b(int n) { + symbol * p = (symbol *) (HEAD + (char *) MALLOC(HEAD + (n + 1) * sizeof(symbol))); + CAPACITY(p) = n; + SIZE(p) = 0; + return p; +} + +extern void report_b(FILE * out, const symbol * p) { + int i; + for (i = 0; i < SIZE(p); i++) { + if (p[i] > 255) { + printf("In report_b, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]); + exit(1); + } + putc(p[i], out); + } +} + +extern void output_str(FILE * outfile, struct str * str) { + report_b(outfile, str_data(str)); +} + +extern void lose_b(symbol * p) { + if (p == 0) return; + FREE((char *) p - HEAD); +} + +extern symbol * increase_capacity(symbol * p, int n) { + symbol * q = create_b(CAPACITY(p) + n + EXTENDER); + memmove(q, p, CAPACITY(p) * sizeof(symbol)); + SIZE(q) = SIZE(p); + lose_b(p); return q; +} + +extern symbol * move_to_b(symbol * p, int n, const symbol * q) { + int x = n - CAPACITY(p); + if (x > 0) p = increase_capacity(p, x); + memmove(p, q, n * sizeof(symbol)); SIZE(p) = n; return p; +} + +extern symbol * add_to_b(symbol * p, int n, const symbol * q) { + int x = SIZE(p) + n - CAPACITY(p); + if (x > 0) p = increase_capacity(p, x); + memmove(p + SIZE(p), q, n * sizeof(symbol)); SIZE(p) += n; return p; +} + +extern symbol * copy_b(const symbol * p) { + int n = SIZE(p); + symbol * q = create_b(n); + move_to_b(q, n, p); + return q; +} + +int space_count = 0; + +extern void * check_malloc(int n) { + space_count++; + return malloc(n); +} + +extern void check_free(void * p) { + space_count--; + free(p); +} + +/* To convert a block to a zero terminated string: */ + +extern char * b_to_s(const symbol * p) { + int n = SIZE(p); + char * s = (char *)malloc(n + 1); + { + int i; + for (i = 0; i < n; i++) { + if (p[i] > 255) { + printf("In b_to_s, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]); + 
exit(1); + } + s[i] = (char)p[i]; + } + } + s[n] = 0; + return s; +} + +/* To add a zero terminated string to a block. If p = 0 the + block is created. */ + +extern symbol * add_s_to_b(symbol * p, const char * s) { + int n = strlen(s); + int k; + if (p == 0) p = create_b(n); + k = SIZE(p); + { + int x = k + n - CAPACITY(p); + if (x > 0) p = increase_capacity(p, x); + } + { + int i; + for (i = 0; i < n; i++) p[i + k] = s[i]; + } + SIZE(p) += n; + return p; +} + +/* The next section defines string handling capabilities in terms + of the lower level block handling capabilities of space.c */ +/* -------------------------------------------------------------*/ + +struct str { + symbol * data; +}; + +/* Create a new string. */ +extern struct str * str_new(void) { + + struct str * output = (struct str *) malloc(sizeof(struct str)); + output->data = create_b(0); + return output; +} + +/* Delete a string. */ +extern void str_delete(struct str * str) { + + lose_b(str->data); + free(str); +} + +/* Append a str to this str. */ +extern void str_append(struct str * str, const struct str * add) { + + symbol * q = add->data; + str->data = add_to_b(str->data, SIZE(q), q); +} + +/* Append a character to this str. */ +extern void str_append_ch(struct str * str, char add) { + + symbol q[1]; + q[0] = add; + str->data = add_to_b(str->data, 1, q); +} + +/* Append a low level block to a str. */ +extern void str_append_b(struct str * str, const symbol * q) { + + str->data = add_to_b(str->data, SIZE(q), q); +} + +/* Append the tail of a low level block to a str. */ +extern void str_append_b_tail(struct str * str, const symbol * q, int skip) { + if (skip < 0 || skip >= SIZE(q)) return; + + str->data = add_to_b(str->data, SIZE(q) - skip, q + skip); +} + +/* Append a (char *, null terminated) string to a str. */ +extern void str_append_string(struct str * str, const char * s) { + + str->data = add_s_to_b(str->data, s); +} + +/* Append an integer to a str. 
*/ +extern void str_append_int(struct str * str, int i) { + + char s[30]; + sprintf(s, "%d", i); + str_append_string(str, s); +} + +/* Clear a string */ +extern void str_clear(struct str * str) { + + SIZE(str->data) = 0; +} + +/* Set a string */ +extern void str_assign(struct str * str, const char * s) { + + str_clear(str); + str_append_string(str, s); +} + +/* Copy a string. */ +extern struct str * str_copy(const struct str * old) { + + struct str * newstr = str_new(); + str_append(newstr, old); + return newstr; +} + +/* Get the data stored in this str. */ +extern symbol * str_data(const struct str * str) { + + return str->data; +} + +/* Get the length of the str. */ +extern int str_len(const struct str * str) { + + return SIZE(str->data); +} + +/* Get the last character of the str. + * + * Or -1 if the string is empty. + */ +extern int str_back(const struct str *str) { + return SIZE(str->data) ? str->data[SIZE(str->data) - 1] : -1; +} + +extern int get_utf8(const symbol * p, int * slot) { + int b0, b1; + b0 = *p++; + if (b0 < 0xC0) { /* 1100 0000 */ + * slot = b0; return 1; + } + b1 = *p++; + if (b0 < 0xE0) { /* 1110 0000 */ + * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2; + } + * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (*p & 0x3F); return 3; +} + +extern int put_utf8(int ch, symbol * p) { + if (ch < 0x80) { + p[0] = ch; return 1; + } + if (ch < 0x800) { + p[0] = (ch >> 6) | 0xC0; + p[1] = (ch & 0x3F) | 0x80; return 2; + } + p[0] = (ch >> 12) | 0xE0; + p[1] = ((ch >> 6) & 0x3F) | 0x80; + p[2] = (ch & 0x3F) | 0x80; return 3; +} diff --git a/contrib/snowball/compiler/syswords.h b/contrib/snowball/compiler/syswords.h new file mode 100644 index 0000000..c401d3e --- /dev/null +++ b/contrib/snowball/compiler/syswords.h @@ -0,0 +1,86 @@ +static const struct system_word vocab[82+1] = { + { 0, (const byte *)"", 82+1}, + + { 1, (const byte *)"$", c_dollar }, + { 1, (const byte *)"(", c_bra }, + { 1, (const byte *)")", c_ket }, + { 1, (const byte *)"*", c_multiply 
}, + { 1, (const byte *)"+", c_plus }, + { 1, (const byte *)"-", c_minus }, + { 1, (const byte *)"/", c_divide }, + { 1, (const byte *)"<", c_ls }, + { 1, (const byte *)"=", c_assign }, + { 1, (const byte *)">", c_gr }, + { 1, (const byte *)"?", c_debug }, + { 1, (const byte *)"[", c_leftslice }, + { 1, (const byte *)"]", c_rightslice }, + { 2, (const byte *)"!=", c_ne }, + { 2, (const byte *)"*=", c_multiplyassign }, + { 2, (const byte *)"+=", c_plusassign }, + { 2, (const byte *)"-=", c_minusassign }, + { 2, (const byte *)"->", c_sliceto }, + { 2, (const byte *)"/*", c_comment2 }, + { 2, (const byte *)"//", c_comment1 }, + { 2, (const byte *)"/=", c_divideassign }, + { 2, (const byte *)"<+", c_insert }, + { 2, (const byte *)"<-", c_slicefrom }, + { 2, (const byte *)"<=", c_le }, + { 2, (const byte *)"==", c_eq }, + { 2, (const byte *)"=>", c_assignto }, + { 2, (const byte *)">=", c_ge }, + { 2, (const byte *)"as", c_as }, + { 2, (const byte *)"do", c_do }, + { 2, (const byte *)"or", c_or }, + { 3, (const byte *)"and", c_and }, + { 3, (const byte *)"for", c_for }, + { 3, (const byte *)"get", c_get }, + { 3, (const byte *)"hex", c_hex }, + { 3, (const byte *)"hop", c_hop }, + { 3, (const byte *)"len", c_len }, + { 3, (const byte *)"non", c_non }, + { 3, (const byte *)"not", c_not }, + { 3, (const byte *)"set", c_set }, + { 3, (const byte *)"try", c_try }, + { 4, (const byte *)"fail", c_fail }, + { 4, (const byte *)"goto", c_goto }, + { 4, (const byte *)"loop", c_loop }, + { 4, (const byte *)"next", c_next }, + { 4, (const byte *)"size", c_size }, + { 4, (const byte *)"test", c_test }, + { 4, (const byte *)"true", c_true }, + { 5, (const byte *)"among", c_among }, + { 5, (const byte *)"false", c_false }, + { 5, (const byte *)"lenof", c_lenof }, + { 5, (const byte *)"limit", c_limit }, + { 5, (const byte *)"unset", c_unset }, + { 6, (const byte *)"atmark", c_atmark }, + { 6, (const byte *)"attach", c_attach }, + { 6, (const byte *)"cursor", c_cursor }, + { 6, (const 
byte *)"define", c_define }, + { 6, (const byte *)"delete", c_delete }, + { 6, (const byte *)"gopast", c_gopast }, + { 6, (const byte *)"insert", c_insert }, + { 6, (const byte *)"maxint", c_maxint }, + { 6, (const byte *)"minint", c_minint }, + { 6, (const byte *)"repeat", c_repeat }, + { 6, (const byte *)"sizeof", c_sizeof }, + { 6, (const byte *)"tomark", c_tomark }, + { 7, (const byte *)"atleast", c_atleast }, + { 7, (const byte *)"atlimit", c_atlimit }, + { 7, (const byte *)"decimal", c_decimal }, + { 7, (const byte *)"reverse", c_reverse }, + { 7, (const byte *)"setmark", c_setmark }, + { 7, (const byte *)"strings", c_strings }, + { 7, (const byte *)"tolimit", c_tolimit }, + { 8, (const byte *)"booleans", c_booleans }, + { 8, (const byte *)"integers", c_integers }, + { 8, (const byte *)"routines", c_routines }, + { 8, (const byte *)"setlimit", c_setlimit }, + { 9, (const byte *)"backwards", c_backwards }, + { 9, (const byte *)"externals", c_externals }, + { 9, (const byte *)"groupings", c_groupings }, + { 9, (const byte *)"stringdef", c_stringdef }, + { 9, (const byte *)"substring", c_substring }, + { 12, (const byte *)"backwardmode", c_backwardmode }, + { 13, (const byte *)"stringescapes", c_stringescapes } +}; diff --git a/contrib/snowball/compiler/syswords2.h b/contrib/snowball/compiler/syswords2.h new file mode 100644 index 0000000..e853e32 --- /dev/null +++ b/contrib/snowball/compiler/syswords2.h @@ -0,0 +1,13 @@ + c_among = 4, c_and, c_as, c_assign, c_assignto, c_atleast, + c_atlimit, c_atmark, c_attach, c_backwardmode, c_backwards, + c_booleans, c_bra, c_comment1, c_comment2, c_cursor, c_debug, + c_decimal, c_define, c_delete, c_divide, c_divideassign, c_do, + c_dollar, c_eq, c_externals, c_fail, c_false, c_for, c_ge, c_get, + c_gopast, c_goto, c_gr, c_groupings, c_hex, c_hop, c_insert, + c_integers, c_ket, c_le, c_leftslice, c_len, c_lenof, c_limit, c_loop, + c_ls, c_maxint, c_minint, c_minus, c_minusassign, c_multiply, + c_multiplyassign, c_ne, 
c_next, c_non, c_not, c_or, c_plus, + c_plusassign, c_repeat, c_reverse, c_rightslice, c_routines, + c_set, c_setlimit, c_setmark, c_size, c_sizeof, c_slicefrom, + c_sliceto, c_stringdef, c_stringescapes, c_strings, c_substring, + c_test, c_tolimit, c_tomark, c_true, c_try, c_unset, diff --git a/contrib/snowball/compiler/tokeniser.c b/contrib/snowball/compiler/tokeniser.c new file mode 100644 index 0000000..e6c6386 --- /dev/null +++ b/contrib/snowball/compiler/tokeniser.c @@ -0,0 +1,567 @@ + +#include <stdio.h> /* stderr etc */ +#include <stdlib.h> /* malloc free */ +#include <string.h> /* strlen */ +#include <ctype.h> /* isalpha etc */ +#include "header.h" + +struct system_word { + int s_size; /* size of system word */ + const byte * s; /* pointer to the system word */ + int code; /* its internal code */ +}; + + +/* ASCII collating assumed in syswords.c */ + +#include "syswords.h" + +#define INITIAL_INPUT_BUFFER_SIZE 8192 + +static int hex_to_num(int ch); + +static int smaller(int a, int b) { return a < b ? a : b; } + +extern symbol * get_input(const char * filename) { + FILE * input = fopen(filename, "r"); + if (input == 0) { return 0; } + { + symbol * u = create_b(INITIAL_INPUT_BUFFER_SIZE); + int size = 0; + while (true) { + int ch = getc(input); + if (ch == EOF) break; + if (size >= CAPACITY(u)) u = increase_capacity(u, size); + u[size++] = ch; + } + fclose(input); + SIZE(u) = size; + return u; + } +} + +static void error(struct tokeniser * t, const char * s1, int n, symbol * p, const char * s2) { + if (t->error_count == 20) { fprintf(stderr, "... 
etc\n"); exit(1); } + fprintf(stderr, "%s:%d: ", t->file, t->line_number); + if (s1) fprintf(stderr, "%s", s1); + if (p) { + int i; + for (i = 0; i < n; i++) fprintf(stderr, "%c", p[i]); + } + if (s2) fprintf(stderr, "%s", s2); + fprintf(stderr, "\n"); + t->error_count++; +} + +static void error1(struct tokeniser * t, const char * s) { + error(t, s, 0,0, 0); +} + +static void error2(struct tokeniser * t, const char * s) { + error(t, "unexpected end of text after ", 0,0, s); +} + +static int compare_words(int m, symbol * p, int n, const byte * q) { + if (m != n) return m - n; + { + int i; for (i = 0; i < n; i++) { + int diff = p[i] - q[i]; + if (diff) return diff; + } + } + return 0; +} + +static int find_word(int n, symbol * p) { + int i = 0; int j = vocab->code; + do { + int k = i + (j - i)/2; + const struct system_word * w = vocab + k; + int diff = compare_words(n, p, w->s_size, w->s); + if (diff == 0) return w->code; + if (diff < 0) j = k; else i = k; + } while (j - i != 1); + return -1; +} + +static int get_number(int n, symbol * p) { + int x = 0; + int i; for (i = 0; i < n; i++) x = 10*x + p[i] - '0'; + return x; +} + +static int eq_s(struct tokeniser * t, const char * s) { + int l = strlen(s); + if (SIZE(t->p) - t->c < l) return false; + { + int i; + for (i = 0; i < l; i++) if (t->p[t->c + i] != s[i]) return false; + } + t->c += l; return true; +} + +static int white_space(struct tokeniser * t, int ch) { + switch (ch) { + case '\n': + t->line_number++; + /* fall through */ + case '\r': + case '\t': + case ' ': + return true; + } + return false; +} + +static symbol * find_in_m(struct tokeniser * t, int n, symbol * p) { + struct m_pair * q; + for (q = t->m_pairs; q; q = q->next) { + symbol * name = q->name; + if (n == SIZE(name) && memcmp(name, p, n * sizeof(symbol)) == 0) return q->value; + } + return 0; +} + +static int read_literal_string(struct tokeniser * t, int c) { + symbol * p = t->p; + int ch; + SIZE(t->b) = 0; + while (true) { + if (c >= SIZE(p)) { 
error2(t, "'"); return c; } + ch = p[c]; + if (ch == '\n') { error1(t, "string not terminated"); return c; } + c++; + if (ch == t->m_start) { + /* Inside insert characters. */ + int c0 = c; + int newlines = false; /* no newlines as yet */ + int black_found = false; /* no printing chars as yet */ + while (true) { + if (c >= SIZE(p)) { error2(t, "'"); return c; } + ch = p[c]; c++; + if (ch == t->m_end) break; + if (!white_space(t, ch)) black_found = true; + if (ch == '\n') newlines = true; + if (newlines && black_found) { + error1(t, "string not terminated"); + return c; + } + } + if (!newlines) { + int n = c - c0 - 1; /* macro size */ + int firstch = p[c0]; + symbol * q = find_in_m(t, n, p + c0); + if (q == 0) { + if (n == 1 && (firstch == '\'' || firstch == t->m_start)) + t->b = add_to_b(t->b, 1, p + c0); + else if (n >= 3 && firstch == 'U' && p[c0 + 1] == '+') { + int codepoint = 0; + int x; + if (t->uplusmode == UPLUS_DEFINED) { + /* See if found with xxxx upper-cased. */ + symbol * uc = create_b(n); + int i; + for (i = 0; i != n; ++i) { + uc[i] = toupper(p[c0 + i]); + } + q = find_in_m(t, n, uc); + lose_b(uc); + if (q != 0) { + t->b = add_to_b(t->b, SIZE(q), q); + continue; + } + error1(t, "Some U+xxxx stringdefs seen but not this one"); + } else { + t->uplusmode = UPLUS_UNICODE; + } + for (x = c0 + 2; x != c - 1; ++x) { + int hex = hex_to_num(p[x]); + if (hex < 0) { + error1(t, "Bad hex digit following U+"); + break; + } + codepoint = (codepoint << 4) | hex; + } + if (t->encoding == ENC_UTF8) { + if (codepoint < 0 || codepoint > 0x01ffff) { + error1(t, "character values exceed 0x01ffff"); + } + /* Ensure there's enough space for a max length + * UTF-8 sequence. 
*/ + if (CAPACITY(t->b) < SIZE(t->b) + 3) { + t->b = increase_capacity(t->b, 3); + } + SIZE(t->b) += put_utf8(codepoint, t->b + SIZE(t->b)); + } else { + symbol sym; + if (t->encoding == ENC_SINGLEBYTE) { + /* Only ISO-8859-1 is handled this way - for + * other single-byte character sets you need + * stringdef all the U+xxxx codes you use + * like - e.g.: + * + * stringdef U+0171 hex 'FB' + */ + if (codepoint < 0 || codepoint > 0xff) { + error1(t, "character values exceed 256"); + } + } else { + if (codepoint < 0 || codepoint > 0xffff) { + error1(t, "character values exceed 64K"); + } + } + sym = codepoint; + t->b = add_to_b(t->b, 1, &sym); + } + } else + error(t, "string macro '", n, p + c0, "' undeclared"); + } else + t->b = add_to_b(t->b, SIZE(q), q); + } + } else { + if (ch == '\'') return c; + if (ch < 0 || ch >= 0x80) { + if (t->encoding != ENC_WIDECHARS) { + /* We don't really want people using non-ASCII literal + * strings, but historically it's worked for single-byte + * and UTF-8 if the source encoding matches what the + * generated stemmer works in and it seems unfair to just + * suddenly make this a hard error.` + */ + fprintf(stderr, + "%s:%d: warning: Non-ASCII literal strings aren't " + "portable - use stringdef instead\n", + t->file, t->line_number); + } else { + error1(t, "Non-ASCII literal strings aren't " + "portable - use stringdef instead"); + } + } + t->b = add_to_b(t->b, 1, p + c - 1); + } + } +} + +static int next_token(struct tokeniser * t) { + symbol * p = t->p; + int c = t->c; + int ch; + int code = -1; + while (true) { + if (c >= SIZE(p)) { t->c = c; return -1; } + ch = p[c]; + if (white_space(t, ch)) { c++; continue; } + if (isalpha(ch)) { + int c0 = c; + while (c < SIZE(p) && (isalnum(p[c]) || p[c] == '_')) c++; + code = find_word(c - c0, p + c0); + if (code < 0 || t->token_disabled[code]) { + t->b = move_to_b(t->b, c - c0, p + c0); + code = c_name; + } + } else + if (isdigit(ch)) { + int c0 = c; + while (c < SIZE(p) && isdigit(p[c])) 
c++; + t->number = get_number(c - c0, p + c0); + code = c_number; + } else + if (ch == '\'') { + c = read_literal_string(t, c + 1); + code = c_literalstring; + } else + { + int lim = smaller(2, SIZE(p) - c); + int i; + for (i = lim; i > 0; i--) { + code = find_word(i, p + c); + if (code >= 0) { c += i; break; } + } + } + if (code >= 0) { + t->c = c; + return code; + } + error(t, "'", 1, p + c, "' unknown"); + c++; + continue; + } +} + +static int next_char(struct tokeniser * t) { + if (t->c >= SIZE(t->p)) return -1; + return t->p[t->c++]; +} + +static int next_real_char(struct tokeniser * t) { + while (true) { + int ch = next_char(t); + if (!white_space(t, ch)) return ch; + } +} + +static void read_chars(struct tokeniser * t) { + int ch = next_real_char(t); + if (ch < 0) { error2(t, "stringdef"); return; } + { + int c0 = t->c-1; + while (true) { + ch = next_char(t); + if (white_space(t, ch) || ch < 0) break; + } + t->b2 = move_to_b(t->b2, t->c - c0 - 1, t->p + c0); + } +} + +static int decimal_to_num(int ch) { + if ('0' <= ch && ch <= '9') return ch - '0'; + return -1; +} + +static int hex_to_num(int ch) { + if ('0' <= ch && ch <= '9') return ch - '0'; + if ('a' <= ch && ch <= 'f') return ch - 'a' + 10; + if ('A' <= ch && ch <= 'F') return ch - 'A' + 10; + return -1; +} + +static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) { + int c = 0; int d = 0; + while (true) { + while (c < SIZE(p) && p[c] == ' ') c++; + if (c == SIZE(p)) break; + { + int number = 0; + while (c != SIZE(p)) { + int ch = p[c]; + if (ch == ' ') break; + if (base == 10) { + ch = decimal_to_num(ch); + if (ch < 0) { + error1(t, "decimal string contains non-digits"); + return; + } + } else { + ch = hex_to_num(ch); + if (ch < 0) { + error1(t, "hex string contains non-hex characters"); + return; + } + } + number = base * number + ch; + c++; + } + if (t->encoding == ENC_SINGLEBYTE) { + if (number < 0 || number > 0xff) { + error1(t, "character values exceed 256"); + return; + } 
+ } else { + if (number < 0 || number > 0xffff) { + error1(t, "character values exceed 64K"); + return; + } + } + if (t->encoding == ENC_UTF8) + d += put_utf8(number, p + d); + else + p[d++] = number; + } + } + SIZE(p) = d; +} + +extern int read_token(struct tokeniser * t) { + symbol * p = t->p; + int held = t->token_held; + t->token_held = false; + if (held) return t->token; + while (true) { + int code = next_token(t); + switch (code) { + case c_comment1: /* slash-slash comment */ + while (t->c < SIZE(p) && p[t->c] != '\n') t->c++; + continue; + case c_comment2: /* slash-star comment */ + while (true) { + if (t->c >= SIZE(p)) { + error1(t, "/* comment not terminated"); + t->token = -1; + return -1; + } + if (p[t->c] == '\n') t->line_number++; + if (eq_s(t, "*/")) break; + t->c++; + } + continue; + case c_stringescapes: { + int ch1 = next_real_char(t); + int ch2 = next_real_char(t); + if (ch2 < 0) { + error2(t, "stringescapes"); + continue; + } + if (ch1 == '\'') { + error1(t, "first stringescape cannot be '"); + continue; + } + t->m_start = ch1; + t->m_end = ch2; + continue; + } + case c_stringdef: { + int base = 0; + read_chars(t); + code = read_token(t); + if (code == c_hex) { base = 16; code = read_token(t); } else + if (code == c_decimal) { base = 10; code = read_token(t); } + if (code != c_literalstring) { + error1(t, "string omitted after stringdef"); + continue; + } + if (base > 0) convert_numeric_string(t, t->b, base); + { NEW(m_pair, q); + q->next = t->m_pairs; + q->name = copy_b(t->b2); + q->value = copy_b(t->b); + t->m_pairs = q; + if (t->uplusmode != UPLUS_DEFINED && + (SIZE(t->b2) >= 3 && t->b2[0] == 'U' && t->b2[1] == '+')) { + if (t->uplusmode == UPLUS_UNICODE) { + error1(t, "U+xxxx already used with implicit meaning"); + } else { + t->uplusmode = UPLUS_DEFINED; + } + } + } + continue; + } + case c_get: + code = read_token(t); + if (code != c_literalstring) { + error1(t, "string omitted after get"); continue; + } + t->get_depth++; + if (t->get_depth 
> 10) { + fprintf(stderr, "get directives go 10 deep. Looping?\n"); + exit(1); + } + { + NEW(input, q); + char * file = b_to_s(t->b); + symbol * u = get_input(file); + if (u == 0) { + struct include * r; + for (r = t->includes; r; r = r->next) { + symbol * b = copy_b(r->b); + b = add_to_b(b, SIZE(t->b), t->b); + free(file); + file = b_to_s(b); + u = get_input(file); + lose_b(b); + if (u != 0) break; + } + } + if (u == 0) { + error(t, "Can't get '", SIZE(t->b), t->b, "'"); + exit(1); + } + memmove(q, t, sizeof(struct input)); + t->next = q; + t->p = u; + t->c = 0; + t->file = file; + t->file_needs_freeing = true; + t->line_number = 1; + } + p = t->p; + continue; + case -1: + if (t->next) { + lose_b(p); + { + struct input * q = t->next; + memmove(t, q, sizeof(struct input)); p = t->p; + FREE(q); + } + t->get_depth--; + continue; + } + /* fall through */ + default: + t->previous_token = t->token; + t->token = code; + return code; + } + } +} + /* Printable name for a token code: entries 1 .. vocab->code - 1 of the vocab table are searched first, then the fixed names for the special codes; a question mark string is returned if nothing matches. */ +extern const char * name_of_token(int code) { + int i; + for (i = 1; i < vocab->code; i++) + if ((vocab + i)->code == code) return (const char *)(vocab + i)->s; + switch (code) { + case c_mathassign: return "="; + case c_name: return "name"; + case c_number: return "number"; + case c_literalstring:return "literal"; + case c_neg: return "neg"; + case c_grouping: return "grouping"; + case c_call: return "call"; + case c_booltest: return "Boolean test"; + case -2: return "start of text"; + case -1: return "end of text"; + default: return "?"; + } +} + /* Flag a token code as disabled; code indexes t->token_disabled directly and is not range-checked here. */ +extern void disable_token(struct tokeniser * t, int code) { + t->token_disabled[code] = 1; +} + /* Allocate a tokeniser (via NEW) for the text in p and reset all of its state. file is stored without copying and file_needs_freeing starts false. */ +extern struct tokeniser * create_tokeniser(symbol * p, char * file) { + NEW(tokeniser, t); + t->next = 0; + t->p = p; + t->c = 0; + t->file = file; + t->file_needs_freeing = false; + t->line_number = 1; + t->b = create_b(0); + t->b2 = create_b(0); + t->m_start = -1; + t->m_pairs = 0; + t->get_depth = 0; + t->error_count = 0; + t->token_held = false; + t->token = -2; /* -2 is the code name_of_token reports as start of text */ + t->previous_token = 
-2; + t->uplusmode = UPLUS_NONE; + memset(t->token_disabled, 0, sizeof(t->token_disabled)); + return t; +} + /* Free a tokeniser: t->b and t->b2, every m_pair together with its name and value, the records chained from t->next, t->file when file_needs_freeing is set, and finally t itself. */ +extern void close_tokeniser(struct tokeniser * t) { + lose_b(t->b); + lose_b(t->b2); + { + struct m_pair * q = t->m_pairs; + while (q) { + struct m_pair * q_next = q->next; /* read the link before q is freed */ + lose_b(q->name); + lose_b(q->value); + FREE(q); + q = q_next; + } + } + { + struct input * q = t->next; + while (q) { + struct input * q_next = q->next; /* read the link before q is freed */ + FREE(q); + q = q_next; + } + } + if (t->file_needs_freeing) free(t->file); + FREE(t); +} diff --git a/contrib/snowball/doc/TODO b/contrib/snowball/doc/TODO new file mode 100644 index 0000000..0cfa1b1 --- /dev/null +++ b/contrib/snowball/doc/TODO @@ -0,0 +1,15 @@ +Things to do: + + - Write documentation for how to use libstemmer (as opposed to how stemming + algorithms themselves work). + Currently, the documentation in the include/libstemmer.h header file is + pretty clear and comprehensive, but an overview document wouldn't go amiss. + +Things that would be nice to include at some point. + + - Add version numbers to each stemming algorithm, and allow the interface to + request a specific version of the stemming algorithms. Default to providing + the latest version of the algorithm. + - Make mkmodules.pl generate the build system, instead of being called from it. + This would allow it to generate the list of modules to be built, so that it's + not necessary to change things in more than one place to add a new algorithm. diff --git a/contrib/snowball/doc/libstemmer_c_README b/contrib/snowball/doc/libstemmer_c_README new file mode 100644 index 0000000..9d3af8b --- /dev/null +++ b/contrib/snowball/doc/libstemmer_c_README @@ -0,0 +1,125 @@ +libstemmer_c +============ + +This document pertains to the C version of the libstemmer distribution, +available for download from: + +http://snowball.tartarus.org/dist/libstemmer_c.tgz + + +Compiling the library +===================== + +A simple makefile is provided for Unix style systems. 
On such systems, it +should be possible simply to run "make", and the file "libstemmer.o" +and the example program "stemwords" will be generated. + +If this doesn't work on your system, you need to write your own build +system (or call the compiler directly). The files to compile are +all contained in the "libstemmer", "runtime" and "src_c" directories, +and the public header file is contained in the "include" directory. + +The library comes in two flavours; UTF-8 only, and UTF-8 plus other character +sets. To use the utf-8 only flavour, compile "libstemmer_utf8.c" instead of +"libstemmer.c". + +For convenience "mkinc.mak" is a makefile fragment listing the source files and +header files used to compile the standard version of the library. +"mkinc_utf8.mak" is a comparable makefile fragment listing just the source +files for the UTF-8 only version of the library. + + +Using the library +================= + +The library provides a simple C API. Essentially, a new stemmer can +be obtained by using "sb_stemmer_new". "sb_stemmer_stem" is then +used to stem a word, "sb_stemmer_length" returns the stemmed +length of the last word processed, and "sb_stemmer_delete" is +used to delete a stemmer. + +Creating a stemmer is a relatively expensive operation - the expected +usage pattern is that a new stemmer is created when needed, used +to stem many words, and deleted after some time. + +Stemmers are re-entrant, but not threadsafe. In other words, if +you wish to access the same stemmer object from multiple threads, +you must ensure that all access is protected by a mutex or similar +device. + +libstemmer does not currently incorporate any mechanism for caching the results +of stemming operations. Such caching can greatly increase the performance of a +stemmer under certain situations, so suitable patches will be considered for +inclusion. + +The standard libstemmer sources contain an algorithm for each of the supported +languages. 
The algorithm may be selected using the English name of the +language, or using the 2 or 3 letter ISO 639 language codes. In addition, +the traditional "Porter" stemming algorithm for English is included for +backwards compatibility purposes, but we recommend use of the "English" +stemmer in preference for new projects. + +(Some minor algorithms which are included only as curiosities in the snowball +website, such as the Lovins stemmer and the Kraaij Pohlmann stemmer, are not +included in the standard libstemmer sources. These are not really supported by +the snowball project, but it would be possible to compile a modified libstemmer +library containing these if desired.) + + +The stemwords example +===================== + +The stemwords example program allows you to run any of the stemmers +compiled into the libstemmer library on a sample vocabulary. For +details on how to use it, run it with the "-h" command line option. + + +Using the library in a larger system +==================================== + +If you are incorporating the library into the build system of a larger +program, I recommend copying the unpacked tarball without modification into +a subdirectory of the sources of your program. Future versions of the +library are intended to keep the same structure, so this will keep the +work required to move to a new version of the library to a minimum. + +As an additional convenience, the list of source and header files used +in the library is detailed in mkinc.mak - a file which is in a suitable +format for inclusion by a Makefile. By including this file in your build +system, you can link the snowball system into your program with a few +extra rules. 
+ +Using the library in a system using GNU autotools +================================================= + +The libstemmer_c library can be integrated into a larger system which uses the +GNU autotool framework (and in particular, automake and autoconf) as follows: + +1) Unpack libstemmer_c.tgz in the top level project directory so that there is + a libstemmer_c subdirectory of the top level directory of the project. + +2) Add a file "Makefile.am" to the unpacked libstemmer_c folder, containing: + +noinst_LTLIBRARIES = libstemmer.la +include $(srcdir)/mkinc.mak +noinst_HEADERS = $(snowball_headers) +libstemmer_la_SOURCES = $(snowball_sources) + +(You may also need to add other lines to this, for example, if you are using +compiler options which are not compatible with compiling the libstemmer +library.) + +3) Add libstemmer_c to the AC_CONFIG_FILES declaration in the project's + configure.ac file. + +4) Add to the top level makefile the following lines (or modify existing + assignments to these variables appropriately): + +AUTOMAKE_OPTIONS = subdir-objects +AM_CPPFLAGS = -I$(top_srcdir)/libstemmer_c/include +SUBDIRS=libstemmer_c +<name>_LIBADD = libstemmer_c/libstemmer.la + +(Where <name> is the name of the library or executable which links against +libstemmer.) + diff --git a/contrib/snowball/doc/libstemmer_java_README b/contrib/snowball/doc/libstemmer_java_README new file mode 100644 index 0000000..38b1af6 --- /dev/null +++ b/contrib/snowball/doc/libstemmer_java_README @@ -0,0 +1,40 @@ +libstemmer_java +=============== + +This document pertains to the Java version of the libstemmer distribution, +available for download from: + +http://snowball.tartarus.org/dist/libstemmer_java.tgz + + +Compiling the library +===================== + +Simply run the java compiler on all the java source files under the java +directory. 
For example, this can be done under unix by changing directory into +the java directory, and running: + + javac org/tartarus/snowball/*.java org/tartarus/snowball/ext/*.java + +This will compile the library and also an example program "TestApp" which +provides a command line interface to the library. + + +Using the library +================= + +There is currently no formal documentation on the use of the Java version +of the library. Additionally, its interface is not guaranteed to be +stable. + +The best documentation of the library is the source of the TestApp example +program. + + +The TestApp example +=================== + +The TestApp example program allows you to run any of the stemmers +compiled into the libstemmer library on a sample vocabulary. For +details on how to use it, run it with no command line parameters. + diff --git a/contrib/snowball/include/libstemmer.h b/contrib/snowball/include/libstemmer.h new file mode 100644 index 0000000..98051e1 --- /dev/null +++ b/contrib/snowball/include/libstemmer.h @@ -0,0 +1,78 @@ + +/* Make header file work when included from C++ */ +#ifdef __cplusplus +extern "C" { +#endif + +struct sb_stemmer; +typedef unsigned char sb_symbol; + +/* FIXME - should be able to get a version number for each stemming + * algorithm (which will be incremented each time the output changes). */ + +/** Returns an array of the names of the available stemming algorithms. + * Note that these are the canonical names - aliases (ie, other names for + * the same algorithm) will not be included in the list. + * The list is terminated with a null pointer. + * + * The list must not be modified in any way. + */ +const char ** sb_stemmer_list(void); + +/** Create a new stemmer object, using the specified algorithm, for the + * specified character encoding. + * + * All algorithms will usually be available in UTF-8, but may also be + * available in other character encodings. + * + * @param algorithm The algorithm name. 
This is either the english + * name of the algorithm, or the 2 or 3 letter ISO 639 codes for the + * language. Note that case is significant in this parameter - the + * value should be supplied in lower case. + * + * @param charenc The character encoding. NULL may be passed as + * this value, in which case UTF-8 encoding will be assumed. Otherwise, + * the argument may be one of "UTF_8", "ISO_8859_1" (i.e. Latin 1), + * "ISO_8859_2" (i.e. Latin 2) or "KOI8_R" (Russian). Note that case is + * significant in this parameter. + * + * @return NULL if the specified algorithm is not recognised, or the + * algorithm is not available for the requested encoding. Otherwise, + * returns a pointer to a newly created stemmer for the requested algorithm. + * The returned pointer must be deleted by calling sb_stemmer_delete(). + * + * @note NULL will also be returned if an out of memory error occurs. + */ +struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc); + +/** Delete a stemmer object. + * + * This frees all resources allocated for the stemmer. After calling + * this function, the supplied stemmer may no longer be used in any way. + * + * It is safe to pass a null pointer to this function - this will have + * no effect. + */ +void sb_stemmer_delete(struct sb_stemmer * stemmer); + +/** Stem a word. + * + * The return value is owned by the stemmer - it must not be freed or + * modified, and it will become invalid when the stemmer is called again, + * or if the stemmer is freed. + * + * The length of the return value can be obtained using sb_stemmer_length(). + * + * If an out-of-memory error occurs, this will return NULL. + */ +const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer, + const sb_symbol * word, int size); + +/** Get the length of the result of the last stemmed word. + * This should not be called before sb_stemmer_stem() has been called. 
+ */ +int sb_stemmer_length(struct sb_stemmer * stemmer); + +#ifdef __cplusplus +} +#endif diff --git a/contrib/snowball/libstemmer/libstemmer_c.in b/contrib/snowball/libstemmer/libstemmer_c.in new file mode 100644 index 0000000..2aa918d --- /dev/null +++ b/contrib/snowball/libstemmer/libstemmer_c.in @@ -0,0 +1,96 @@ + +#include <stdlib.h> +#include <string.h> +#include "../include/libstemmer.h" +#include "../runtime/api.h" +#include "@MODULES_H@" + /* A stemmer instance: the entry points copied from the selected module plus the environment they operate on. */ +struct sb_stemmer { + struct SN_env * (*create)(void); + void (*close)(struct SN_env *); + int (*stem)(struct SN_env *); + + struct SN_env * env; +}; + /* Return the algorithm_names table from the generated modules header. */ +extern const char ** +sb_stemmer_list(void) +{ + return algorithm_names; +} + /* Map an encoding name to its enum value by exact strcmp against the encodings table: NULL selects UTF-8, a name that is not listed gives ENC_UNKNOWN. */ +static stemmer_encoding_t +sb_getenc(const char * charenc) +{ + const struct stemmer_encoding * encoding; + if (charenc == NULL) return ENC_UTF_8; + for (encoding = encodings; encoding->name != 0; encoding++) { + if (strcmp(encoding->name, charenc) == 0) break; + } + if (encoding->name == NULL) return ENC_UNKNOWN; + return encoding->enc; +} + /* Create a stemmer for the (algorithm, charenc) pair found in the modules table. Returns NULL for a NULL or unrecognised algorithm, an unknown encoding, or an allocation failure. */ +extern struct sb_stemmer * +sb_stemmer_new(const char * algorithm, const char * charenc) +{ + stemmer_encoding_t enc; + const struct stemmer_modules * module; + struct sb_stemmer * stemmer; + + enc = sb_getenc(charenc); + if (algorithm == NULL || enc == ENC_UNKNOWN) return NULL; + /* algorithm is known to be non-NULL from here on, so the strcmp below is well defined */ + for (module = modules; module->name != 0; module++) { + if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break; + } + if (module->name == NULL) return NULL; + + stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer)); + if (stemmer == NULL) return NULL; + + stemmer->create = module->create; + stemmer->close = module->close; + stemmer->stem = module->stem; + + stemmer->env = stemmer->create(); + if (stemmer->env == NULL) + { + sb_stemmer_delete(stemmer); + return NULL; + } + + return stemmer; +} + +void +sb_stemmer_delete(struct sb_stemmer * stemmer) +{ + if (stemmer == 0) return; + if (stemmer->close) { + stemmer->close(stemmer->env); + stemmer->close = 0; + } + 
free(stemmer); +} + +const sb_symbol * +sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size) +{ + int ret; + if (SN_set_current(stemmer->env, size, (const symbol *)(word))) + { + stemmer->env->l = 0; + return NULL; + } + ret = stemmer->stem(stemmer->env); + if (ret < 0) return NULL; + stemmer->env->p[stemmer->env->l] = 0; + return (const sb_symbol *)(stemmer->env->p); +} + +int +sb_stemmer_length(struct sb_stemmer * stemmer) +{ + return stemmer->env->l; +} diff --git a/contrib/snowball/libstemmer/mkmodules.pl b/contrib/snowball/libstemmer/mkmodules.pl new file mode 100755 index 0000000..dd66787 --- /dev/null +++ b/contrib/snowball/libstemmer/mkmodules.pl @@ -0,0 +1,267 @@ +#!/usr/bin/env perl +use strict; +use 5.006; +use warnings; + +my $progname = $0; + +if (scalar @ARGV < 4 || scalar @ARGV > 5) { + print "Usage: $progname <outfile> <C source directory> <modules description file> <source list file> [<enc>]\n"; + exit 1; +} + +my $outname = shift(@ARGV); +my $c_src_dir = shift(@ARGV); +my $descfile = shift(@ARGV); +my $srclistfile = shift(@ARGV); +my $enc_only; +my $extn = ''; +if (@ARGV) { + $enc_only = shift(@ARGV); + $extn = '_'.$enc_only; +} + +my %aliases = (); +my %algorithms = (); +my %algorithm_encs = (); + +my %encs = (); + +sub addalgenc($$) { + my $alg = shift(); + my $enc = shift(); + + if (defined $enc_only) { + my $norm_enc = lc $enc; + $norm_enc =~ s/_//g; + if ($norm_enc ne $enc_only) { + return; + } + } + + if (defined $algorithm_encs{$alg}) { + my $hashref = $algorithm_encs{$alg}; + $$hashref{$enc}=1; + } else { + my %newhash = ($enc => 1); + $algorithm_encs{$alg}=\%newhash; + } + + $encs{$enc} = 1; +} + +sub readinput() +{ + open DESCFILE, $descfile; + my $line; + while ($line = <DESCFILE>) + { + next if $line =~ m/^\s*#/; + next if $line =~ m/^\s*$/; + my ($alg,$encstr,$aliases) = split(/\s+/, $line); + my $enc; + my $alias; + + $algorithms{$alg} = 1; + foreach $alias (split(/,/, $aliases)) { + foreach $enc (split(/,/, 
$encstr)) { + # print "$alias, $enc\n"; + $aliases{$alias} = $alg; + addalgenc($alg, $enc); + } + } + } +} + +sub printoutput() +{ + open (OUT, ">$outname") or die "Can't open output file `$outname': $!\n"; + + print OUT <<EOS; +/* $outname: List of stemming modules. + * + * This file is generated by mkmodules.pl from a list of module names. + * Do not edit manually. + * +EOS + + my $line = " * Modules included by this file are: "; + print OUT $line; + my $linelen = length($line); + + my $need_sep = 0; + my $lang; + my $enc; + my @algorithms = sort keys(%algorithms); + foreach $lang (@algorithms) { + if ($need_sep) { + if (($linelen + 2 + length($lang)) > 77) { + print OUT ",\n * "; + $linelen = 3; + } else { + print OUT ', '; + $linelen += 2; + } + } + print OUT $lang; + $linelen += length($lang); + $need_sep = 1; + } + print OUT "\n */\n\n"; + + foreach $lang (@algorithms) { + my $hashref = $algorithm_encs{$lang}; + foreach $enc (sort keys (%$hashref)) { + print OUT "#include \"../$c_src_dir/stem_${enc}_$lang.h\"\n"; + } + } + + print OUT <<EOS; + +typedef enum { + ENC_UNKNOWN=0, +EOS + my $neednl = 0; + for $enc (sort keys %encs) { + print OUT ",\n" if $neednl; + print OUT " ENC_${enc}"; + $neednl = 1; + } + print OUT <<EOS; + +} stemmer_encoding_t; + +struct stemmer_encoding { + const char * name; + stemmer_encoding_t enc; +}; +static const struct stemmer_encoding encodings[] = { +EOS + for $enc (sort keys %encs) { + print OUT " {\"${enc}\", ENC_${enc}},\n"; + } + print OUT <<EOS; + {0,ENC_UNKNOWN} +}; + +struct stemmer_modules { + const char * name; + stemmer_encoding_t enc; + struct SN_env * (*create)(void); + void (*close)(struct SN_env *); + int (*stem)(struct SN_env *); +}; +static const struct stemmer_modules modules[] = { +EOS + + for $lang (sort keys %aliases) { + my $l = $aliases{$lang}; + my $hashref = $algorithm_encs{$l}; + my $enc; + foreach $enc (sort keys (%$hashref)) { + my $p = "${l}_${enc}"; + print OUT " {\"$lang\", ENC_$enc, ${p}_create_env, 
${p}_close_env, ${p}_stem},\n"; + } + } + + print OUT <<EOS; + {0,ENC_UNKNOWN,0,0,0} +}; +EOS + + print OUT <<EOS; +static const char * algorithm_names[] = { +EOS + + for $lang (@algorithms) { + print OUT " \"$lang\", \n"; + } + + print OUT <<EOS; + 0 +}; +EOS + close OUT or die "Can't close ${outname}: $!\n"; +} + +sub printsrclist() +{ + open (OUT, ">$srclistfile") or die "Can't open output file `$srclistfile': $!\n"; + + print OUT <<EOS; +# $srclistfile: List of stemming module source files +# +# This file is generated by mkmodules.pl from a list of module names. +# Do not edit manually. +# +EOS + + my $line = "# Modules included by this file are: "; + print OUT $line; + my $linelen = length($line); + + my $need_sep = 0; + my $lang; + my $srcfile; + my $enc; + my @algorithms = sort keys(%algorithms); + foreach $lang (@algorithms) { + if ($need_sep) { + if (($linelen + 2 + length($lang)) > 77) { + print OUT ",\n# "; + $linelen = 3; + } else { + print OUT ', '; + $linelen += 2; + } + } + print OUT $lang; + $linelen += length($lang); + $need_sep = 1; + } + + print OUT "\n\nsnowball_sources= \\\n"; + for $lang (sort keys %aliases) { + my $hashref = $algorithm_encs{$lang}; + my $enc; + foreach $enc (sort keys (%$hashref)) { + print OUT " src_c/stem_${enc}_${lang}.c \\\n"; + } + } + + $need_sep = 0; + for $srcfile ('runtime/api.c', + 'runtime/utilities.c', + "libstemmer/libstemmer${extn}.c") { + print OUT " \\\n" if $need_sep; + print OUT " $srcfile"; + $need_sep = 1; + } + + print OUT "\n\nsnowball_headers= \\\n"; + for $lang (sort keys %aliases) { + my $hashref = $algorithm_encs{$lang}; + my $enc; + foreach $enc (sort keys (%$hashref)) { + my $p = "${lang}_${enc}"; + print OUT " src_c/stem_${enc}_${lang}.h \\\n"; + } + } + + $need_sep = 0; + for $srcfile ('include/libstemmer.h', + "libstemmer/modules${extn}.h", + 'runtime/api.h', + 'runtime/header.h') { + print OUT " \\\n" if $need_sep; + print OUT " $srcfile"; + $need_sep = 1; + } + + print OUT "\n\n"; + close OUT 
or die "Can't close ${srclistfile}: $!\n"; +} + +readinput(); +printoutput(); +printsrclist(); diff --git a/contrib/snowball/libstemmer/modules.txt b/contrib/snowball/libstemmer/modules.txt new file mode 100644 index 0000000..f6dcc7e --- /dev/null +++ b/contrib/snowball/libstemmer/modules.txt @@ -0,0 +1,58 @@ +# This file contains a list of stemmers to include in the distribution. +# The format is a set of space separated lines - on each line: +# First item is name of stemmer. +# Second item is comma separated list of character sets. +# Third item is comma separated list of names to refer to the stemmer by. +# +# Lines starting with a #, or blank lines, are ignored. + +# List all the main algorithms for each language, in UTF-8, and also with +# the most commonly used encoding. + +arabic UTF_8 arabic,ar,ara +danish UTF_8 danish,da,dan +dutch UTF_8 dutch,nl,dut,nld +english UTF_8 english,en,eng +finnish UTF_8 finnish,fi,fin +french UTF_8 french,fr,fre,fra +german UTF_8 german,de,ger,deu +greek UTF_8 greek,el,gre,ell +hindi UTF_8 hindi,hi,hin +hungarian UTF_8 hungarian,hu,hun +indonesian UTF_8 indonesian,id,ind +italian UTF_8 italian,it,ita +lithuanian UTF_8 lithuanian,lt,lit +nepali UTF_8 nepali,ne,nep +norwegian UTF_8 norwegian,no,nor +portuguese UTF_8 portuguese,pt,por +romanian UTF_8 romanian,ro,rum,ron +russian UTF_8 russian,ru,rus +serbian UTF_8 serbian,sr,srp +spanish UTF_8 spanish,es,esl,spa +swedish UTF_8 swedish,sv,swe +tamil UTF_8 tamil,ta,tam +turkish UTF_8 turkish,tr,tur + +# Also include the traditional porter algorithm for english. +# The porter algorithm is included in the libstemmer distribution to assist +# with backwards compatibility, but for new systems the english algorithm +# should be used in preference. +porter UTF_8 porter english + +# Some other stemmers in the snowball project are not included in the standard +# distribution. To compile a libstemmer with them in, add them to this list, +# and regenerate the distribution. 
(You will need a full source checkout for +# this.) They are included in the snowball website as curiosities, but are not +# intended for general use, and use of them is not fully supported. These +# algorithms are: +# +# german2 - This is a slight modification of the german stemmer. +#german2 UTF_8,ISO_8859_1 german2 german +# +# kraaij_pohlmann - This is a different dutch stemmer. +#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch +# +# lovins - This is an english stemmer, but fairly outdated, and +# only really applicable to a restricted type of input text +# (keywords in academic publications). +#lovins UTF_8,ISO_8859_1 lovins english diff --git a/contrib/snowball/libstemmer/modules_utf8.txt b/contrib/snowball/libstemmer/modules_utf8.txt new file mode 100644 index 0000000..60a0e1d --- /dev/null +++ b/contrib/snowball/libstemmer/modules_utf8.txt @@ -0,0 +1,49 @@ +# This file contains a list of stemmers to include in the distribution. +# The format is a set of space separated lines - on each line: +# First item is name of stemmer. +# Second item is comma separated list of character sets. +# Third item is comma separated list of names to refer to the stemmer by. +# +# Lines starting with a #, or blank lines, are ignored. + +# List all the main algorithms for each language, in UTF-8. + +danish UTF_8 danish,da,dan +dutch UTF_8 dutch,nl,dut,nld +english UTF_8 english,en,eng +finnish UTF_8 finnish,fi,fin +french UTF_8 french,fr,fre,fra +german UTF_8 german,de,ger,deu +hungarian UTF_8 hungarian,hu,hun +italian UTF_8 italian,it,ita +norwegian UTF_8 norwegian,no,nor +portuguese UTF_8 portuguese,pt,por +romanian UTF_8 romanian,ro,rum,ron +russian UTF_8 russian,ru,rus +spanish UTF_8 spanish,es,esl,spa +swedish UTF_8 swedish,sv,swe +turkish UTF_8 turkish,tr,tur + +# Also include the traditional porter algorithm for english. 
+# The porter algorithm is included in the libstemmer distribution to assist +# with backwards compatibility, but for new systems the english algorithm +# should be used in preference. +porter UTF_8 porter + +# Some other stemmers in the snowball project are not included in the standard +# distribution. To compile a libstemmer with them in, add them to this list, +# and regenerate the distribution. (You will need a full source checkout for +# this.) They are included in the snowball website as curiosities, but are not +# intended for general use, and use of them is is not fully supported. These +# algorithms are: +# +# german2 - This is a slight modification of the german stemmer. +#german2 UTF_8 german2 +# +# kraaij_pohlmann - This is a different dutch stemmer. +#kraaij_pohlmann UTF_8 kraaij_pohlmann +# +# lovins - This is an english stemmer, but fairly outdated, and +# only really applicable to a restricted type of input text +# (keywords in academic publications). +#lovins UTF_8 lovins diff --git a/contrib/snowball/runtime/api.c b/contrib/snowball/runtime/api.c new file mode 100644 index 0000000..21ea5f2 --- /dev/null +++ b/contrib/snowball/runtime/api.c @@ -0,0 +1,58 @@ + +#include <stdlib.h> /* for calloc, free */ +#include "header.h" + +extern struct SN_env * SN_create_env(int S_size, int I_size) +{ + struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env)); + if (z == NULL) return NULL; + z->p = create_s(); + if (z->p == NULL) goto error; + if (S_size) + { + int i; + z->S = (symbol * *) calloc(S_size, sizeof(symbol *)); + if (z->S == NULL) goto error; + + for (i = 0; i < S_size; i++) + { + z->S[i] = create_s(); + if (z->S[i] == NULL) goto error; + } + } + + if (I_size) + { + z->I = (int *) calloc(I_size, sizeof(int)); + if (z->I == NULL) goto error; + } + + return z; +error: + SN_close_env(z, S_size); + return NULL; +} + +extern void SN_close_env(struct SN_env * z, int S_size) +{ + if (z == NULL) return; + if (S_size) + { + int i; + for (i = 0; 
i < S_size; i++) + { + lose_s(z->S[i]); + } + free(z->S); + } + free(z->I); + if (z->p) lose_s(z->p); + free(z); +} + +extern int SN_set_current(struct SN_env * z, int size, const symbol * s) +{ + int err = replace_s(z, 0, z->l, size, s, NULL); + z->c = 0; + return err; +} diff --git a/contrib/snowball/runtime/api.h b/contrib/snowball/runtime/api.h new file mode 100644 index 0000000..ba9d1c1 --- /dev/null +++ b/contrib/snowball/runtime/api.h @@ -0,0 +1,32 @@ + +typedef unsigned char symbol; + +/* Or replace 'char' above with 'short' for 16 bit characters. + + More precisely, replace 'char' with whatever type guarantees the + character width you need. Note however that sizeof(symbol) should divide + HEAD, defined in header.h as 2*sizeof(int), without remainder, otherwise + there is an alignment problem. In the unlikely event of a problem here, + consult Martin Porter. + +*/ + +struct SN_env { + symbol * p; + int c; int l; int lb; int bra; int ket; + symbol * * S; + int * I; +}; + +#ifdef __cplusplus +extern "C" { +#endif + +extern struct SN_env * SN_create_env(int S_size, int I_size); +extern void SN_close_env(struct SN_env * z, int S_size); + +extern int SN_set_current(struct SN_env * z, int size, const symbol * s); + +#ifdef __cplusplus +} +#endif diff --git a/contrib/snowball/runtime/header.h b/contrib/snowball/runtime/header.h new file mode 100644 index 0000000..85a42fd --- /dev/null +++ b/contrib/snowball/runtime/header.h @@ -0,0 +1,59 @@ + +#include <limits.h> + +#include "api.h" + +#define MAXINT INT_MAX +#define MININT INT_MIN + +#define HEAD 2*sizeof(int) + +#define SIZE(p) ((int *)(p))[-1] +#define SET_SIZE(p, n) ((int *)(p))[-1] = n +#define CAPACITY(p) ((int *)(p))[-2] + +struct among +{ int s_size; /* number of chars in string */ + const symbol * s; /* search string */ + int substring_i;/* index to longest matching substring */ + int result; /* result of the lookup */ + int (* function)(struct SN_env *); +}; + +extern symbol * create_s(void); +extern 
void lose_s(symbol * p);

extern int skip_utf8(const symbol * p, int c, int lb, int l, int n);

extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);

extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);

extern int eq_s(struct SN_env * z, int s_size, const symbol * s);
extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s);
extern int eq_v(struct SN_env * z, const symbol * p);
extern int eq_v_b(struct SN_env * z, const symbol * p);

extern int find_among(struct SN_env * z, const struct among * v, int v_size);
extern int find_among_b(struct SN_env * z, const struct among * v, int v_size);

extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment);
extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s);
extern int slice_from_v(struct SN_env * z, const symbol * p);
extern int slice_del(struct SN_env * z);

extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s);
extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p);

extern symbol * slice_to(struct SN_env * z, symbol * p);
extern symbol * assign_to(struct SN_env * z, symbol * p);

extern int len_utf8(const symbol * p);

extern void debug(struct SN_env * z, int number, int line_count);

/*
 * contrib/snowball/runtime/utilities.c (new file)
 *
 * Runtime support routines called by the stemmers: symbol-string
 * allocation, UTF-8 aware cursor movement, character-grouping tests,
 * string comparison, "among" table search and slice editing.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "header.h"

#define CREATE_SIZE 1

/* Allocate a new, empty symbol string with room for CREATE_SIZE symbols.

   The allocation is HEAD bytes larger than the symbol storage and the
   returned pointer addresses the first symbol, i.e. HEAD bytes past the
   start of the malloc'ed region.  Returns NULL if malloc fails.
*/
extern symbol * create_s(void) {
    symbol * p;
    void * mem = malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol));
    if (mem == NULL) return NULL;
    p = (symbol *) (HEAD + (char *) mem);
    CAPACITY(p) = CREATE_SIZE;
    SET_SIZE(p, 0);
    return p;
}

/* Release a symbol string obtained from create_s().  NULL is ignored. */
extern void lose_s(symbol * p) {
    if (p == NULL) return;
    /* step back over the HEAD bytes to reach the address malloc returned */
    free((char *) p - HEAD);
}

/*
   new_p = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c
   if n +ve, or n characters backwards from p + c - 1 if n -ve. new_p is the new
   position, or -1 on failure.

   -- used to implement hop and next in the utf8 case.
*/

extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) {
    int b;
    if (n >= 0) {
        for (; n > 0; n--) {
            if (c >= l) return -1;
            b = p[c++];
            if (b >= 0xC0) {   /* 1100 0000 */
                while (c < l) {
                    b = p[c];
                    if (b >= 0xC0 || b < 0x80) break;
                    /* break unless b is 10------ */
                    c++;
                }
            }
        }
    } else {
        for (; n < 0; n++) {
            if (c <= lb) return -1;
            b = p[--c];
            if (b >= 0x80) {   /* 1000 0000 */
                while (c > lb) {
                    b = p[c];
                    if (b >= 0xC0) break; /* 1100 0000 */
                    c--;
                }
            }
        }
    }
    return c;
}

/* Code for character groupings: utf8 cases */

/* Decode the UTF-8 sequence that starts at p[c], reading no further than
   the limit l.

   On success the decoded value is stored in *slot and the number of bytes
   it occupies (1 to 4) is returned.  Returns 0, leaving *slot untouched,
   when c is already at or beyond l.  A sequence cut short by the limit is
   decoded from the bytes that are available.
*/
static int get_utf8(const symbol * p, int c, int l, int * slot) {
    int b0, b1, b2;
    if (c >= l) return 0;
    b0 = p[c++];
    if (b0 < 0xC0 || c == l) {   /* 1100 0000 */
        *slot = b0;
        return 1;
    }
    b1 = p[c++] & 0x3F;
    if (b0 < 0xE0 || c == l) {   /* 1110 0000 */
        *slot = (b0 & 0x1F) << 6 | b1;
        return 2;
    }
    b2 = p[c++] & 0x3F;
    if (b0 < 0xF0 || c == l) {   /* 1111 0000 */
        *slot = (b0 & 0xF) << 12 | b1 << 6 | b2;
        return 3;
    }
    /* A 4-byte lead byte is 11110xxx: its payload is the low three bits,
       hence the 0x7 mask (0xE would drop bit 0 of the payload). */
    *slot = (b0 & 0x7) << 18 | b1 << 12 | b2 << 6 | (p[c] & 0x3F);
    return 4;
}

/* Backwards counterpart of get_utf8(): decode the UTF-8 sequence that ends
   at p[c - 1], reading no further back than the limit lb.

   Returns the number of bytes in the sequence (1 to 4) with the decoded
   value in *slot, or 0 (leaving *slot untouched) when c is already at or
   before lb.
*/
static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
    int a, b;
    if (c <= lb) return 0;
    b = p[--c];
    if (b < 0x80 || c == lb) {   /* 1000 0000 */
        *slot = b;
        return 1;
    }
    a = b & 0x3F;
    b = p[--c];
    if (b >= 0xC0 || c == lb) {   /* 1100 0000 */
        *slot = (b & 0x1F) << 6 | a;
        return 2;
    }
    a |= (b & 0x3F) << 6;
    b = p[--c];
    if (b >= 0xE0 || c == lb) {   /* 1110 0000 */
        *slot = (b & 0xF) << 12 | a;
        return 3;
    }
    /* 4-byte lead byte 11110xxx: the payload is the low three bits (0x7) */
    *slot = (p[--c] & 0x7) << 18 | (b & 0x3F) << 12 | a;
    return 4;
}

/* Test whether ch belongs to the grouping described by s, min and max.

   s is a bitmap whose bit (ch - min) is set for every member; values
   outside [min, max] are never members.  Returns non-zero for a member.
*/
static int grouping_has(const unsigned char * s, int min, int max, int ch) {
    if (ch > max || ch < min) return 0;
    ch -= min;
    return (s[ch >> 3] & (0X1 << (ch & 0X7))) != 0;
}

/* The eight grouping routines below share one contract.  Each inspects the
   character at the cursor (forwards) or just before it (the _b variants):

     - returns -1 if the cursor is already at the limit;
     - returns the width of the character (always 1 in the non-utf8
       versions) without moving the cursor when the character fails the
       test;
     - otherwise moves the cursor over the character and, if repeat is
       non-zero, carries on with the next one; returns 0 once a single
       character has been accepted with repeat == 0.

   in_grouping* accept characters that are in the grouping, out_grouping*
   accept characters that are not.
*/

extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        int w = get_utf8(z->p, z->c, z->l, & ch);
        if (!w) return -1;
        if (!grouping_has(s, min, max, ch))
            return w;
        z->c += w;
    } while (repeat);
    return 0;
}

extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        int w = get_b_utf8(z->p, z->c, z->lb, & ch);
        if (!w) return -1;
        if (!grouping_has(s, min, max, ch))
            return w;
        z->c -= w;
    } while (repeat);
    return 0;
}

extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        int w = get_utf8(z->p, z->c, z->l, & ch);
        if (!w) return -1;
        if (grouping_has(s, min, max, ch))
            return w;
        z->c += w;
    } while (repeat);
    return 0;
}

extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        int w = get_b_utf8(z->p, z->c, z->lb, & ch);
        if (!w) return -1;
        if (grouping_has(s, min, max, ch))
            return w;
        z->c -= w;
    } while (repeat);
    return 0;
}

/* Code for character groupings: non-utf8 cases */

extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        if (z->c >= z->l) return -1;
        ch = z->p[z->c];
        if (!grouping_has(s, min, max, ch))
            return 1;
        z->c++;
    } while (repeat);
    return 0;
}

extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        if (z->c <= z->lb) return -1;
        ch = z->p[z->c - 1];
        if (!grouping_has(s, min, max, ch))
            return 1;
        z->c--;
    } while (repeat);
    return 0;
}

extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        if (z->c >= z->l) return -1;
        ch = z->p[z->c];
        if (grouping_has(s, min, max, ch))
            return 1;
        z->c++;
    } while (repeat);
    return 0;
}

extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        if (z->c <= z->lb) return -1;
        ch = z->p[z->c - 1];
        if (grouping_has(s, min, max, ch))
            return 1;
        z->c--;
    } while (repeat);
    return 0;
}

/* If the s_size symbols at s occur at the cursor, move the cursor past
   them and return 1; otherwise return 0 and leave the cursor alone. */
extern int eq_s(struct SN_env * z, int s_size, const symbol * s) {
    if (z->l - z->c < s_size || memcmp(z->p + z->c, s, s_size * sizeof(symbol)) != 0) return 0;
    z->c += s_size; return 1;
}

/* Backwards version of eq_s(): matches the s_size symbols that end at the
   cursor and, on success, moves the cursor back over them. */
extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s) {
    if (z->c - z->lb < s_size || memcmp(z->p + z->c - s_size, s, s_size * sizeof(symbol)) != 0) return 0;
    z->c -= s_size; return 1;
}

/* eq_s() with a symbol string p whose length is taken from SIZE(p). */
extern int eq_v(struct SN_env * z, const symbol * p) {
    return eq_s(z, SIZE(p), p);
}

/* eq_s_b() with a symbol string p whose length is taken from SIZE(p). */
extern int eq_v_b(struct SN_env * z, const symbol * p) {
    return eq_s_b(z, SIZE(p), p);
}

/* Search the table v (v_size entries) for an entry matching the text at
   the cursor.

   The first loop is a binary search; common_i and common_j record how many
   leading symbols are already known to agree with entries i and j, so
   comparison resumes from the smaller of the two.  The second loop then
   walks the substring_i chain from the entry found, taking the first entry
   that is fully matched and whose function (if it has one) returns
   non-zero.

   On success the cursor is left just after the matched entry and the
   entry's result is returned.  Returns 0 when no entry is accepted.
*/
extern int find_among(struct SN_env * z, const struct among * v, int v_size) {

    int i = 0;
    int j = v_size;

    int c = z->c; int l = z->l;
    const symbol * q = z->p + c;

    const struct among * w;

    int common_i = 0;
    int common_j = 0;

    int first_key_inspected = 0;

    while (1) {
        int k = i + ((j - i) >> 1);
        int diff = 0;
        int common = common_i < common_j ? common_i : common_j; /* smaller */
        w = v + k;
        {
            int i2; for (i2 = common; i2 < w->s_size; i2++) {
                if (c + common == l) { diff = -1; break; }
                diff = q[common] - w->s[i2];
                if (diff != 0) break;
                common++;
            }
        }
        if (diff < 0) {
            j = k;
            common_j = common;
        } else {
            i = k;
            common_i = common;
        }
        if (j - i <= 1) {
            if (i > 0) break; /* v->s has been inspected */
            if (j == i) break; /* only one item in v */

            /* - but now we need to go round once more to get
               v->s inspected. This looks messy, but is actually
               the optimal approach.  */

            if (first_key_inspected) break;
            first_key_inspected = 1;
        }
    }
    while (1) {
        w = v + i;
        if (common_i >= w->s_size) {
            z->c = c + w->s_size;
            if (w->function == 0) return w->result;
            {
                int res = w->function(z);
                /* the function may have moved the cursor; put it back */
                z->c = c + w->s_size;
                if (res) return w->result;
            }
        }
        i = w->substring_i;
        if (i < 0) return 0;
    }
}

/* find_among_b is for backwards processing. Same comments apply */

extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {

    int i = 0;
    int j = v_size;

    int c = z->c; int lb = z->lb;
    const symbol * q = z->p + c - 1;

    const struct among * w;

    int common_i = 0;
    int common_j = 0;

    int first_key_inspected = 0;

    while (1) {
        int k = i + ((j - i) >> 1);
        int diff = 0;
        int common = common_i < common_j ? common_i : common_j;
        w = v + k;
        {
            int i2; for (i2 = w->s_size - 1 - common; i2 >= 0; i2--) {
                if (c - common == lb) { diff = -1; break; }
                diff = q[- common] - w->s[i2];
                if (diff != 0) break;
                common++;
            }
        }
        if (diff < 0) { j = k; common_j = common; }
                 else { i = k; common_i = common; }
        if (j - i <= 1) {
            if (i > 0) break;
            if (j == i) break;
            if (first_key_inspected) break;
            first_key_inspected = 1;
        }
    }
    while (1) {
        w = v + i;
        if (common_i >= w->s_size) {
            z->c = c - w->s_size;
            if (w->function == 0) return w->result;
            {
                int res = w->function(z);
                /* the function may have moved the cursor; put it back */
                z->c = c - w->s_size;
                if (res) return w->result;
            }
        }
        i = w->substring_i;
        if (i < 0) return 0;
    }
}


/* Increase the size of the buffer pointed to by p to at least n symbols.
 * If insufficient memory, returns NULL and frees the old buffer.
 */
static symbol * increase_size(symbol * p, int n) {
    symbol * q;
    int new_size = n + 20;   /* over-allocate so small growths do not realloc again */
    void * mem = realloc((char *) p - HEAD,
                         HEAD + (new_size + 1) * sizeof(symbol));
    if (mem == NULL) {
        lose_s(p);
        return NULL;
    }
    q = (symbol *) (HEAD + (char *)mem);
    CAPACITY(q) = new_size;
    return q;
}

/* to replace symbols between c_bra and c_ket in z->p by the
   s_size symbols at s.
   Returns 0 on success, -1 on error.
   Also, frees z->p (and sets it to NULL) on error.

   If adjptr is not NULL, the change in length (s_size minus the number of
   symbols replaced) is stored in *adjptr.  z->l is moved by the same
   amount; the cursor z->c is moved with it when it lies at or after c_ket,
   and is pulled back to c_bra when it lies inside the replaced region.
*/
extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjptr)
{
    int adjustment;
    int len;
    if (z->p == NULL) {
        z->p = create_s();
        if (z->p == NULL) return -1;
    }
    adjustment = s_size - (c_ket - c_bra);
    len = SIZE(z->p);
    if (adjustment != 0) {
        if (adjustment + len > CAPACITY(z->p)) {
            z->p = increase_size(z->p, adjustment + len);
            if (z->p == NULL) return -1;
        }
        /* shift the tail that follows the replaced region */
        memmove(z->p + c_ket + adjustment,
                z->p + c_ket,
                (len - c_ket) * sizeof(symbol));
        SET_SIZE(z->p, adjustment + len);
        z->l += adjustment;
        if (z->c >= c_ket)
            z->c += adjustment;
        else if (z->c > c_bra)
            z->c = c_bra;
    }
    if (s_size) memmove(z->p + c_bra, s, s_size * sizeof(symbol));
    if (adjptr != NULL)
        *adjptr = adjustment;
    return 0;
}

/* Validate the current slice: requires 0 <= bra <= ket <= l, a non-NULL
   z->p and l <= SIZE(z->p).  Returns 0 if the slice is usable, -1 if not. */
static int slice_check(struct SN_env * z) {

    if (z->bra < 0 ||
        z->bra > z->ket ||
        z->ket > z->l ||
        z->p == NULL ||
        z->l > SIZE(z->p)) /* this line could be removed */
    {
#if 0
        fprintf(stderr, "faulty slice operation:\n");
        debug(z, -1, 0);
#endif
        return -1;
    }
    return 0;
}

/* Replace the slice [bra, ket) by the s_size symbols at s.
   Returns 0 on success, -1 if the slice is invalid or replace_s() fails. */
extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s) {
    if (slice_check(z)) return -1;
    return replace_s(z, z->bra, z->ket, s_size, s, NULL);
}

/* slice_from_s() with a symbol string p whose length is SIZE(p). */
extern int slice_from_v(struct SN_env * z, const symbol * p) {
    return slice_from_s(z, SIZE(p), p);
}

/* Delete the slice, i.e. replace it by zero symbols. */
extern int slice_del(struct SN_env * z) {
    return slice_from_s(z, 0, 0);
}

/* Replace the symbols between bra and ket by the s_size symbols at s and
   keep the slice markers consistent: z->bra and z->ket are each moved by
   the change in length when they lie at or after bra.
   Returns 0 on success, -1 if replace_s() fails. */
extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s) {
    int adjustment;
    if (replace_s(z, bra, ket, s_size, s, &adjustment))
        return -1;
    if (bra <= z->bra) z->bra += adjustment;
    if (bra <= z->ket) z->ket += adjustment;
    return 0;
}

/* insert_s() with a symbol string p whose length is SIZE(p). */
extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) {
    return insert_s(z, bra, ket, SIZE(p), p);
}

/* Copy the current slice of z into p, growing p when it is too small.

   Returns the (possibly reallocated) string, which the caller must use in
   place of p.  Returns NULL on failure, in which case p has already been
   freed: by lose_s() here for an invalid slice, or by increase_size() when
   memory runs out.
*/
extern symbol * slice_to(struct SN_env * z, symbol * p) {
    if (slice_check(z)) {
        lose_s(p);
        return NULL;
    }
    {
        int len = z->ket - z->bra;
        if (CAPACITY(p) < len) {
            p = increase_size(p, len);
            if (p == NULL)
                return NULL;
        }
        memmove(p, z->p + z->bra, len * sizeof(symbol));
        SET_SIZE(p, len);
    }
    return p;
}

/* Copy the first z->l symbols of z->p into p, growing p when it is too
   small.  Returns the (possibly reallocated) string, or NULL when memory
   runs out, in which case increase_size() has already freed p. */
extern symbol * assign_to(struct SN_env * z, symbol * p) {
    int len = z->l;
    if (CAPACITY(p) < len) {
        p = increase_size(p, len);
        if (p == NULL)
            return NULL;
    }
    memmove(p, z->p, len * sizeof(symbol));
    SET_SIZE(p, len);
    return p;
}

/* Count the characters in the UTF-8 symbol string p: every byte that is
   not a continuation byte (10------) starts a character. */
extern int len_utf8(const symbol * p) {
    int size = SIZE(p);
    int len = 0;
    while (size--) {
        symbol b = *p++;
        if (b >= 0xC0 || b < 0x80) ++len;
    }
    return len;
}

#if 0
/* Disabled diagnostic: prints z->p with markers at the positions of lb
   '{', bra '[', c '|', ket ']' and l '}'; zero symbols are shown as '#'. */
extern void debug(struct SN_env * z, int number, int line_count) {
    int i;
    int limit = SIZE(z->p);
    /*if (number >= 0) printf("%3d (line %4d): '", number, line_count);*/
    if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
    for (i = 0; i <= limit; i++) {
        if (z->lb == i) printf("{");
        if (z->bra == i) printf("[");
        if (z->c == i) printf("|");
        if (z->ket == i) printf("]");
        if (z->l == i) printf("}");
        if (i < limit)
        {   int ch = z->p[i];
            if (ch == 0) ch = '#';
            printf("%c", ch);
        }
    }
    printf("'\n");
}
#endif