diff options
Diffstat (limited to 'contrib/snowball/algorithms/turkish.sbl')
-rw-r--r-- | contrib/snowball/algorithms/turkish.sbl | 470 |
1 files changed, 470 insertions, 0 deletions
diff --git a/contrib/snowball/algorithms/turkish.sbl b/contrib/snowball/algorithms/turkish.sbl new file mode 100644 index 0000000..eadd61d --- /dev/null +++ b/contrib/snowball/algorithms/turkish.sbl @@ -0,0 +1,470 @@ +/* Stemmer for Turkish + * author: Evren (Kapusuz) Çilden + * email: evren.kapusuz at gmail.com + * version: 1.0 (15.01.2007) + + + * stems nominal verb suffixes + * stems nominal inflections + * more than one syllable word check + * (y,n,s,U) context check + * vowel harmony check + * last consonant check and conversion (b, c, d, ğ to p, ç, t, k) + + * The stemming algorithm is based on the paper "An Affix Stripping + * Morphological Analyzer for Turkish" by Gülşen Eryiğit and + * Eşref Adalı (Proceedings of the IAESTED International Conference + * ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004, + * Innsbruck, Austria + + * Turkish is an agglutinative language and has a very rich morphological + * structure. In Turkish, you can form many different words from a single stem + * by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means + * "You had been the doctor of him". The stem of the word is "doktor" and it + * takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about + * the append order of suffixes can be clearly described as FSMs. + * The paper referenced above defines some FSMs for right to left + * morphological analysis. I generated a method for constructing snowball + * expressions from right to left FSMs for stemming suffixes. +*/ + +routines ( + append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings + check_vowel_harmony // tests vowel harmony for suffixes + is_reserved_word // tests whether current string is a reserved word ('ad','soyad') + mark_cAsInA // nominal verb suffix + mark_DA // noun suffix + mark_DAn // noun suffix + mark_DUr // nominal verb suffix + mark_ki // noun suffix + mark_lAr // noun suffix, nominal verb suffix + mark_lArI // noun suffix + mark_nA // noun suffix + mark_ncA // noun suffix + mark_ndA // noun suffix + mark_ndAn // noun suffix + mark_nU // noun suffix + mark_nUn // noun suffix + mark_nUz // nominal verb suffix + mark_sU // noun suffix + mark_sUn // nominal verb suffix + mark_sUnUz // nominal verb suffix + mark_possessives // -(U)m,-(U)n,-(U)mUz,-(U)nUz, + mark_yA // noun suffix + mark_ylA // noun suffix + mark_yU // noun suffix + mark_yUm // nominal verb suffix + mark_yUz // nominal verb suffix + mark_yDU // nominal verb suffix + mark_yken // nominal verb suffix + mark_ymUs_ // nominal verb suffix + mark_ysA // nominal verb suffix + + mark_suffix_with_optional_y_consonant + mark_suffix_with_optional_U_vowel + mark_suffix_with_optional_n_consonant + mark_suffix_with_optional_s_consonant + + more_than_one_syllable_word + + post_process_last_consonants + postlude + + stem_nominal_verb_suffixes + stem_noun_suffixes + stem_suffix_chain_before_ki +) + +stringescapes { } + +/* Special characters in Unicode Latin-1 and Latin Extended-A */ +stringdef c, '{U+00E7}' // LATIN SMALL LETTER C WITH CEDILLA +stringdef g~ '{U+011F}' // LATIN SMALL LETTER G WITH BREVE +stringdef i' '{U+0131}' // LATIN SMALL LETTER I WITHOUT DOT +stringdef o" '{U+00F6}' // LATIN SMALL LETTER O WITH DIAERESIS +stringdef s, '{U+015F}' // LATIN SMALL LETTER S WITH CEDILLA +stringdef u" '{U+00FC}' // LATIN SMALL LETTER U WITH DIAERESIS + +booleans ( continue_stemming_noun_suffixes ) + +groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6) + +define vowel 'ae{i'}io{o"}u{u"}' +define U '{i'}iu{u"}' + +// the vowel grouping definitions below are used for checking vowel harmony +define vowel1 'a{i'}ou' // vowels that can end with suffixes containing 'a' +define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e' +define vowel3 'a{i'}' // vowels that can end with suffixes containing 'i'' +define vowel4 'ei' // vowels that can end with suffixes containing 'i' +define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u' +define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing 'o"' or 'u"' + +externals ( stem ) + +backwardmode ( + // checks vowel harmony for possible suffixes, + // helps to detect whether the candidate for suffix applies to vowel harmony + // this rule is added to prevent over stemming + define check_vowel_harmony as ( + test + ( + (goto vowel) // if there is a vowel + ( + ('a' goto vowel1) or + ('e' goto vowel2) or + ('{i'}' goto vowel3) or + ('i' goto vowel4) or + ('o' goto vowel5) or + ('{o"}' goto vowel6) or + ('u' goto vowel5) or + ('{u"}' goto vowel6) + ) + ) + ) + + // if the last consonant before suffix is vowel and n then advance and delete + // if the last consonant before suffix is non vowel and n do nothing + // if the last consonant before suffix is not n then only delete the suffix + // assumption: slice beginning is set correctly + define mark_suffix_with_optional_n_consonant as ( + ('n' (test vowel)) + or + ((not(test 'n')) test(next vowel)) + + ) + + // if the last consonant before suffix is vowel and s then advance and delete + // if the last consonant before suffix is non vowel and s do nothing + // if the last consonant before suffix is not s then only delete the suffix + // assumption: slice beginning is set correctly + define mark_suffix_with_optional_s_consonant as ( + ('s' (test vowel)) + or + ((not(test 's')) test(next vowel)) + ) + + // if the last consonant before suffix is vowel and y then advance and delete + // if the last consonant before suffix is non vowel and y do nothing + // if the last consonant before suffix is not y then only delete the suffix + // assumption: slice beginning is set correctly + define mark_suffix_with_optional_y_consonant as ( + ('y' (test vowel)) + or + ((not(test 'y')) test(next vowel)) + ) + + define mark_suffix_with_optional_U_vowel as ( + (U (test non-vowel)) + or + ((not(test U)) test(next non-vowel)) + + ) + + define mark_possessives as ( + among ('m{i'}z' 'miz' 'muz' 'm{u"}z' + 'n{i'}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n') + (mark_suffix_with_optional_U_vowel) + ) + + define mark_sU as ( + check_vowel_harmony + U + (mark_suffix_with_optional_s_consonant) + ) + + define mark_lArI as ( + among ('leri' 'lar{i'}') + ) + + define mark_yU as ( + check_vowel_harmony + U + (mark_suffix_with_optional_y_consonant) + ) + + define mark_nU as ( + check_vowel_harmony + among ('n{i'}' 'ni' 'nu' 'n{u"}') + ) + + define mark_nUn as ( + check_vowel_harmony + among ('{i'}n' 'in' 'un' '{u"}n') + (mark_suffix_with_optional_n_consonant) + ) + + define mark_yA as ( + check_vowel_harmony + among('a' 'e') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_nA as ( + check_vowel_harmony + among('na' 'ne') + ) + + define mark_DA as ( + check_vowel_harmony + among('da' 'de' 'ta' 'te') + ) + + define mark_ndA as ( + check_vowel_harmony + among('nda' 'nde') + ) + + define mark_DAn as ( + check_vowel_harmony + among('dan' 'den' 'tan' 'ten') + ) + + define mark_ndAn as ( + check_vowel_harmony + among('ndan' 'nden') + ) + + define mark_ylA as ( + check_vowel_harmony + among('la' 'le') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_ki as ( + 'ki' + ) + + define mark_ncA as ( + check_vowel_harmony + among('ca' 'ce') + (mark_suffix_with_optional_n_consonant) + ) + + define mark_yUm as ( + check_vowel_harmony + among ('{i'}m' 'im' 'um' '{u"}m') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_sUn as ( + check_vowel_harmony + among ('s{i'}n' 'sin' 'sun' 's{u"}n' ) + ) + + define mark_yUz as ( + check_vowel_harmony + among ('{i'}z' 'iz' 'uz' '{u"}z') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_sUnUz as ( + among ('s{i'}n{i'}z' 'siniz' 'sunuz' 's{u"}n{u"}z') + ) + + define mark_lAr as ( + check_vowel_harmony + among ('ler' 'lar') + ) + + define mark_nUz as ( + check_vowel_harmony + among ('n{i'}z' 'niz' 'nuz' 'n{u"}z') + ) + + define mark_DUr as ( + check_vowel_harmony + among ('t{i'}r' 'tir' 'tur' 't{u"}r' 'd{i'}r' 'dir' 'dur' 'd{u"}r') + ) + + define mark_cAsInA as ( + among ('cas{i'}na' 'cesine') + ) + + define mark_yDU as ( + check_vowel_harmony + among ('t{i'}m' 'tim' 'tum' 't{u"}m' 'd{i'}m' 'dim' 'dum' 'd{u"}m' + 't{i'}n' 'tin' 'tun' 't{u"}n' 'd{i'}n' 'din' 'dun' 'd{u"}n' + 't{i'}k' 'tik' 'tuk' 't{u"}k' 'd{i'}k' 'dik' 'duk' 'd{u"}k' + 't{i'}' 'ti' 'tu' 't{u"}' 'd{i'}' 'di' 'du' 'd{u"}') + (mark_suffix_with_optional_y_consonant) + ) + + // does not fully obey vowel harmony + define mark_ysA as ( + among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_ymUs_ as ( + check_vowel_harmony + among ('m{i'}{s,}' 'mi{s,}' 'mu{s,}' 'm{u"}{s,}') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_yken as ( + 'ken' (mark_suffix_with_optional_y_consonant) + ) + + define stem_nominal_verb_suffixes as ( + [ + set continue_stemming_noun_suffixes + (mark_ymUs_ or mark_yDU or mark_ysA or mark_yken) + or + (mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_) + or + ( + mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_)) + unset continue_stemming_noun_suffixes + ) + or + (mark_nUz (mark_yDU or mark_ysA)) + or + ((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_)) + or + (mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)) + ]delete + ) + + // stems noun suffix chains ending with -ki + define stem_suffix_chain_before_ki as ( + [ + mark_ki + ( + (mark_DA] delete try([ + (mark_lAr] delete try(stem_suffix_chain_before_ki)) + or + (mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + + )) + or + (mark_nUn] delete try([ + (mark_lArI] delete) + or + ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + (stem_suffix_chain_before_ki) + )) + or + (mark_ndA ( + (mark_lArI] delete) + or + ((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki))) + or + (stem_suffix_chain_before_ki) + )) + ) + ) + + define stem_noun_suffixes as ( + ([mark_lAr] delete try(stem_suffix_chain_before_ki)) + or + ([mark_ncA] delete + try( + ([mark_lArI] delete) + or + ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + ([mark_lAr] delete stem_suffix_chain_before_ki) + ) + ) + or + ([(mark_ndA or mark_nA) + ( + (mark_lArI] delete) + or + (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + (stem_suffix_chain_before_ki) + ) + ) + or + ([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI))) + or + ( [mark_DAn] delete try ([ + ( + (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + (mark_lAr] delete try(stem_suffix_chain_before_ki)) + or + (stem_suffix_chain_before_ki) + )) + ) + or + ([mark_nUn or mark_ylA] delete + try( + ([mark_lAr] delete stem_suffix_chain_before_ki) + or + ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + stem_suffix_chain_before_ki + ) + ) + or + ([mark_lArI] delete) + or + (stem_suffix_chain_before_ki) + or + ([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki)) + or + ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + ) + + define post_process_last_consonants as ( + [substring] among ( + 'b' (<- 'p') + 'c' (<- '{c,}') + 'd' (<- 't') + '{g~}' (<- 'k') + ) + ) + + // after stemming if the word ends with 'd' or 'g' most probably last U is overstemmed + // like in 'kedim' -> 'ked' + // Turkish words don't usually end with 'd' or 'g' + // some very well known words are ignored (like 'ad' 'soyad' + // appends U to stems ending with d or g, decides which vowel to add + // based on the last vowel in the stem + define append_U_to_stems_ending_with_d_or_g as ( + test('d' or 'g') + (test((goto vowel) 'a' or '{i'}') <+ '{i'}') + or + (test((goto vowel) 'e' or 'i') <+ 'i') + or + (test((goto vowel) 'o' or 'u') <+ 'u') + or + (test((goto vowel) '{o"}' or '{u"}') <+ '{u"}') + ) + + define is_reserved_word as ( + 'ad' try 'soy' atlimit + ) +) + +// Tests if there are more than one syllables +// In Turkish each vowel indicates a distinct syllable +define more_than_one_syllable_word as ( + test (atleast 2 (gopast vowel)) +) + +define postlude as ( + backwards ( + not(is_reserved_word) + do append_U_to_stems_ending_with_d_or_g + do post_process_last_consonants + + ) +) + +define stem as ( + (more_than_one_syllable_word) + ( + backwards ( + do stem_nominal_verb_suffixes + continue_stemming_noun_suffixes + do stem_noun_suffixes + ) + + postlude + ) +) |