diff options
Diffstat (limited to 'src/plugins/fts-lucene/Snowball.cc')
-rw-r--r-- | src/plugins/fts-lucene/Snowball.cc | 151 |
1 files changed, 151 insertions, 0 deletions
diff --git a/src/plugins/fts-lucene/Snowball.cc b/src/plugins/fts-lucene/Snowball.cc new file mode 100644 index 0000000..43b54e3 --- /dev/null +++ b/src/plugins/fts-lucene/Snowball.cc @@ -0,0 +1,151 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include <CLucene.h> +#include "SnowballAnalyzer.h" +#include "SnowballFilter.h" +#include <CLucene/util/CLStreams.h> +#include <CLucene/analysis/Analyzers.h> +#include <CLucene/analysis/standard/StandardTokenizer.h> +#include <CLucene/analysis/standard/StandardFilter.h> + +extern "C" { +#include "lib.h" +#include "buffer.h" +#include "unichar.h" +#include "lucene-wrapper.h" +}; + +CL_NS_USE(analysis) +CL_NS_USE(util) +CL_NS_USE2(analysis,standard) + +CL_NS_DEF2(analysis,snowball) + + /** Builds the named analyzer with no stop words. */ + SnowballAnalyzer::SnowballAnalyzer(normalizer_func_t *_normalizer, const char* _language) + : language(i_strdup(_language)), + normalizer(_normalizer), + stopSet(NULL), + prevstream(NULL) + { + } + + SnowballAnalyzer::~SnowballAnalyzer() + { + if (prevstream) + _CLDELETE(prevstream); + i_free(language); + if ( stopSet != NULL ) + _CLDELETE(stopSet); + } + + /** Builds the named analyzer with the given stop words. 
+   *
+   * The language name is duplicated with i_strdup(), the normalizer is left
+   * unset (NULL) and a newly allocated stop set is filled from stopWords.
+   */
+  SnowballAnalyzer::SnowballAnalyzer(const char* language, const TCHAR** stopWords)
+    : language(i_strdup(language)),
+      normalizer(NULL),
+      stopSet(_CLNEW CLTCSetList(true)),
+      prevstream(NULL)
+  {
+    StopFilter::fillStopTable(stopSet, stopWords);
+  }
+
+  /** Convenience overload: same as the three-argument form with deleteReader == false. */
+  TokenStream* SnowballAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) {
+    return this->tokenStream(fieldName, reader, false);
+  }
+
+  /**
+   * Builds the analysis chain for one reader:
+   * {@link StandardTokenizer} -> {@link StandardFilter} -> {@link LowerCaseFilter}
+   * -> {@link StopFilter} (only when a stop set exists) -> {@link SnowballFilter}.
+   *
+   * Every filter is constructed with its "delete input" flag set to true, so the
+   * returned stream is the single handle to the whole chain.
+   *
+   * @param fieldName    passed through by callers; not used when building the chain
+   * @param reader       source of the text to tokenize
+   * @param deleteReader forwarded to the tokenizer / reader wrapper
+   *                     (presumably: whether the chain deletes the reader -- confirm in CLucene)
+   */
+  TokenStream* SnowballAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader, bool deleteReader) {
+    BufferedReader* buffered = reader->__asBufferedReader();
+    TokenStream* chain;
+
+    if ( buffered != NULL ) {
+      // The reader already is a buffered reader: tokenize it directly.
+      chain = _CLNEW StandardTokenizer(buffered, deleteReader);
+    } else {
+      // Otherwise wrap it first; the tokenizer is told to delete the wrapper.
+      chain = _CLNEW StandardTokenizer( _CLNEW FilteredBufferedReader(reader, deleteReader), true );
+    }
+
+    chain = _CLNEW StandardFilter(chain, true);
+    chain = _CLNEW CL_NS(analysis)::LowerCaseFilter(chain, true);
+    if (stopSet != NULL) {
+      chain = _CLNEW CL_NS(analysis)::StopFilter(chain, true, stopSet);
+    }
+    chain = _CLNEW SnowballFilter(chain, normalizer, language, true);
+    return chain;
+  }
+
+  /**
+   * Returns a token stream that stays owned by the analyzer: the stream handed
+   * out by the previous call is deleted before a new chain is built, and the
+   * new chain is remembered so the next call (or the destructor) can delete it.
+   */
+  TokenStream* SnowballAnalyzer::reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) {
+    if (prevstream != NULL) {
+      _CLDELETE(prevstream);
+    }
+    prevstream = this->tokenStream(fieldName, reader);
+    return prevstream;
+  }
+
+
+
+
+
+
+  /** Construct the named stemming filter.
+ * + * @param in the input tokens to stem + * @param name the name of a stemmer + */ + SnowballFilter::SnowballFilter(TokenStream* in, normalizer_func_t *normalizer, const char* language, bool deleteTS): + TokenFilter(in,deleteTS) + { + stemmer = sb_stemmer_new(language, NULL); //use utf8 encoding + this->normalizer = normalizer; + + if ( stemmer == NULL ){ + _CLTHROWA(CL_ERR_IllegalArgument, "language not available for stemming\n"); //todo: richer error + } + } + + SnowballFilter::~SnowballFilter(){ + sb_stemmer_delete(stemmer); + } + + /** Returns the next input Token, after being stemmed */ + Token* SnowballFilter::next(Token* token){ + if (input->next(token) == NULL) + return NULL; + + unsigned char utf8text[LUCENE_MAX_WORD_LEN*5+1]; + unsigned int len = I_MIN(LUCENE_MAX_WORD_LEN, token->termLength()); + + buffer_t buf = { { 0, 0 } }; + i_assert(sizeof(wchar_t) == sizeof(unichar_t)); + buffer_create_from_data(&buf, utf8text, sizeof(utf8text)); + uni_ucs4_to_utf8((const unichar_t *)token->termBuffer(), len, &buf); + + const sb_symbol* stemmed = sb_stemmer_stem(stemmer, utf8text, buf.used); + if ( stemmed == NULL ) + _CLTHROWA(CL_ERR_Runtime,"Out of memory"); + + int stemmedLen=sb_stemmer_length(stemmer); + + if (normalizer == NULL) { + unsigned int tchartext_size = + uni_utf8_strlen_n(stemmed, stemmedLen) + 1; + TCHAR tchartext[tchartext_size]; + lucene_utf8_n_to_tchar(stemmed, stemmedLen, tchartext, tchartext_size); + token->set(tchartext,token->startOffset(), token->endOffset(), token->type()); + } else T_BEGIN { + buffer_t *norm_buf = t_buffer_create(stemmedLen); + normalizer(stemmed, stemmedLen, norm_buf); + + unsigned int tchartext_size = + uni_utf8_strlen_n(norm_buf->data, norm_buf->used) + 1; + TCHAR tchartext[tchartext_size]; + lucene_utf8_n_to_tchar((const unsigned char *)norm_buf->data, + norm_buf->used, tchartext, tchartext_size); + token->set(tchartext,token->startOffset(), token->endOffset(), token->type()); + } T_END; + return token; + } + + 
+CL_NS_END2 |