diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:07:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:07:14 +0000 |
commit | a175314c3e5827eb193872241446f2f8f5c9d33c (patch) | |
tree | cd3d60ca99ae00829c52a6ca79150a5b6e62528b /storage/mroonga/vendor/groonga/plugins/token_filters | |
parent | Initial commit. (diff) | |
download | mariadb-10.5-a175314c3e5827eb193872241446f2f8f5c9d33c.tar.xz mariadb-10.5-a175314c3e5827eb193872241446f2f8f5c9d33c.zip |
Adding upstream version 1:10.5.12. (refs: upstream/1%10.5.12, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/mroonga/vendor/groonga/plugins/token_filters')
6 files changed, 533 insertions, 0 deletions
diff --git a/storage/mroonga/vendor/groonga/plugins/token_filters/CMakeLists.txt b/storage/mroonga/vendor/groonga/plugins/token_filters/CMakeLists.txt new file mode 100644 index 00000000..4aa7d09b --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/token_filters/CMakeLists.txt @@ -0,0 +1,63 @@ +# Copyright(C) 2014 Brazil +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License version 2.1 as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/../../lib + ) + +set(TOKEN_FILTERS_DIR "${GRN_RELATIVE_PLUGINS_DIR}/token_filters") + +read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/stop_word_sources.am + STOP_WORD_SOURCES) +set_source_files_properties(${STOP_WORD_SOURCES} + PROPERTIES + COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}") +if(GRN_EMBED) + add_library(stop_word_token_filter STATIC ${STOP_WORD_SOURCES}) + set_target_properties( + stop_word_token_filter + PROPERTIES + POSITION_INDEPENDENT_CODE ON) +else() + add_library(stop_word_token_filter MODULE ${STOP_WORD_SOURCES}) + set_target_properties(stop_word_token_filter PROPERTIES + PREFIX "" + OUTPUT_NAME "stop_word") + install(TARGETS stop_word_token_filter DESTINATION "${TOKEN_FILTERS_DIR}") +endif() +target_link_libraries(stop_word_token_filter libgroonga) + +if(GRN_WITH_LIBSTEMMER) + read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/stem_sources.am STEM_SOURCES) + include_directories(${LIBSTEMMER_INCLUDE_DIRS}) + 
link_directories(${LIBSTEMMER_LIBRARY_DIRS}) + set_source_files_properties(${STEM_SOURCES} + PROPERTIES + COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}") + if(GRN_EMBED) + add_library(stem_token_filter STATIC ${STEM_SOURCES}) + set_target_properties( + stem_token_filter + PROPERTIES + POSITION_INDEPENDENT_CODE ON) + else() + add_library(stem_token_filter MODULE ${STEM_SOURCES}) + set_target_properties(stem_token_filter PROPERTIES + PREFIX "" + OUTPUT_NAME "stem") + install(TARGETS stem_token_filter DESTINATION "${TOKEN_FILTERS_DIR}") + endif() + target_link_libraries(stem_token_filter libgroonga ${LIBSTEMMER_LIBRARIES}) +endif() diff --git a/storage/mroonga/vendor/groonga/plugins/token_filters/Makefile.am b/storage/mroonga/vendor/groonga/plugins/token_filters/Makefile.am new file mode 100644 index 00000000..c63bef7a --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/token_filters/Makefile.am @@ -0,0 +1,28 @@ +EXTRA_DIST = \ + CMakeLists.txt + +AM_CPPFLAGS = \ + -I$(top_builddir) \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/lib + +AM_LDFLAGS = \ + -avoid-version \ + -module \ + -no-undefined + +LIBS = \ + $(top_builddir)/lib/libgroonga.la + +token_filter_plugins_LTLIBRARIES = +token_filter_plugins_LTLIBRARIES += stop_word.la +if WITH_LIBSTEMMER +token_filter_plugins_LTLIBRARIES += stem.la +endif + +include stop_word_sources.am + +include stem_sources.am +stem_la_CPPFLAGS = $(AM_CPPFLAGS) $(LIBSTEMMER_CFLAGS) +stem_la_LIBADD = $(LIBS) $(LIBSTEMMER_LIBS) +stem_la_LDFLAGS = $(AM_LDFLAGS) $(LIBSTEMMER_LDFLAGS) diff --git a/storage/mroonga/vendor/groonga/plugins/token_filters/stem.c b/storage/mroonga/vendor/groonga/plugins/token_filters/stem.c new file mode 100644 index 00000000..2144eb09 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/token_filters/stem.c @@ -0,0 +1,279 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2014 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General 
Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA +*/ + +#ifdef GRN_EMBEDDED +# define GRN_PLUGIN_FUNCTION_TAG token_filters_stem +#endif + +#include <grn_str.h> + +#include <groonga.h> +#include <groonga/token_filter.h> + +#include <ctype.h> +#include <string.h> + +#include <libstemmer.h> + +typedef struct { + struct sb_stemmer *stemmer; + grn_tokenizer_token token; + grn_obj buffer; +} grn_stem_token_filter; + +static void * +stem_init(grn_ctx *ctx, grn_obj *table, grn_token_mode mode) +{ + grn_stem_token_filter *token_filter; + + token_filter = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_stem_token_filter)); + if (!token_filter) { + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[token-filter][stem] " + "failed to allocate grn_stem_token_filter"); + return NULL; + } + + { + /* TODO: Support other languages. 
*/ + const char *algorithm = "english"; + const char *encoding = "UTF_8"; + token_filter->stemmer = sb_stemmer_new(algorithm, encoding); + if (!token_filter->stemmer) { + GRN_PLUGIN_FREE(ctx, token_filter); + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "[token-filter][stem] " + "failed to create stemmer: " + "algorithm=<%s>, encoding=<%s>", + algorithm, encoding); + return NULL; + } + } + grn_tokenizer_token_init(ctx, &(token_filter->token)); + GRN_TEXT_INIT(&(token_filter->buffer), 0); + + return token_filter; +} + +static grn_bool +is_stemmable(grn_obj *data, grn_bool *is_all_upper) +{ + const char *current, *end; + grn_bool have_lower = GRN_FALSE; + grn_bool have_upper = GRN_FALSE; + + *is_all_upper = GRN_FALSE; + + switch (data->header.domain) { + case GRN_DB_SHORT_TEXT : + case GRN_DB_TEXT : + case GRN_DB_LONG_TEXT : + break; + default : + return GRN_FALSE; + } + + current = GRN_TEXT_VALUE(data); + end = current + GRN_TEXT_LEN(data); + + for (; current < end; current++) { + if (islower((unsigned char)*current)) { + have_lower = GRN_TRUE; + continue; + } + if (isupper((unsigned char)*current)) { + have_upper = GRN_TRUE; + continue; + } + if (isdigit((unsigned char)*current)) { + continue; + } + switch (*current) { + case '-' : + case '\'' : + break; + default : + return GRN_FALSE; + } + } + + if (!have_lower && have_upper) { + *is_all_upper = GRN_TRUE; + } + + return GRN_TRUE; +} + +static void +normalize(grn_ctx *ctx, + const char *string, unsigned int length, + grn_obj *normalized) +{ + const char *current, *end; + const char *unwritten; + + current = unwritten = string; + end = current + length; + + for (; current < end; current++) { + if (isupper((unsigned char)*current)) { + if (current > unwritten) { + GRN_TEXT_PUT(ctx, normalized, unwritten, current - unwritten); + } + GRN_TEXT_PUTC(ctx, normalized, tolower((unsigned char)*current)); + unwritten = current + 1; + } + } + + if (current != unwritten) { + GRN_TEXT_PUT(ctx, normalized, unwritten, current - 
unwritten); + } +} + +static void +unnormalize(grn_ctx *ctx, + const char *string, unsigned int length, + grn_obj *normalized) +{ + const char *current, *end; + const char *unwritten; + + current = unwritten = string; + end = current + length; + + for (; current < end; current++) { + if (islower((unsigned char)*current)) { + if (current > unwritten) { + GRN_TEXT_PUT(ctx, normalized, unwritten, current - unwritten); + } + GRN_TEXT_PUTC(ctx, normalized, toupper((unsigned char)*current)); + unwritten = current + 1; + } + } + + if (current != unwritten) { + GRN_TEXT_PUT(ctx, normalized, unwritten, current - unwritten); + } +} + +static void +stem_filter(grn_ctx *ctx, + grn_token *current_token, + grn_token *next_token, + void *user_data) +{ + grn_stem_token_filter *token_filter = user_data; + grn_obj *data; + grn_bool is_all_upper = GRN_FALSE; + + if (GRN_CTX_GET_ENCODING(ctx) != GRN_ENC_UTF8) { + return; + } + + data = grn_token_get_data(ctx, current_token); + if (!is_stemmable(data, &is_all_upper)) { + return; + } + + { + const sb_symbol *stemmed; + + if (is_all_upper) { + grn_obj *buffer; + buffer = &(token_filter->buffer); + GRN_BULK_REWIND(buffer); + normalize(ctx, + GRN_TEXT_VALUE(data), + GRN_TEXT_LEN(data), + buffer); + stemmed = sb_stemmer_stem(token_filter->stemmer, + GRN_TEXT_VALUE(buffer), GRN_TEXT_LEN(buffer)); + if (stemmed) { + GRN_BULK_REWIND(buffer); + unnormalize(ctx, + stemmed, + sb_stemmer_length(token_filter->stemmer), + buffer); + grn_token_set_data(ctx, next_token, + GRN_TEXT_VALUE(buffer), GRN_TEXT_LEN(buffer)); + } else { + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[token-filter][stem] " + "failed to allocate memory for stemmed word: <%.*s> " + "(normalized: <%.*s>)", + (int)GRN_TEXT_LEN(data), GRN_TEXT_VALUE(data), + (int)GRN_TEXT_LEN(buffer), GRN_TEXT_VALUE(buffer)); + } + } else { + stemmed = sb_stemmer_stem(token_filter->stemmer, + GRN_TEXT_VALUE(data), GRN_TEXT_LEN(data)); + if (stemmed) { + grn_token_set_data(ctx, next_token, + 
stemmed, + sb_stemmer_length(token_filter->stemmer)); + } else { + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[token-filter][stem] " + "failed to allocate memory for stemmed word: <%.*s>", + (int)GRN_TEXT_LEN(data), GRN_TEXT_VALUE(data)); + } + } + } +} + +static void +stem_fin(grn_ctx *ctx, void *user_data) +{ + grn_stem_token_filter *token_filter = user_data; + if (!token_filter) { + return; + } + + grn_tokenizer_token_fin(ctx, &(token_filter->token)); + if (token_filter->stemmer) { + sb_stemmer_delete(token_filter->stemmer); + } + GRN_OBJ_FIN(ctx, &(token_filter->buffer)); + GRN_PLUGIN_FREE(ctx, token_filter); +} + +grn_rc +GRN_PLUGIN_INIT(grn_ctx *ctx) +{ + return ctx->rc; +} + +grn_rc +GRN_PLUGIN_REGISTER(grn_ctx *ctx) +{ + grn_rc rc; + + rc = grn_token_filter_register(ctx, + "TokenFilterStem", -1, + stem_init, + stem_filter, + stem_fin); + + return rc; +} + +grn_rc +GRN_PLUGIN_FIN(grn_ctx *ctx) +{ + return GRN_SUCCESS; +} diff --git a/storage/mroonga/vendor/groonga/plugins/token_filters/stem_sources.am b/storage/mroonga/vendor/groonga/plugins/token_filters/stem_sources.am new file mode 100644 index 00000000..d02a3952 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/token_filters/stem_sources.am @@ -0,0 +1,2 @@ +stem_la_SOURCES = \ + stem.c diff --git a/storage/mroonga/vendor/groonga/plugins/token_filters/stop_word.c b/storage/mroonga/vendor/groonga/plugins/token_filters/stop_word.c new file mode 100644 index 00000000..a06d772f --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/token_filters/stop_word.c @@ -0,0 +1,159 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2014 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. 
+ + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA +*/ + +#ifdef GRN_EMBEDDED +# define GRN_PLUGIN_FUNCTION_TAG token_filters_stop_word +#endif + +#include <grn_str.h> + +#include <groonga.h> +#include <groonga/token_filter.h> + +#include <string.h> + +#define COLUMN_NAME "is_stop_word" + +typedef struct { + grn_obj *table; + grn_token_mode mode; + grn_obj *column; + grn_obj value; + grn_tokenizer_token token; +} grn_stop_word_token_filter; + +static void * +stop_word_init(grn_ctx *ctx, grn_obj *table, grn_token_mode mode) +{ + grn_stop_word_token_filter *token_filter; + + if (mode != GRN_TOKEN_GET) { + return NULL; + } + + token_filter = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_stop_word_token_filter)); + if (!token_filter) { + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[token-filter][stop-word] " + "failed to allocate grn_stop_word_token_filter"); + return NULL; + } + + token_filter->table = table; + token_filter->mode = mode; + token_filter->column = grn_obj_column(ctx, + token_filter->table, + COLUMN_NAME, + strlen(COLUMN_NAME)); + if (!token_filter->column) { + char table_name[GRN_TABLE_MAX_KEY_SIZE]; + unsigned int table_name_size; + + table_name_size = grn_obj_name(ctx, + token_filter->table, + table_name, + GRN_TABLE_MAX_KEY_SIZE); + GRN_PLUGIN_ERROR(ctx, GRN_TOKEN_FILTER_ERROR, + "[token-filter][stop-word] " + "column for judging stop word doesn't exit: <%.*s.%s>", + table_name_size, + table_name, + COLUMN_NAME); + GRN_PLUGIN_FREE(ctx, token_filter); + return NULL; + } + + GRN_BOOL_INIT(&(token_filter->value), 0); + grn_tokenizer_token_init(ctx, 
&(token_filter->token)); + + return token_filter; +} + +static void +stop_word_filter(grn_ctx *ctx, + grn_token *current_token, + grn_token *next_token, + void *user_data) +{ + grn_stop_word_token_filter *token_filter = user_data; + grn_id id; + grn_obj *data; + + if (!token_filter) { + return; + } + + data = grn_token_get_data(ctx, current_token); + id = grn_table_get(ctx, + token_filter->table, + GRN_TEXT_VALUE(data), + GRN_TEXT_LEN(data)); + if (id != GRN_ID_NIL) { + GRN_BULK_REWIND(&(token_filter->value)); + grn_obj_get_value(ctx, + token_filter->column, + id, + &(token_filter->value)); + if (GRN_BOOL_VALUE(&(token_filter->value))) { + grn_tokenizer_status status; + status = grn_token_get_status(ctx, current_token); + status |= GRN_TOKEN_SKIP; + grn_token_set_status(ctx, next_token, status); + } + } +} + +static void +stop_word_fin(grn_ctx *ctx, void *user_data) +{ + grn_stop_word_token_filter *token_filter = user_data; + if (!token_filter) { + return; + } + + grn_tokenizer_token_fin(ctx, &(token_filter->token)); + grn_obj_unlink(ctx, token_filter->column); + grn_obj_unlink(ctx, &(token_filter->value)); + GRN_PLUGIN_FREE(ctx, token_filter); +} + +grn_rc +GRN_PLUGIN_INIT(grn_ctx *ctx) +{ + return ctx->rc; +} + +grn_rc +GRN_PLUGIN_REGISTER(grn_ctx *ctx) +{ + grn_rc rc; + + rc = grn_token_filter_register(ctx, + "TokenFilterStopWord", -1, + stop_word_init, + stop_word_filter, + stop_word_fin); + + return rc; +} + +grn_rc +GRN_PLUGIN_FIN(grn_ctx *ctx) +{ + return GRN_SUCCESS; +} diff --git a/storage/mroonga/vendor/groonga/plugins/token_filters/stop_word_sources.am b/storage/mroonga/vendor/groonga/plugins/token_filters/stop_word_sources.am new file mode 100644 index 00000000..bab89551 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/token_filters/stop_word_sources.am @@ -0,0 +1,2 @@ +stop_word_la_SOURCES = \ + stop_word.c |