summaryrefslogtreecommitdiffstats
path: root/storage/mroonga/vendor/groonga/plugins/token_filters
diff options
context:
space:
mode:
Diffstat (limited to 'storage/mroonga/vendor/groonga/plugins/token_filters')
-rw-r--r--storage/mroonga/vendor/groonga/plugins/token_filters/CMakeLists.txt63
-rw-r--r--storage/mroonga/vendor/groonga/plugins/token_filters/Makefile.am28
-rw-r--r--storage/mroonga/vendor/groonga/plugins/token_filters/stem.c279
-rw-r--r--storage/mroonga/vendor/groonga/plugins/token_filters/stem_sources.am2
-rw-r--r--storage/mroonga/vendor/groonga/plugins/token_filters/stop_word.c159
-rw-r--r--storage/mroonga/vendor/groonga/plugins/token_filters/stop_word_sources.am2
6 files changed, 533 insertions, 0 deletions
diff --git a/storage/mroonga/vendor/groonga/plugins/token_filters/CMakeLists.txt b/storage/mroonga/vendor/groonga/plugins/token_filters/CMakeLists.txt
new file mode 100644
index 00000000..4aa7d09b
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/plugins/token_filters/CMakeLists.txt
@@ -0,0 +1,63 @@
+# Copyright(C) 2014 Brazil
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License version 2.1 as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+include_directories(
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../lib
+ )
+
+set(TOKEN_FILTERS_DIR "${GRN_RELATIVE_PLUGINS_DIR}/token_filters")
+
+read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/stop_word_sources.am
+ STOP_WORD_SOURCES)
+set_source_files_properties(${STOP_WORD_SOURCES}
+ PROPERTIES
+ COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}")
+if(GRN_EMBED)
+ add_library(stop_word_token_filter STATIC ${STOP_WORD_SOURCES})
+ set_target_properties(
+ stop_word_token_filter
+ PROPERTIES
+ POSITION_INDEPENDENT_CODE ON)
+else()
+ add_library(stop_word_token_filter MODULE ${STOP_WORD_SOURCES})
+ set_target_properties(stop_word_token_filter PROPERTIES
+ PREFIX ""
+ OUTPUT_NAME "stop_word")
+ install(TARGETS stop_word_token_filter DESTINATION "${TOKEN_FILTERS_DIR}")
+endif()
+target_link_libraries(stop_word_token_filter libgroonga)
+
+if(GRN_WITH_LIBSTEMMER)
+ read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/stem_sources.am STEM_SOURCES)
+ include_directories(${LIBSTEMMER_INCLUDE_DIRS})
+ link_directories(${LIBSTEMMER_LIBRARY_DIRS})
+ set_source_files_properties(${STEM_SOURCES}
+ PROPERTIES
+ COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}")
+ if(GRN_EMBED)
+ add_library(stem_token_filter STATIC ${STEM_SOURCES})
+ set_target_properties(
+ stem_token_filter
+ PROPERTIES
+ POSITION_INDEPENDENT_CODE ON)
+ else()
+ add_library(stem_token_filter MODULE ${STEM_SOURCES})
+ set_target_properties(stem_token_filter PROPERTIES
+ PREFIX ""
+ OUTPUT_NAME "stem")
+ install(TARGETS stem_token_filter DESTINATION "${TOKEN_FILTERS_DIR}")
+ endif()
+ target_link_libraries(stem_token_filter libgroonga ${LIBSTEMMER_LIBRARIES})
+endif()
diff --git a/storage/mroonga/vendor/groonga/plugins/token_filters/Makefile.am b/storage/mroonga/vendor/groonga/plugins/token_filters/Makefile.am
new file mode 100644
index 00000000..c63bef7a
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/plugins/token_filters/Makefile.am
@@ -0,0 +1,28 @@
+EXTRA_DIST = \
+ CMakeLists.txt
+
+AM_CPPFLAGS = \
+ -I$(top_builddir) \
+ -I$(top_srcdir)/include \
+ -I$(top_srcdir)/lib
+
+AM_LDFLAGS = \
+ -avoid-version \
+ -module \
+ -no-undefined
+
+LIBS = \
+ $(top_builddir)/lib/libgroonga.la
+
+token_filter_plugins_LTLIBRARIES =
+token_filter_plugins_LTLIBRARIES += stop_word.la
+if WITH_LIBSTEMMER
+token_filter_plugins_LTLIBRARIES += stem.la
+endif
+
+include stop_word_sources.am
+
+include stem_sources.am
+stem_la_CPPFLAGS = $(AM_CPPFLAGS) $(LIBSTEMMER_CFLAGS)
+stem_la_LIBADD = $(LIBS) $(LIBSTEMMER_LIBS)
+stem_la_LDFLAGS = $(AM_LDFLAGS) $(LIBSTEMMER_LDFLAGS)
diff --git a/storage/mroonga/vendor/groonga/plugins/token_filters/stem.c b/storage/mroonga/vendor/groonga/plugins/token_filters/stem.c
new file mode 100644
index 00000000..2144eb09
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/plugins/token_filters/stem.c
@@ -0,0 +1,279 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+ Copyright(C) 2014 Brazil
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License version 2.1 as published by the Free Software Foundation.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+*/
+
+#ifdef GRN_EMBEDDED
+# define GRN_PLUGIN_FUNCTION_TAG token_filters_stem
+#endif
+
+#include <grn_str.h>
+
+#include <groonga.h>
+#include <groonga/token_filter.h>
+
+#include <ctype.h>
+#include <string.h>
+
+#include <libstemmer.h>
+
+typedef struct {
+ struct sb_stemmer *stemmer;
+ grn_tokenizer_token token;
+ grn_obj buffer;
+} grn_stem_token_filter;
+
+static void *
+stem_init(grn_ctx *ctx, grn_obj *table, grn_token_mode mode)
+{
+ grn_stem_token_filter *token_filter;
+
+ token_filter = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_stem_token_filter));
+ if (!token_filter) {
+ GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+ "[token-filter][stem] "
+ "failed to allocate grn_stem_token_filter");
+ return NULL;
+ }
+
+ {
+ /* TODO: Support other languages. */
+ const char *algorithm = "english";
+ const char *encoding = "UTF_8";
+ token_filter->stemmer = sb_stemmer_new(algorithm, encoding);
+ if (!token_filter->stemmer) {
+ GRN_PLUGIN_FREE(ctx, token_filter);
+ GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+ "[token-filter][stem] "
+ "failed to create stemmer: "
+ "algorithm=<%s>, encoding=<%s>",
+ algorithm, encoding);
+ return NULL;
+ }
+ }
+ grn_tokenizer_token_init(ctx, &(token_filter->token));
+ GRN_TEXT_INIT(&(token_filter->buffer), 0);
+
+ return token_filter;
+}
+
+static grn_bool
+is_stemmable(grn_obj *data, grn_bool *is_all_upper)
+{
+ const char *current, *end;
+ grn_bool have_lower = GRN_FALSE;
+ grn_bool have_upper = GRN_FALSE;
+
+ *is_all_upper = GRN_FALSE;
+
+ switch (data->header.domain) {
+ case GRN_DB_SHORT_TEXT :
+ case GRN_DB_TEXT :
+ case GRN_DB_LONG_TEXT :
+ break;
+ default :
+ return GRN_FALSE;
+ }
+
+ current = GRN_TEXT_VALUE(data);
+ end = current + GRN_TEXT_LEN(data);
+
+ for (; current < end; current++) {
+ if (islower((unsigned char)*current)) {
+ have_lower = GRN_TRUE;
+ continue;
+ }
+ if (isupper((unsigned char)*current)) {
+ have_upper = GRN_TRUE;
+ continue;
+ }
+ if (isdigit((unsigned char)*current)) {
+ continue;
+ }
+ switch (*current) {
+ case '-' :
+ case '\'' :
+ break;
+ default :
+ return GRN_FALSE;
+ }
+ }
+
+ if (!have_lower && have_upper) {
+ *is_all_upper = GRN_TRUE;
+ }
+
+ return GRN_TRUE;
+}
+
+static void
+normalize(grn_ctx *ctx,
+ const char *string, unsigned int length,
+ grn_obj *normalized)
+{
+ const char *current, *end;
+ const char *unwritten;
+
+ current = unwritten = string;
+ end = current + length;
+
+ for (; current < end; current++) {
+ if (isupper((unsigned char)*current)) {
+ if (current > unwritten) {
+ GRN_TEXT_PUT(ctx, normalized, unwritten, current - unwritten);
+ }
+ GRN_TEXT_PUTC(ctx, normalized, tolower((unsigned char)*current));
+ unwritten = current + 1;
+ }
+ }
+
+ if (current != unwritten) {
+ GRN_TEXT_PUT(ctx, normalized, unwritten, current - unwritten);
+ }
+}
+
+static void
+unnormalize(grn_ctx *ctx,
+ const char *string, unsigned int length,
+ grn_obj *normalized)
+{
+ const char *current, *end;
+ const char *unwritten;
+
+ current = unwritten = string;
+ end = current + length;
+
+ for (; current < end; current++) {
+ if (islower((unsigned char)*current)) {
+ if (current > unwritten) {
+ GRN_TEXT_PUT(ctx, normalized, unwritten, current - unwritten);
+ }
+ GRN_TEXT_PUTC(ctx, normalized, toupper((unsigned char)*current));
+ unwritten = current + 1;
+ }
+ }
+
+ if (current != unwritten) {
+ GRN_TEXT_PUT(ctx, normalized, unwritten, current - unwritten);
+ }
+}
+
+static void
+stem_filter(grn_ctx *ctx,
+ grn_token *current_token,
+ grn_token *next_token,
+ void *user_data)
+{
+ grn_stem_token_filter *token_filter = user_data;
+ grn_obj *data;
+ grn_bool is_all_upper = GRN_FALSE;
+
+ if (GRN_CTX_GET_ENCODING(ctx) != GRN_ENC_UTF8) {
+ return;
+ }
+
+ data = grn_token_get_data(ctx, current_token);
+ if (!is_stemmable(data, &is_all_upper)) {
+ return;
+ }
+
+ {
+ const sb_symbol *stemmed;
+
+ if (is_all_upper) {
+ grn_obj *buffer;
+ buffer = &(token_filter->buffer);
+ GRN_BULK_REWIND(buffer);
+ normalize(ctx,
+ GRN_TEXT_VALUE(data),
+ GRN_TEXT_LEN(data),
+ buffer);
+ stemmed = sb_stemmer_stem(token_filter->stemmer,
+ GRN_TEXT_VALUE(buffer), GRN_TEXT_LEN(buffer));
+ if (stemmed) {
+ GRN_BULK_REWIND(buffer);
+ unnormalize(ctx,
+ stemmed,
+ sb_stemmer_length(token_filter->stemmer),
+ buffer);
+ grn_token_set_data(ctx, next_token,
+ GRN_TEXT_VALUE(buffer), GRN_TEXT_LEN(buffer));
+ } else {
+ GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+ "[token-filter][stem] "
+ "failed to allocate memory for stemmed word: <%.*s> "
+ "(normalized: <%.*s>)",
+ (int)GRN_TEXT_LEN(data), GRN_TEXT_VALUE(data),
+ (int)GRN_TEXT_LEN(buffer), GRN_TEXT_VALUE(buffer));
+ }
+ } else {
+ stemmed = sb_stemmer_stem(token_filter->stemmer,
+ GRN_TEXT_VALUE(data), GRN_TEXT_LEN(data));
+ if (stemmed) {
+ grn_token_set_data(ctx, next_token,
+ stemmed,
+ sb_stemmer_length(token_filter->stemmer));
+ } else {
+ GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+ "[token-filter][stem] "
+ "failed to allocate memory for stemmed word: <%.*s>",
+ (int)GRN_TEXT_LEN(data), GRN_TEXT_VALUE(data));
+ }
+ }
+ }
+}
+
+static void
+stem_fin(grn_ctx *ctx, void *user_data)
+{
+ grn_stem_token_filter *token_filter = user_data;
+ if (!token_filter) {
+ return;
+ }
+
+ grn_tokenizer_token_fin(ctx, &(token_filter->token));
+ if (token_filter->stemmer) {
+ sb_stemmer_delete(token_filter->stemmer);
+ }
+ GRN_OBJ_FIN(ctx, &(token_filter->buffer));
+ GRN_PLUGIN_FREE(ctx, token_filter);
+}
+
+grn_rc
+GRN_PLUGIN_INIT(grn_ctx *ctx)
+{
+ return ctx->rc;
+}
+
+grn_rc
+GRN_PLUGIN_REGISTER(grn_ctx *ctx)
+{
+ grn_rc rc;
+
+ rc = grn_token_filter_register(ctx,
+ "TokenFilterStem", -1,
+ stem_init,
+ stem_filter,
+ stem_fin);
+
+ return rc;
+}
+
+grn_rc
+GRN_PLUGIN_FIN(grn_ctx *ctx)
+{
+ return GRN_SUCCESS;
+}
diff --git a/storage/mroonga/vendor/groonga/plugins/token_filters/stem_sources.am b/storage/mroonga/vendor/groonga/plugins/token_filters/stem_sources.am
new file mode 100644
index 00000000..d02a3952
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/plugins/token_filters/stem_sources.am
@@ -0,0 +1,2 @@
+stem_la_SOURCES = \
+ stem.c
diff --git a/storage/mroonga/vendor/groonga/plugins/token_filters/stop_word.c b/storage/mroonga/vendor/groonga/plugins/token_filters/stop_word.c
new file mode 100644
index 00000000..a06d772f
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/plugins/token_filters/stop_word.c
@@ -0,0 +1,159 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+ Copyright(C) 2014 Brazil
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License version 2.1 as published by the Free Software Foundation.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+*/
+
+#ifdef GRN_EMBEDDED
+# define GRN_PLUGIN_FUNCTION_TAG token_filters_stop_word
+#endif
+
+#include <grn_str.h>
+
+#include <groonga.h>
+#include <groonga/token_filter.h>
+
+#include <string.h>
+
+#define COLUMN_NAME "is_stop_word"
+
+typedef struct {
+ grn_obj *table;
+ grn_token_mode mode;
+ grn_obj *column;
+ grn_obj value;
+ grn_tokenizer_token token;
+} grn_stop_word_token_filter;
+
+static void *
+stop_word_init(grn_ctx *ctx, grn_obj *table, grn_token_mode mode)
+{
+ grn_stop_word_token_filter *token_filter;
+
+ if (mode != GRN_TOKEN_GET) {
+ return NULL;
+ }
+
+ token_filter = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_stop_word_token_filter));
+ if (!token_filter) {
+ GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+ "[token-filter][stop-word] "
+ "failed to allocate grn_stop_word_token_filter");
+ return NULL;
+ }
+
+ token_filter->table = table;
+ token_filter->mode = mode;
+ token_filter->column = grn_obj_column(ctx,
+ token_filter->table,
+ COLUMN_NAME,
+ strlen(COLUMN_NAME));
+ if (!token_filter->column) {
+ char table_name[GRN_TABLE_MAX_KEY_SIZE];
+ unsigned int table_name_size;
+
+ table_name_size = grn_obj_name(ctx,
+ token_filter->table,
+ table_name,
+ GRN_TABLE_MAX_KEY_SIZE);
+ GRN_PLUGIN_ERROR(ctx, GRN_TOKEN_FILTER_ERROR,
+ "[token-filter][stop-word] "
+ "column for judging stop word doesn't exit: <%.*s.%s>",
+ table_name_size,
+ table_name,
+ COLUMN_NAME);
+ GRN_PLUGIN_FREE(ctx, token_filter);
+ return NULL;
+ }
+
+ GRN_BOOL_INIT(&(token_filter->value), 0);
+ grn_tokenizer_token_init(ctx, &(token_filter->token));
+
+ return token_filter;
+}
+
+static void
+stop_word_filter(grn_ctx *ctx,
+ grn_token *current_token,
+ grn_token *next_token,
+ void *user_data)
+{
+ grn_stop_word_token_filter *token_filter = user_data;
+ grn_id id;
+ grn_obj *data;
+
+ if (!token_filter) {
+ return;
+ }
+
+ data = grn_token_get_data(ctx, current_token);
+ id = grn_table_get(ctx,
+ token_filter->table,
+ GRN_TEXT_VALUE(data),
+ GRN_TEXT_LEN(data));
+ if (id != GRN_ID_NIL) {
+ GRN_BULK_REWIND(&(token_filter->value));
+ grn_obj_get_value(ctx,
+ token_filter->column,
+ id,
+ &(token_filter->value));
+ if (GRN_BOOL_VALUE(&(token_filter->value))) {
+ grn_tokenizer_status status;
+ status = grn_token_get_status(ctx, current_token);
+ status |= GRN_TOKEN_SKIP;
+ grn_token_set_status(ctx, next_token, status);
+ }
+ }
+}
+
+static void
+stop_word_fin(grn_ctx *ctx, void *user_data)
+{
+ grn_stop_word_token_filter *token_filter = user_data;
+ if (!token_filter) {
+ return;
+ }
+
+ grn_tokenizer_token_fin(ctx, &(token_filter->token));
+ grn_obj_unlink(ctx, token_filter->column);
+ grn_obj_unlink(ctx, &(token_filter->value));
+ GRN_PLUGIN_FREE(ctx, token_filter);
+}
+
+grn_rc
+GRN_PLUGIN_INIT(grn_ctx *ctx)
+{
+ return ctx->rc;
+}
+
+grn_rc
+GRN_PLUGIN_REGISTER(grn_ctx *ctx)
+{
+ grn_rc rc;
+
+ rc = grn_token_filter_register(ctx,
+ "TokenFilterStopWord", -1,
+ stop_word_init,
+ stop_word_filter,
+ stop_word_fin);
+
+ return rc;
+}
+
+grn_rc
+GRN_PLUGIN_FIN(grn_ctx *ctx)
+{
+ return GRN_SUCCESS;
+}
diff --git a/storage/mroonga/vendor/groonga/plugins/token_filters/stop_word_sources.am b/storage/mroonga/vendor/groonga/plugins/token_filters/stop_word_sources.am
new file mode 100644
index 00000000..bab89551
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/plugins/token_filters/stop_word_sources.am
@@ -0,0 +1,2 @@
+stop_word_la_SOURCES = \
+ stop_word.c