summaryrefslogtreecommitdiffstats
path: root/storage/mroonga/vendor/groonga/plugins/tokenizers
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 18:07:14 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 18:07:14 +0000
commita175314c3e5827eb193872241446f2f8f5c9d33c (patch)
treecd3d60ca99ae00829c52a6ca79150a5b6e62528b /storage/mroonga/vendor/groonga/plugins/tokenizers
parentInitial commit. (diff)
downloadmariadb-10.5-a175314c3e5827eb193872241446f2f8f5c9d33c.tar.xz
mariadb-10.5-a175314c3e5827eb193872241446f2f8f5c9d33c.zip
Adding upstream version 1:10.5.12.upstream/1%10.5.12upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/mroonga/vendor/groonga/plugins/tokenizers')
-rw-r--r--storage/mroonga/vendor/groonga/plugins/tokenizers/CMakeLists.txt76
-rw-r--r--storage/mroonga/vendor/groonga/plugins/tokenizers/Makefile.am33
-rw-r--r--storage/mroonga/vendor/groonga/plugins/tokenizers/kytea.cpp358
-rw-r--r--storage/mroonga/vendor/groonga/plugins/tokenizers/kytea_sources.am2
-rw-r--r--storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c660
-rw-r--r--storage/mroonga/vendor/groonga/plugins/tokenizers/mecab_sources.am2
6 files changed, 1131 insertions, 0 deletions
diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/CMakeLists.txt b/storage/mroonga/vendor/groonga/plugins/tokenizers/CMakeLists.txt
new file mode 100644
index 00000000..26aadc4e
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/CMakeLists.txt
@@ -0,0 +1,76 @@
+# Copyright(C) 2012-2013 Brazil
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License version 2.1 as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+# Add groonga's lib/ directory (two levels above this one) to the include
+# path of every target defined in this directory.
+include_directories(
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../lib
+ )
+
+# Install destination shared by the tokenizer plugin modules defined below.
+set(TOKENIZERS_DIR "${GRN_RELATIVE_PLUGINS_DIR}/tokenizers")
+# MeCab tokenizer plugin. Only configured when GRN_WITH_MECAB is enabled.
+if(GRN_WITH_MECAB)
+ # The source list lives in mecab_sources.am so it can be shared with the
+ # automake build.
+ # NOTE(review): read_file_list() is defined outside this file; presumably it
+ # extracts the file names of the *_SOURCES assignment -- confirm.
+ read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/mecab_sources.am MECAB_SOURCES)
+ include_directories(${MECAB_INCLUDE_DIRS})
+ link_directories(${MECAB_LIBRARY_DIRS})
+ if(GRN_WITH_BUNDLED_MECAB)
+ # With a bundled MeCab the plugin must be told where mecabrc is installed.
+ # Both a path relative to the install prefix and an absolute path are
+ # passed as string-literal preprocessor definitions (hence the escaped
+ # quotes); mecab.c uses the relative one on WIN32 and the absolute one
+ # elsewhere.
+ set(GRN_BUNDLED_MECAB_RELATIVE_RC_PATH "${CONFIG_DIR}/mecabrc")
+ set(MECAB_COMPILE_DEFINITIONS
+ "GRN_WITH_BUNDLED_MECAB"
+ "GRN_BUNDLED_MECAB_RELATIVE_RC_PATH=\"${GRN_BUNDLED_MECAB_RELATIVE_RC_PATH}\""
+ "GRN_BUNDLED_MECAB_RC_PATH=\"${CMAKE_INSTALL_PREFIX}/${GRN_BUNDLED_MECAB_RELATIVE_RC_PATH}\"")
+ set_source_files_properties(${MECAB_SOURCES}
+ PROPERTIES
+ COMPILE_DEFINITIONS
+ "${MECAB_COMPILE_DEFINITIONS}")
+ endif()
+ # Apply the project-wide C flags to the plugin sources.
+ set_source_files_properties(${MECAB_SOURCES}
+ PROPERTIES
+ COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}")
+ if(GRN_EMBED)
+ # Embedded build: a static library compiled as position independent code.
+ # It is not installed from here.
+ add_library(mecab_tokenizer STATIC ${MECAB_SOURCES})
+ set_target_properties(
+ mecab_tokenizer
+ PROPERTIES
+ POSITION_INDEPENDENT_CODE ON)
+ else()
+ # Stand-alone build: a loadable module named "mecab" (no "lib" prefix)
+ # installed into the tokenizers plugin directory.
+ add_library(mecab_tokenizer MODULE ${MECAB_SOURCES})
+ set_target_properties(mecab_tokenizer PROPERTIES
+ PREFIX ""
+ OUTPUT_NAME "mecab")
+ install(TARGETS mecab_tokenizer DESTINATION "${TOKENIZERS_DIR}")
+ endif()
+ target_link_libraries(mecab_tokenizer libgroonga ${MECAB_LIBRARIES})
+endif()
+
+# KyTea tokenizer plugin. Only configured when GRN_WITH_KYTEA is enabled.
+if(GRN_WITH_KYTEA)
+ # The source list lives in kytea_sources.am so it can be shared with the
+ # automake build.
+ # NOTE(review): read_file_list() is defined outside this file; presumably it
+ # extracts the file names of the *_SOURCES assignment -- confirm.
+ read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/kytea_sources.am KYTEA_SOURCES)
+ include_directories(${KYTEA_INCLUDE_DIRS})
+ link_directories(${KYTEA_LIBRARY_DIRS})
+ # The KyTea plugin is C++, so it takes the project-wide C++ flags.
+ set_source_files_properties(${KYTEA_SOURCES}
+ PROPERTIES
+ COMPILE_FLAGS "${GRN_CXX_COMPILE_FLAGS}")
+ if(GRN_EMBED)
+ # Embedded build: a static library compiled as position independent code.
+ # It is not installed from here.
+ add_library(kytea_tokenizer STATIC ${KYTEA_SOURCES})
+ set_target_properties(
+ kytea_tokenizer
+ PROPERTIES
+ POSITION_INDEPENDENT_CODE ON)
+ else()
+ # Stand-alone build: a loadable module named "kytea" (no "lib" prefix)
+ # installed into the tokenizers plugin directory.
+ add_library(kytea_tokenizer MODULE ${KYTEA_SOURCES})
+ set_target_properties(kytea_tokenizer PROPERTIES
+ PREFIX ""
+ OUTPUT_NAME "kytea")
+ install(TARGETS kytea_tokenizer DESTINATION "${TOKENIZERS_DIR}")
+ endif()
+ target_link_libraries(kytea_tokenizer libgroonga ${KYTEA_LIBRARIES})
+endif()
diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/Makefile.am b/storage/mroonga/vendor/groonga/plugins/tokenizers/Makefile.am
new file mode 100644
index 00000000..9e10612b
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/Makefile.am
@@ -0,0 +1,33 @@
+# Ship the CMake build description in the distribution tarball as well.
+EXTRA_DIST = \
+ CMakeLists.txt
+
+# Include paths common to every plugin in this directory.
+AM_CPPFLAGS = \
+ -I$(top_builddir) \
+ -I$(top_srcdir)/include \
+ -I$(top_srcdir)/lib
+
+# libtool flags for building the plugins as unversioned loadable modules.
+AM_LDFLAGS = \
+ -avoid-version \
+ -module \
+ -no-undefined
+
+# Every plugin links against the in-tree libgroonga.
+LIBS = \
+ $(top_builddir)/lib/libgroonga.la
+
+# Plugins are only built when the matching WITH_* conditional is enabled.
+tokenizer_plugins_LTLIBRARIES =
+if WITH_MECAB
+tokenizer_plugins_LTLIBRARIES += mecab.la
+endif
+if WITH_KYTEA
+tokenizer_plugins_LTLIBRARIES += kytea.la
+endif
+
+# mecab_sources.am defines mecab_la_SOURCES (also read by CMakeLists.txt).
+include mecab_sources.am
+mecab_la_CPPFLAGS = $(AM_CPPFLAGS) $(MECAB_CPPFLAGS)
+mecab_la_LIBADD = $(LIBS) $(MECAB_LIBS)
+mecab_la_LDFLAGS = $(AM_LDFLAGS) $(MECAB_LDFLAGS)
+
+# kytea_sources.am defines kytea_la_SOURCES (also read by CMakeLists.txt).
+# NOTE(review): unlike MeCab, the preprocessor flags come from KYTEA_CFLAGS
+# rather than a KYTEA_CPPFLAGS variable -- confirm this matches configure.
+include kytea_sources.am
+kytea_la_CPPFLAGS = $(AM_CPPFLAGS) $(KYTEA_CFLAGS)
+kytea_la_LIBADD = $(LIBS) $(KYTEA_LIBS)
+kytea_la_LDFLAGS = $(AM_LDFLAGS) $(KYTEA_LDFLAGS)
diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea.cpp b/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea.cpp
new file mode 100644
index 00000000..76d827c0
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea.cpp
@@ -0,0 +1,358 @@
+/* -*- c-basic-offset: 2 -*- */
+/* Copyright(C) 2012 Brazil
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License version 2.1 as published by the Free Software Foundation.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+*/
+
+#ifdef GRN_EMBEDDED
+# define GRN_PLUGIN_FUNCTION_TAG tokenizers_kytea
+#endif
+
+#include <groonga/tokenizer.h>
+
+#include <kytea/kytea.h>
+#include <kytea/string-util.h>
+
+#include <string.h>
+
+#include <string>
+#include <vector>
+
+namespace {
+
+// Plugin-wide state, created by kytea_init() and released by kytea_fin().
+// All four pointers are NULL while the plugin is not initialized.
+
+// Held by grn_kytea_init() around the calls into kytea_util/kytea_tagger.
+grn_plugin_mutex *kytea_mutex = NULL;
+// Configuration object; supplies the model file name to the tagger.
+kytea::KyteaConfig *kytea_config = NULL;
+// The tagger that performs word segmentation.
+kytea::Kytea *kytea_tagger = NULL;
+// Obtained from kytea_tagger->getStringUtil(); never freed by this plugin.
+kytea::StringUtil *kytea_util = NULL;
+
+void kytea_init(grn_ctx *ctx);
+void kytea_fin(grn_ctx *ctx);
+
+// Creates the plugin-wide KyTea state: the mutex, the configuration, the
+// tagger (including loading its model) and the string utility pointer.
+//
+// Errors are reported through GRN_PLUGIN_ERROR() on `ctx'. On every failure
+// path kytea_fin() is called first, so whatever was already created is
+// released and the globals are NULL again when this function returns.
+void kytea_init(grn_ctx *ctx) {
+ // Refuse to initialize twice; any non-NULL global means a previous
+ // kytea_init() has not been undone by kytea_fin().
+ if (kytea_mutex || kytea_config || kytea_tagger || kytea_util) {
+ GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+ "[tokenizer][kytea] "
+ "TokenKytea is already initialized");
+ return;
+ }
+
+ kytea_mutex = grn_plugin_mutex_open(ctx);
+ if (!kytea_mutex) {
+ kytea_fin(ctx);
+ GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+ "[tokenizer][kytea] "
+ "grn_plugin_mutex_open() failed");
+ return;
+ }
+
+ // Raw storage comes from the plugin allocator; the object itself is
+ // constructed below with placement new.
+ kytea::KyteaConfig * const config = static_cast<kytea::KyteaConfig *>(
+ GRN_PLUGIN_MALLOC(ctx, sizeof(kytea::KyteaConfig)));
+ if (!config) {
+ kytea_fin(ctx);
+ GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+ "[tokenizer][kytea] "
+ "memory allocation to kytea::KyteaConfig failed");
+ return;
+ }
+
+ try {
+ new (config) kytea::KyteaConfig;
+ // From here on kytea_fin() is responsible for destroying and freeing it.
+ kytea_config = config;
+ try {
+ kytea_config->setDebug(0);
+ kytea_config->setOnTraining(false);
+ kytea_config->parseRunCommandLine(0, NULL);
+ } catch (...) {
+ kytea_fin(ctx);
+ GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+ "[tokenizer][kytea] "
+ "kytea::KyteaConfig settings failed");
+ return;
+ }
+ } catch (...) {
+ // Only the constructor can reach this handler (the inner try catches
+ // everything else), so kytea_config was never set: free the raw storage
+ // directly instead of relying on kytea_fin().
+ GRN_PLUGIN_FREE(ctx, config);
+ kytea_fin(ctx);
+ GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+ "[tokenizer][kytea] "
+ "kytea::KyteaConfig initialization failed");
+ return;
+ }
+
+ // Same allocate-then-construct scheme for the tagger.
+ kytea::Kytea * const tagger = static_cast<kytea::Kytea *>(
+ GRN_PLUGIN_MALLOC(ctx, sizeof(kytea::Kytea)));
+ if (!tagger) {
+ kytea_fin(ctx);
+ GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+ "[tokenizer][kytea] "
+ "memory allocation to kytea::Kytea failed");
+ return;
+ }
+
+ try {
+ new (tagger) kytea::Kytea;
+ // From here on kytea_fin() is responsible for destroying and freeing it.
+ kytea_tagger = tagger;
+ try {
+ // Load the model named by the configuration.
+ kytea_tagger->readModel(kytea_config->getModelFile().c_str());
+ } catch (...) {
+ kytea_fin(ctx);
+ GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+ "[tokenizer][kytea] "
+ "kytea::Kytea::readModel() failed");
+ return;
+ }
+ } catch (...) {
+ // Constructor failure: kytea_tagger was never set, free the storage here.
+ GRN_PLUGIN_FREE(ctx, tagger);
+ kytea_fin(ctx);
+ GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+ "[tokenizer][kytea] "
+ "kytea::Kytea initialization failed");
+ return;
+ }
+
+ try {
+ kytea_util = kytea_tagger->getStringUtil();
+ } catch (...) {
+ kytea_fin(ctx);
+ GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+ "[tokenizer][kytea] "
+ "kytea::Kytea::getStringUtil() failed");
+ return;
+ }
+}
+
+// Tears down the plugin-wide KyTea state. Safe to call with any subset of the
+// globals set: each one is released only if non-NULL and reset to NULL
+// afterwards. The tagger goes first, then the configuration, then the mutex.
+void kytea_fin(grn_ctx *ctx) {
+  // The string utility is not freed here; it was obtained from the tagger.
+  kytea_util = NULL;
+
+  if (kytea_tagger != NULL) {
+    // Constructed with placement new on plugin-allocated storage, so destroy
+    // it explicitly and hand the storage back to the plugin allocator.
+    kytea::Kytea * const tagger = kytea_tagger;
+    tagger->~Kytea();
+    GRN_PLUGIN_FREE(ctx, tagger);
+    kytea_tagger = NULL;
+  }
+
+  if (kytea_config != NULL) {
+    kytea::KyteaConfig * const config = kytea_config;
+    config->~KyteaConfig();
+    GRN_PLUGIN_FREE(ctx, config);
+    kytea_config = NULL;
+  }
+
+  if (kytea_mutex != NULL) {
+    grn_plugin_mutex_close(ctx, kytea_mutex);
+    kytea_mutex = NULL;
+  }
+}
+
+// Per-tokenization state stored in `user_data->ptr' between the init, next
+// and fin callbacks.
+struct grn_tokenizer_kytea {
+  // Query opened by grn_kytea_init(); closed by grn_tokenizer_kytea_fin().
+  grn_tokenizer_query *query;
+  // KyTea sentence holding the word segmentation result.
+  kytea::KyteaSentence sentence;
+  // Tokens kept from `sentence', in output order.
+  std::vector<std::string> tokens;
+  // Index into `tokens' of the next token to emit.
+  std::size_t id;
+  // Output slot that receives each emitted token.
+  grn_tokenizer_token token;
+  // Remaining input and its length; only used when the query contains
+  // tokenized delimiters.
+  const char *rest_query_string;
+  unsigned int rest_query_string_length;
+
+  grn_tokenizer_kytea() :
+    query(NULL),
+    sentence(),
+    tokens(),
+    id(0),
+    token(),
+    rest_query_string(NULL),
+    // Was left uninitialized before; keep it consistent with the NULL
+    // rest_query_string so no indeterminate value is ever read.
+    rest_query_string_length(0)
+  {
+  }
+  ~grn_tokenizer_kytea() {}
+};
+
+// Constructs a grn_tokenizer_kytea in the caller-provided raw storage and
+// initializes its output token. May throw if the constructor throws.
+void grn_tokenizer_kytea_init(grn_ctx *ctx, grn_tokenizer_kytea *tokenizer) {
+  grn_tokenizer_kytea * const constructed = new (tokenizer) grn_tokenizer_kytea;
+  grn_tokenizer_token_init(ctx, &(constructed->token));
+}
+
+// Reverses grn_tokenizer_kytea_init(): finalizes the token, closes the query
+// if one was attached, and runs the destructor. The storage itself is not
+// freed here; that is left to the caller.
+void grn_tokenizer_kytea_fin(grn_ctx *ctx, grn_tokenizer_kytea *tokenizer) {
+  grn_tokenizer_token_fin(ctx, &(tokenizer->token));
+
+  grn_tokenizer_query * const attached_query = tokenizer->query;
+  if (attached_query != NULL) {
+    grn_tokenizer_query_close(ctx, attached_query);
+  }
+
+  tokenizer->~grn_tokenizer_kytea();
+}
+
+// Tokenizer "init" callback for TokenKytea.
+//
+// Opens the query, allocates the per-tokenization state and, unless the query
+// contains tokenized delimiters, runs KyTea word segmentation and collects
+// the tokens that contain no whitespace. On success the state is stored in
+// `user_data->ptr'. On failure an error is reported through
+// GRN_PLUGIN_ERROR(), everything allocated here is released and
+// `user_data->ptr' is left untouched. Always returns NULL.
+grn_obj *grn_kytea_init(grn_ctx *ctx, int num_args, grn_obj **args,
+                        grn_user_data *user_data) {
+  unsigned int normalizer_flags = 0;
+  grn_tokenizer_query * const query =
+    grn_tokenizer_query_open(ctx, num_args, args, normalizer_flags);
+  if (!query) {
+    return NULL;
+  }
+
+  grn_tokenizer_kytea * const tokenizer = static_cast<grn_tokenizer_kytea *>(
+    GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_kytea)));
+  if (!tokenizer) {
+    grn_tokenizer_query_close(ctx, query);
+    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                     "[tokenizer][kytea] "
+                     "memory allocation to grn_tokenizer_kytea failed");
+    return NULL;
+  }
+
+  try {
+    grn_tokenizer_kytea_init(ctx, tokenizer);
+  } catch (...) {
+    // Construction failed, so there is no object to destroy; only the raw
+    // storage has to be returned (it used to be leaked here).
+    GRN_PLUGIN_FREE(ctx, tokenizer);
+    grn_tokenizer_query_close(ctx, query);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer][kytea] "
+                     "tokenizer initialization failed");
+    return NULL;
+  }
+
+  // From here on the tokenizer owns the query: grn_tokenizer_kytea_fin()
+  // closes it.
+  tokenizer->query = query;
+
+  grn_obj *normalized_query = query->normalized_query;
+  const char *normalized_string;
+  unsigned int normalized_string_length;
+  grn_string_get_normalized(ctx,
+                            normalized_query,
+                            &normalized_string,
+                            &normalized_string_length,
+                            NULL);
+  if (tokenizer->query->have_tokenized_delimiter) {
+    // The input is already split by tokenized delimiters; grn_kytea_next()
+    // walks it directly, so KyTea is not involved.
+    tokenizer->rest_query_string = normalized_string;
+    tokenizer->rest_query_string_length = normalized_string_length;
+  } else {
+    // The shared KyTea objects are only used while holding the plugin mutex.
+    grn_plugin_mutex_lock(ctx, kytea_mutex);
+    try {
+      const std::string str(normalized_string, normalized_string_length);
+      const kytea::KyteaString &surface_str = kytea_util->mapString(str);
+      const kytea::KyteaString &normalized_str = kytea_util->normalize(surface_str);
+      tokenizer->sentence = kytea::KyteaSentence(surface_str, normalized_str);
+      kytea_tagger->calculateWS(tokenizer->sentence);
+    } catch (...) {
+      grn_plugin_mutex_unlock(ctx, kytea_mutex);
+      // Release the fully constructed tokenizer (and the query it owns);
+      // previously both were leaked on this path.
+      grn_tokenizer_kytea_fin(ctx, tokenizer);
+      GRN_PLUGIN_FREE(ctx, tokenizer);
+      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                       "[tokenizer][kytea] "
+                       "tokenization failed");
+      return NULL;
+    }
+    grn_plugin_mutex_unlock(ctx, kytea_mutex);
+
+    try {
+      for (std::size_t i = 0; i < tokenizer->sentence.words.size(); ++i) {
+        const std::string &token =
+          kytea_util->showString(tokenizer->sentence.words[i].surface);
+        const char *ptr = token.c_str();
+        unsigned int left = static_cast<unsigned int>(token.length());
+        // Scan the word character by character; stop early at an invalid
+        // byte sequence or at a whitespace character.
+        while (left > 0) {
+          const int char_length =
+            grn_tokenizer_charlen(ctx, ptr, left, query->encoding);
+          if ((char_length == 0) ||
+              (grn_tokenizer_isspace(ctx, ptr, left, query->encoding) != 0)) {
+            break;
+          }
+          ptr += char_length;
+          left -= char_length;
+        }
+        // Only words that were scanned to the end are kept as tokens.
+        if (left == 0) {
+          tokenizer->tokens.push_back(token);
+        }
+      }
+    } catch (...) {
+      // Same cleanup as above; previously the tokenizer and query leaked.
+      grn_tokenizer_kytea_fin(ctx, tokenizer);
+      GRN_PLUGIN_FREE(ctx, tokenizer);
+      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                       "[tokenizer][kytea] "
+                       "adjustment failed");
+      return NULL;
+    }
+  }
+
+  user_data->ptr = tokenizer;
+  return NULL;
+}
+
+// Tokenizer "next" callback for TokenKytea: pushes one token into
+// `tokenizer->token' per call. Always returns NULL.
+grn_obj *grn_kytea_next(grn_ctx *ctx, int num_args, grn_obj **args,
+                        grn_user_data *user_data) {
+  grn_tokenizer_kytea * const tokenizer =
+    static_cast<grn_tokenizer_kytea *>(user_data->ptr);
+
+  if (!tokenizer->query->have_tokenized_delimiter) {
+    // KyTea mode: emit the pre-computed tokens one by one. The status is
+    // decided before the index is advanced.
+    const std::size_t token_count = tokenizer->tokens.size();
+    const grn_tokenizer_status status =
+      ((tokenizer->id + 1) < token_count) ?
+      GRN_TOKENIZER_CONTINUE : GRN_TOKENIZER_LAST;
+
+    if (tokenizer->id >= token_count) {
+      // Nothing left (or nothing at all): emit an empty token.
+      grn_tokenizer_token_push(ctx, &tokenizer->token, "", 0, status);
+      return NULL;
+    }
+
+    const std::string &token = tokenizer->tokens[tokenizer->id];
+    tokenizer->id++;
+    grn_tokenizer_token_push(ctx, &tokenizer->token,
+                             token.c_str(), token.length(), status);
+    return NULL;
+  }
+
+  // Tokenized-delimiter mode: let the helper emit the next token and tell us
+  // where the remaining input starts.
+  const char * const current = tokenizer->rest_query_string;
+  const unsigned int current_length = tokenizer->rest_query_string_length;
+  const char * const next =
+    grn_tokenizer_tokenized_delimiter_next(ctx,
+                                           &(tokenizer->token),
+                                           current,
+                                           current_length,
+                                           tokenizer->query->encoding);
+  if (next) {
+    // Shrink the remaining length by the number of bytes consumed.
+    tokenizer->rest_query_string_length -= next - current;
+  }
+  tokenizer->rest_query_string = next;
+
+  return NULL;
+}
+
+// Tokenizer "fin" callback for TokenKytea: destroys and frees the state stored
+// in `user_data->ptr', if any. Always returns NULL.
+grn_obj *grn_kytea_fin(grn_ctx *ctx, int num_args, grn_obj **args,
+                       grn_user_data *user_data) {
+  grn_tokenizer_kytea * const tokenizer =
+    static_cast<grn_tokenizer_kytea *>(user_data->ptr);
+  if (!tokenizer) {
+    // grn_kytea_init() did not store a tokenizer; nothing to release.
+    return NULL;
+  }
+
+  grn_tokenizer_kytea_fin(ctx, tokenizer);
+  GRN_PLUGIN_FREE(ctx, tokenizer);
+  return NULL;
+}
+
+} // namespace
+
+extern "C" {
+
+/*
+  GRN_PLUGIN_INIT() sets up the plugin-wide KyTea state. kytea_init() reports
+  failures through `ctx', so the result is whatever ended up in `ctx->rc'.
+ */
+grn_rc GRN_PLUGIN_INIT(grn_ctx *ctx) {
+  kytea_init(ctx);
+  const grn_rc rc = ctx->rc;
+  return rc;
+}
+
+/*
+  GRN_PLUGIN_REGISTER() registers the tokenizer under the name "TokenKytea"
+  (10 bytes) in the database associated with `ctx', together with its
+  init/next/fin callbacks.
+ */
+grn_rc GRN_PLUGIN_REGISTER(grn_ctx *ctx) {
+  const grn_rc rc = grn_tokenizer_register(ctx, "TokenKytea", 10,
+                                           grn_kytea_init,
+                                           grn_kytea_next,
+                                           grn_kytea_fin);
+  return rc;
+}
+
+/*
+  GRN_PLUGIN_FIN() releases what GRN_PLUGIN_INIT() created. It always reports
+  success.
+ */
+grn_rc GRN_PLUGIN_FIN(grn_ctx *ctx) {
+  kytea_fin(ctx);
+  return GRN_SUCCESS;
+}
+
+}  // extern "C"
diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea_sources.am b/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea_sources.am
new file mode 100644
index 00000000..182f3857
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea_sources.am
@@ -0,0 +1,2 @@
+kytea_la_SOURCES = \
+ kytea.cpp
diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c
new file mode 100644
index 00000000..cabf2c94
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c
@@ -0,0 +1,660 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+ Copyright(C) 2009-2016 Brazil
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License version 2.1 as published by the Free Software Foundation.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+*/
+
+#ifdef GRN_EMBEDDED
+# define GRN_PLUGIN_FUNCTION_TAG tokenizers_mecab
+#endif
+
+#include <grn_str.h>
+
+#include <groonga.h>
+#include <groonga/tokenizer.h>
+
+#include <mecab.h>
+
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+/* Number of GRN_PLUGIN_INIT() calls not yet matched by GRN_PLUGIN_FIN(). */
+static unsigned int sole_mecab_init_counter = 0;
+/* The single MeCab instance shared by all tokenizations; created lazily by
+   mecab_init() and destroyed by GRN_PLUGIN_FIN(). */
+static mecab_t *sole_mecab = NULL;
+/* Held while creating sole_mecab and while parsing with it. */
+static grn_plugin_mutex *sole_mecab_mutex = NULL;
+/* Encoding of sole_mecab's dictionary, set right after sole_mecab is
+   created. */
+static grn_encoding sole_mecab_encoding = GRN_ENC_NONE;
+
+/* Set from the GRN_MECAB_CHUNKED_TOKENIZE_ENABLED environment variable
+   (enabled only by the exact value "yes"). */
+static grn_bool grn_mecab_chunked_tokenize_enabled = GRN_FALSE;
+/* Chunk size limit in bytes for chunked tokenization; may be overridden by
+   the GRN_MECAB_CHUNK_SIZE_THRESHOLD environment variable. */
+static int grn_mecab_chunk_size_threshold = 8192;
+
+/* Per-tokenization state stored in `user_data->ptr'. */
+typedef struct {
+ /* Points at sole_mecab; not owned by this struct. */
+ mecab_t *mecab;
+ /* Holds the MeCab output that `next'/`end' point into (unused when the
+    query has tokenized delimiters). */
+ grn_obj buf;
+ /* Start of the text that has not been emitted yet. */
+ const char *next;
+ /* One past the last byte of the text to tokenize. */
+ const char *end;
+ /* Query opened in mecab_init(); closed in mecab_fin(). */
+ grn_tokenizer_query *query;
+ /* Output slot that receives each emitted token. */
+ grn_tokenizer_token token;
+} grn_mecab_tokenizer;
+
+/*
+  Returns MeCab's global (instance-less) error message, or "Unknown" when the
+  linked MeCab is too old to provide one.
+ */
+static const char *
+mecab_global_error_message(void)
+{
+  const double version = atof(mecab_version());
+
+  /* MeCab <= 0.993 doesn't support mecab_strerror(NULL). */
+  return (version <= 0.993) ? "Unknown" : mecab_strerror(NULL);
+}
+
+
+/*
+  Maps a MeCab dictionary charset name (compared case-insensitively) to the
+  matching Groonga encoding. Unrecognized names yield GRN_ENC_NONE.
+ */
+static grn_encoding
+translate_mecab_charset_to_grn_encoding(const char *charset)
+{
+  if (grn_strcasecmp(charset, "euc-jp") == 0) {
+    return GRN_ENC_EUC_JP;
+  }
+
+  if (grn_strcasecmp(charset, "utf-8") == 0 ||
+      grn_strcasecmp(charset, "utf8") == 0) {
+    return GRN_ENC_UTF8;
+  }
+
+  if (grn_strcasecmp(charset, "shift_jis") == 0 ||
+      grn_strcasecmp(charset, "shift-jis") == 0 ||
+      grn_strcasecmp(charset, "sjis") == 0) {
+    return GRN_ENC_SJIS;
+  }
+
+  return GRN_ENC_NONE;
+}
+
+/*
+  Returns the Groonga encoding of the dictionary used by `mecab', or
+  GRN_ENC_NONE when MeCab reports no dictionary information or the charset is
+  not recognized.
+ */
+static grn_encoding
+get_mecab_encoding(mecab_t *mecab)
+{
+  const mecab_dictionary_info_t *info = mecab_dictionary_info(mecab);
+
+  if (!info) {
+    return GRN_ENC_NONE;
+  }
+  return translate_mecab_charset_to_grn_encoding(info->charset);
+}
+
+/*
+  Tells whether the UTF-8 character at `character' (`character_bytes' bytes
+  long) is one of the sentence delimiters used as chunk boundaries:
+
+    1 byte : ',' '.' '!' '?'
+    3 bytes: U+3001 IDEOGRAPHIC COMMA, U+3002 IDEOGRAPHIC FULL STOP,
+             U+FF01 FULLWIDTH EXCLAMATION MARK,
+             U+FF1F FULLWIDTH QUESTION MARK
+
+  Characters of any other length are never delimiters. `ctx' is unused.
+ */
+static inline grn_bool
+is_delimiter_character(grn_ctx *ctx, const char *character, int character_bytes)
+{
+  const unsigned char *bytes = (const unsigned char *)character;
+
+  if (character_bytes == 1) {
+    const char c = character[0];
+    if (c == ',' || c == '.' || c == '!' || c == '?') {
+      return GRN_TRUE;
+    }
+    return GRN_FALSE;
+  }
+
+  if (character_bytes != 3) {
+    return GRN_FALSE;
+  }
+
+  /* 0xE3 0x80 0x81 / 0xE3 0x80 0x82: U+3001 / U+3002 */
+  if (bytes[0] == 0xE3) {
+    if (bytes[1] == 0x80 && (bytes[2] == 0x81 || bytes[2] == 0x82)) {
+      return GRN_TRUE;
+    }
+    return GRN_FALSE;
+  }
+
+  /* 0xEF 0xBC 0x81 / 0xEF 0xBC 0x9F: U+FF01 / U+FF1F */
+  if (bytes[0] == 0xEF) {
+    if (bytes[1] == 0xBC && (bytes[2] == 0x81 || bytes[2] == 0x9F)) {
+      return GRN_TRUE;
+    }
+    return GRN_FALSE;
+  }
+
+  return GRN_FALSE;
+}
+
+/*
+  Runs MeCab on one chunk and appends the result to `tokenizer->buf'.
+
+  A single space is inserted before the result when the buffer already has
+  content, and one trailing whitespace character of MeCab's output (if any)
+  is dropped. Returns GRN_FALSE, after reporting an error on `ctx', when
+  MeCab fails; GRN_TRUE otherwise.
+ */
+static grn_bool
+chunked_tokenize_utf8_chunk(grn_ctx *ctx,
+                            grn_mecab_tokenizer *tokenizer,
+                            const char *chunk,
+                            unsigned int chunk_bytes)
+{
+  grn_obj *output = &(tokenizer->buf);
+  const char *tokenized_chunk;
+  size_t tokenized_chunk_length;
+
+  tokenized_chunk = mecab_sparse_tostr2(tokenizer->mecab, chunk, chunk_bytes);
+  if (!tokenized_chunk) {
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer][mecab][chunk] "
+                     "mecab_sparse_tostr2() failed len=%d err=%s",
+                     chunk_bytes,
+                     mecab_strerror(tokenizer->mecab));
+    return GRN_FALSE;
+  }
+
+  /* Separate this chunk's tokens from those of the previous chunk. */
+  if (GRN_TEXT_LEN(output) > 0) {
+    GRN_TEXT_PUTS(ctx, output, " ");
+  }
+
+  /* Drop a single trailing whitespace character, if there is one. */
+  tokenized_chunk_length = strlen(tokenized_chunk);
+  if (tokenized_chunk_length >= 1 &&
+      isspace((unsigned char)tokenized_chunk[tokenized_chunk_length - 1])) {
+    tokenized_chunk_length--;
+  }
+  GRN_TEXT_PUT(ctx, output, tokenized_chunk, tokenized_chunk_length);
+
+  return GRN_TRUE;
+}
+
+/*
+  Tokenizes `string' (`string_bytes' bytes) with MeCab in pieces instead of
+  in one call, appending all results to `tokenizer->buf'.
+
+  Input shorter than grn_mecab_chunk_size_threshold is passed to MeCab as a
+  single chunk. Otherwise the input is cut at every whitespace character and,
+  whenever the current chunk reaches the threshold, at the last delimiter
+  character seen in it (see is_delimiter_character()) or, if there was none,
+  at the current position.
+
+  Returns GRN_FALSE as soon as a chunk fails or an invalid byte sequence is
+  found (an error has been reported on `ctx' by then); GRN_TRUE otherwise.
+ */
+static grn_bool
+chunked_tokenize_utf8(grn_ctx *ctx,
+ grn_mecab_tokenizer *tokenizer,
+ const char *string,
+ unsigned int string_bytes)
+{
+ const char *chunk_start;
+ const char *current;
+ const char *last_delimiter;
+ const char *string_end = string + string_bytes;
+ grn_encoding encoding = tokenizer->query->encoding;
+
+ /* NOTE(review): `string_bytes' is unsigned and the threshold is a signed
+    int, so the threshold is converted to unsigned for this comparison. A
+    negative threshold (possible via the environment variable) therefore
+    always takes this single-chunk path -- confirm that is intended. */
+ if (string_bytes < grn_mecab_chunk_size_threshold) {
+ return chunked_tokenize_utf8_chunk(ctx,
+ tokenizer,
+ string,
+ string_bytes);
+ }
+
+ chunk_start = current = string;
+ last_delimiter = NULL;
+ while (current < string_end) {
+ int space_bytes;
+ int character_bytes;
+ const char *current_character;
+
+ /* Whitespace always ends the current chunk and is itself skipped. */
+ space_bytes = grn_isspace(current, encoding);
+ if (space_bytes > 0) {
+ if (chunk_start != current) {
+ grn_bool succeeded;
+ succeeded = chunked_tokenize_utf8_chunk(ctx,
+ tokenizer,
+ chunk_start,
+ current - chunk_start);
+ if (!succeeded) {
+ return succeeded;
+ }
+ }
+ current += space_bytes;
+ chunk_start = current;
+ last_delimiter = NULL;
+ continue;
+ }
+
+ character_bytes = grn_charlen_(ctx, current, string_end, encoding);
+ if (character_bytes == 0) {
+ GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+ "[tokenizer][mecab][chunk] "
+ "invalid byte sequence: position=%d",
+ (int)(current - string));
+ return GRN_FALSE;
+ }
+
+ current_character = current;
+ current += character_bytes;
+ /* Remember the position just after the delimiter, so that the delimiter
+    stays in the chunk it terminates. */
+ if (is_delimiter_character(ctx, current_character, character_bytes)) {
+ last_delimiter = current;
+ }
+
+ /* The chunk has grown to the threshold: flush it. */
+ if ((current - chunk_start) >= grn_mecab_chunk_size_threshold) {
+ grn_bool succeeded;
+ if (last_delimiter) {
+ /* Cut after the last delimiter; the bytes between it and `current'
+    become the beginning of the next chunk. */
+ succeeded = chunked_tokenize_utf8_chunk(ctx,
+ tokenizer,
+ chunk_start,
+ last_delimiter - chunk_start);
+ chunk_start = last_delimiter;
+ } else {
+ /* No delimiter in this chunk: cut at the current character
+    boundary. */
+ succeeded = chunked_tokenize_utf8_chunk(ctx,
+ tokenizer,
+ chunk_start,
+ current - chunk_start);
+ chunk_start = current;
+ }
+ if (!succeeded) {
+ return succeeded;
+ }
+ last_delimiter = NULL;
+ }
+ }
+
+ /* Flush whatever is left after the last cut. */
+ if (current == chunk_start) {
+ return GRN_TRUE;
+ } else {
+ return chunked_tokenize_utf8_chunk(ctx,
+ tokenizer,
+ chunk_start,
+ current - chunk_start);
+ }
+}
+
+/*
+  Creates a MeCab instance configured for wakati (space separated) output.
+
+  When built with GRN_WITH_BUNDLED_MECAB, "--rcfile <path>" is added so the
+  bundled mecabrc is used: on WIN32 the path is built at run time from
+  grn_plugin_windows_base_dir(), elsewhere it is the compile-time
+  GRN_BUNDLED_MECAB_RC_PATH.
+
+  Returns the new instance, or NULL after reporting an error on `ctx'.
+ */
+static mecab_t *
+mecab_create(grn_ctx *ctx)
+{
+ mecab_t *mecab;
+ int argc = 0;
+ /* Two arguments are always used; the bundled build adds two more. */
+ const char *argv[4];
+
+ argv[argc++] = "Groonga";
+ argv[argc++] = "-Owakati";
+#ifdef GRN_WITH_BUNDLED_MECAB
+ argv[argc++] = "--rcfile";
+# ifdef WIN32
+ {
+ /* Static because argv keeps pointing at it after this block ends. */
+ static char windows_mecab_rc_file[PATH_MAX];
+
+ grn_strcpy(windows_mecab_rc_file,
+ PATH_MAX,
+ grn_plugin_windows_base_dir());
+ grn_strcat(windows_mecab_rc_file,
+ PATH_MAX,
+ "/");
+ grn_strcat(windows_mecab_rc_file,
+ PATH_MAX,
+ GRN_BUNDLED_MECAB_RELATIVE_RC_PATH);
+ {
+ /* Convert every '/' in the path to '\'. */
+ char *c;
+ for (c = windows_mecab_rc_file; *c != '\0'; c++) {
+ if (*c == '/') {
+ *c = '\\';
+ }
+ }
+ }
+ argv[argc++] = windows_mecab_rc_file;
+ }
+# else /* WIN32 */
+ argv[argc++] = GRN_BUNDLED_MECAB_RC_PATH;
+# endif /* WIN32 */
+#endif /* GRN_WITH_BUNDLED_MECAB */
+ mecab = mecab_new(argc, (char **)argv);
+
+ if (!mecab) {
+ /* The message lists exactly the arguments that were passed for the
+    active build configuration. */
+#ifdef GRN_WITH_BUNDLED_MECAB
+ GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+ "[tokenizer][mecab] failed to create mecab_t: %s: "
+ "mecab_new(\"%s\", \"%s\", \"%s\", \"%s\")",
+ mecab_global_error_message(),
+ argv[0], argv[1], argv[2], argv[3]);
+#else /* GRN_WITH_BUNDLED_MECAB */
+ GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+ "[tokenizer][mecab] failed to create mecab_t: %s: "
+ "mecab_new(\"%s\", \"%s\")",
+ mecab_global_error_message(),
+ argv[0], argv[1]);
+#endif /* GRN_WITH_BUNDLED_MECAB */
+ }
+
+ return mecab;
+}
+
+/*
+  This function is called for a full text search query or a document to be
+  indexed. This means that both short/long strings are given.
+  The return value of this function is ignored. When an error occurs in this
+  function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS).
+
+  On success the per-tokenization state is stored in `user_data->ptr'. On
+  failure everything allocated here is released and `user_data->ptr' is left
+  untouched, so mecab_fin() has nothing to do.
+ */
+static grn_obj *
+mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  grn_mecab_tokenizer *tokenizer;
+  unsigned int normalizer_flags = 0;
+  grn_tokenizer_query *query;
+  grn_obj *normalized_query;
+  const char *normalized_string;
+  unsigned int normalized_string_length;
+
+  query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags);
+  if (!query) {
+    return NULL;
+  }
+
+  /* Create the shared MeCab instance on first use. The pointer is re-checked
+     after taking the mutex so only one caller creates it. */
+  if (!sole_mecab) {
+    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
+    if (!sole_mecab) {
+      sole_mecab = mecab_create(ctx);
+      if (sole_mecab) {
+        sole_mecab_encoding = get_mecab_encoding(sole_mecab);
+      }
+    }
+    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
+  }
+  if (!sole_mecab) {
+    /* mecab_create() has already reported the error. */
+    grn_tokenizer_query_close(ctx, query);
+    return NULL;
+  }
+
+  if (query->encoding != sole_mecab_encoding) {
+    grn_tokenizer_query_close(ctx, query);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer][mecab] "
+                     "MeCab dictionary charset (%s) does not match "
+                     "the table encoding: <%s>",
+                     grn_encoding_to_string(sole_mecab_encoding),
+                     grn_encoding_to_string(query->encoding));
+    return NULL;
+  }
+
+  if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) {
+    grn_tokenizer_query_close(ctx, query);
+    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                     "[tokenizer][mecab] "
+                     "memory allocation to grn_mecab_tokenizer failed");
+    return NULL;
+  }
+  tokenizer->mecab = sole_mecab;
+  tokenizer->query = query;
+
+  normalized_query = query->normalized_query;
+  grn_string_get_normalized(ctx,
+                            normalized_query,
+                            &normalized_string,
+                            &normalized_string_length,
+                            NULL);
+  GRN_TEXT_INIT(&(tokenizer->buf), 0);
+  if (query->have_tokenized_delimiter) {
+    /* The input is already split by tokenized delimiters; mecab_next() walks
+       the normalized string directly and MeCab is not involved. */
+    tokenizer->next = normalized_string;
+    tokenizer->end = tokenizer->next + normalized_string_length;
+  } else if (normalized_string_length == 0) {
+    tokenizer->next = "";
+    tokenizer->end = tokenizer->next;
+  } else {
+    grn_bool succeeded;
+    /* The shared MeCab instance is only used while holding the mutex. */
+    grn_plugin_mutex_lock(ctx, sole_mecab_mutex);
+    if (grn_mecab_chunked_tokenize_enabled &&
+        ctx->encoding == GRN_ENC_UTF8) {
+      succeeded = chunked_tokenize_utf8(ctx,
+                                        tokenizer,
+                                        normalized_string,
+                                        normalized_string_length);
+    } else {
+      const char *s;
+      s = mecab_sparse_tostr2(tokenizer->mecab,
+                              normalized_string,
+                              normalized_string_length);
+      if (!s) {
+        succeeded = GRN_FALSE;
+        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                         "[tokenizer][mecab] "
+                         "mecab_sparse_tostr() failed len=%d err=%s",
+                         normalized_string_length,
+                         mecab_strerror(tokenizer->mecab));
+      } else {
+        succeeded = GRN_TRUE;
+        GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s);
+      }
+    }
+    grn_plugin_mutex_unlock(ctx, sole_mecab_mutex);
+    if (!succeeded) {
+      /* The buffer may already hold the output of earlier chunks; release it
+         before the tokenizer that embeds it is freed. */
+      grn_obj_unlink(ctx, &(tokenizer->buf));
+      grn_tokenizer_query_close(ctx, tokenizer->query);
+      GRN_PLUGIN_FREE(ctx, tokenizer);
+      return NULL;
+    }
+    {
+      char *buf, *end;
+
+      buf = GRN_TEXT_VALUE(&(tokenizer->buf));
+      end = buf + GRN_TEXT_LEN(&(tokenizer->buf));
+      /* A certain version of mecab returns trailing lf or spaces. Trim them
+         starting from the very last byte, so that output without trailing
+         whitespace keeps its final character. */
+      while (buf < end && isspace((unsigned char)end[-1])) {
+        end--;
+        *end = '\0';
+      }
+      tokenizer->next = buf;
+      tokenizer->end = end;
+    }
+  }
+  user_data->ptr = tokenizer;
+
+  grn_tokenizer_token_init(ctx, &(tokenizer->token));
+
+  return NULL;
+}
+
+/*
+  This function returns tokens one by one.
+
+  Each call pushes one token into `tokenizer->token'. With tokenized
+  delimiters the work is delegated to
+  grn_tokenizer_tokenized_delimiter_next(); otherwise the next
+  whitespace-separated word of the range [next, end) is emitted.
+ */
+static grn_obj *
+mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+ /* grn_obj *table = args[0]; */
+ grn_mecab_tokenizer *tokenizer = user_data->ptr;
+ grn_encoding encoding = tokenizer->query->encoding;
+
+ if (tokenizer->query->have_tokenized_delimiter) {
+ tokenizer->next =
+ grn_tokenizer_tokenized_delimiter_next(ctx,
+ &(tokenizer->token),
+ tokenizer->next,
+ tokenizer->end - tokenizer->next,
+ encoding);
+ } else {
+ /* p: start of the token, r: scan position, e: end of the text,
+    cl: byte length of the character at r. */
+ size_t cl;
+ const char *p = tokenizer->next, *r;
+ const char *e = tokenizer->end;
+ grn_tokenizer_status status;
+
+ for (r = p; r < e; r += cl) {
+ int space_len;
+
+ space_len = grn_isspace(r, encoding);
+ /* Whitespace before the token has started: skip it and move the token
+    start along. */
+ if (space_len > 0 && r == p) {
+ cl = space_len;
+ p = r + cl;
+ continue;
+ }
+
+ /* Invalid byte sequence: stop here and treat the text as exhausted. */
+ if (!(cl = grn_charlen_(ctx, r, e, encoding))) {
+ tokenizer->next = e;
+ break;
+ }
+
+ /* Whitespace after at least one token character: the token ends at r.
+    Skip the whole run of whitespace so the next call starts on the
+    following word. */
+ if (space_len > 0) {
+ const char *q = r + space_len;
+ while (q < e && (space_len = grn_isspace(q, encoding))) {
+ q += space_len;
+ }
+ tokenizer->next = q;
+ break;
+ }
+ }
+
+ /* This is the last token when the scan reached the end of the text or
+    nothing is left for a following call. Note that `tokenizer->next' is
+    not updated when the loop ends because r reached e. */
+ if (r == e || tokenizer->next == e) {
+ status = GRN_TOKENIZER_LAST;
+ } else {
+ status = GRN_TOKENIZER_CONTINUE;
+ }
+ grn_tokenizer_token_push(ctx, &(tokenizer->token), p, r - p, status);
+ }
+
+ return NULL;
+}
+
+/*
+  This function finalizes a tokenization: it releases the token, the query,
+  the output buffer and finally the tokenizer state itself. It does nothing
+  when mecab_init() stored no state.
+ */
+static grn_obj *
+mecab_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  grn_mecab_tokenizer *tokenizer = user_data->ptr;
+
+  if (tokenizer) {
+    grn_tokenizer_token_fin(ctx, &(tokenizer->token));
+    grn_tokenizer_query_close(ctx, tokenizer->query);
+    grn_obj_unlink(ctx, &(tokenizer->buf));
+    GRN_PLUGIN_FREE(ctx, tokenizer);
+  }
+
+  return NULL;
+}
+
+/*
+  Verifies that MeCab's dictionary uses the context encoding by creating a
+  temporary MeCab instance. A mismatch is reported as an error on `ctx'; a
+  failure to create the instance has already been reported by mecab_create().
+  The check is compiled out unless HAVE_MECAB_DICTIONARY_INFO_T is defined.
+ */
+static void
+check_mecab_dictionary_encoding(grn_ctx *ctx)
+{
+#ifdef HAVE_MECAB_DICTIONARY_INFO_T
+  mecab_t *mecab = mecab_create(ctx);
+  grn_encoding encoding;
+  grn_encoding dictionary_encoding;
+
+  if (!mecab) {
+    return;
+  }
+
+  encoding = GRN_CTX_GET_ENCODING(ctx);
+  dictionary_encoding = get_mecab_encoding(mecab);
+  /* The instance was only needed to query the dictionary. */
+  mecab_destroy(mecab);
+
+  if (encoding == dictionary_encoding) {
+    return;
+  }
+
+  GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                   "[tokenizer][mecab] "
+                   "MeCab has no dictionary that uses the context encoding"
+                   ": <%s>",
+                   grn_encoding_to_string(encoding));
+#endif
+}
+
+/*
+  This function initializes a plugin. This function fails if there is no
+  dictionary that uses the context encoding of groonga.
+
+  Initialization is reference counted: only the first call does the work,
+  later calls just bump the counter and return GRN_SUCCESS.
+ */
+grn_rc
+GRN_PLUGIN_INIT(grn_ctx *ctx)
+{
+ /* NOTE(review): the counter is incremented before anything can fail and is
+    not rolled back on the error returns below. Whether GRN_PLUGIN_FIN() is
+    invoked after a failed init is not visible here -- confirm the counter
+    cannot get stuck above zero. */
+ ++sole_mecab_init_counter;
+ if (sole_mecab_init_counter > 1)
+ {
+ return GRN_SUCCESS;
+ }
+ {
+ char env[GRN_ENV_BUFFER_SIZE];
+
+ /* Chunked tokenization is enabled only by the exact value "yes". */
+ grn_getenv("GRN_MECAB_CHUNKED_TOKENIZE_ENABLED",
+ env,
+ GRN_ENV_BUFFER_SIZE);
+ grn_mecab_chunked_tokenize_enabled = (env[0] && strcmp(env, "yes") == 0);
+ }
+
+ {
+ char env[GRN_ENV_BUFFER_SIZE];
+
+ grn_getenv("GRN_MECAB_CHUNK_SIZE_THRESHOLD",
+ env,
+ GRN_ENV_BUFFER_SIZE);
+ if (env[0]) {
+ int threshold = -1;
+ const char *end;
+ const char *rest;
+
+ end = env + strlen(env);
+ threshold = grn_atoi(env, end, &rest);
+ /* Accept the value only if the whole string was consumed as a number.
+    NOTE(review): zero and negative values are accepted as well --
+    confirm that is intended. */
+ if (end > env && end == rest) {
+ grn_mecab_chunk_size_threshold = threshold;
+ }
+ }
+ }
+
+ /* The MeCab instance itself is created lazily by mecab_init(). */
+ sole_mecab = NULL;
+ sole_mecab_mutex = grn_plugin_mutex_open(ctx);
+ if (!sole_mecab_mutex) {
+ GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+ "[tokenizer][mecab] grn_plugin_mutex_open() failed");
+ return ctx->rc;
+ }
+
+ /* On an encoding mismatch the mutex is released again, so a failed init
+    leaves no mutex behind. */
+ check_mecab_dictionary_encoding(ctx);
+ if (ctx->rc != GRN_SUCCESS) {
+ grn_plugin_mutex_close(ctx, sole_mecab_mutex);
+ sole_mecab_mutex = NULL;
+ }
+
+ return ctx->rc;
+}
+
+/*
+  This function registers a plugin to a database: the tokenizer is registered
+  under the name "TokenMecab" (10 bytes) with its init/next/fin callbacks.
+ */
+grn_rc
+GRN_PLUGIN_REGISTER(grn_ctx *ctx)
+{
+  grn_obj *token_mecab;
+  grn_rc rc = grn_tokenizer_register(ctx, "TokenMecab", 10,
+                                     mecab_init, mecab_next, mecab_fin);
+
+  if (rc != GRN_SUCCESS) {
+    return rc;
+  }
+
+  /* Just for backward compatibility. TokenMecab was built-in not plugin. */
+  token_mecab = grn_ctx_get(ctx, "TokenMecab", 10);
+  if (token_mecab && grn_obj_id(ctx, token_mecab) != GRN_DB_MECAB) {
+    return GRN_FILE_CORRUPT;
+  }
+
+  return rc;
+}
+
+/*
+  This function finalizes a plugin. Finalization is reference counted: the
+  shared MeCab instance and the mutex are only released by the call that
+  brings the init counter back to zero. It always returns GRN_SUCCESS.
+ */
+grn_rc
+GRN_PLUGIN_FIN(grn_ctx *ctx)
+{
+  --sole_mecab_init_counter;
+
+  /* The counter is unsigned, so "not greater than zero" means exactly zero. */
+  if (sole_mecab_init_counter == 0) {
+    if (sole_mecab) {
+      mecab_destroy(sole_mecab);
+      sole_mecab = NULL;
+    }
+    if (sole_mecab_mutex) {
+      grn_plugin_mutex_close(ctx, sole_mecab_mutex);
+      sole_mecab_mutex = NULL;
+    }
+  }
+
+  return GRN_SUCCESS;
+}
diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab_sources.am b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab_sources.am
new file mode 100644
index 00000000..56912727
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab_sources.am
@@ -0,0 +1,2 @@
+mecab_la_SOURCES = \
+ mecab.c