From a175314c3e5827eb193872241446f2f8f5c9d33c Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 4 May 2024 20:07:14 +0200 Subject: Adding upstream version 1:10.5.12. Signed-off-by: Daniel Baumann --- .../groonga/plugins/tokenizers/CMakeLists.txt | 76 +++ .../vendor/groonga/plugins/tokenizers/Makefile.am | 33 ++ .../vendor/groonga/plugins/tokenizers/kytea.cpp | 358 +++++++++++ .../groonga/plugins/tokenizers/kytea_sources.am | 2 + .../vendor/groonga/plugins/tokenizers/mecab.c | 660 +++++++++++++++++++++ .../groonga/plugins/tokenizers/mecab_sources.am | 2 + 6 files changed, 1131 insertions(+) create mode 100644 storage/mroonga/vendor/groonga/plugins/tokenizers/CMakeLists.txt create mode 100644 storage/mroonga/vendor/groonga/plugins/tokenizers/Makefile.am create mode 100644 storage/mroonga/vendor/groonga/plugins/tokenizers/kytea.cpp create mode 100644 storage/mroonga/vendor/groonga/plugins/tokenizers/kytea_sources.am create mode 100644 storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c create mode 100644 storage/mroonga/vendor/groonga/plugins/tokenizers/mecab_sources.am (limited to 'storage/mroonga/vendor/groonga/plugins/tokenizers') diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/CMakeLists.txt b/storage/mroonga/vendor/groonga/plugins/tokenizers/CMakeLists.txt new file mode 100644 index 00000000..26aadc4e --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/CMakeLists.txt @@ -0,0 +1,76 @@ +# Copyright(C) 2012-2013 Brazil +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License version 2.1 as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA

# Build rules for the optional tokenizer plugins:
#   * "mecab" when GRN_WITH_MECAB is set,
#   * "kytea" when GRN_WITH_KYTEA is set.
# With GRN_EMBED each plugin is built as a position-independent STATIC
# library; otherwise it is a loadable MODULE (no "lib" prefix) installed into
# ${TOKENIZERS_DIR}.

set(TOKENIZERS_DIR "${GRN_RELATIVE_PLUGINS_DIR}/tokenizers")

# Both plugins get ../../lib on their include path.  It is attached per target
# below instead of through a directory-scoped include_directories() so that
# the MeCab include path is not applied to the KyTea target and vice versa.
set(TOKENIZERS_LIB_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../lib")

if(GRN_WITH_MECAB)
  read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/mecab_sources.am MECAB_SOURCES)
  # link_directories() only applies to targets created after the call, so it
  # must stay ahead of add_library().
  # NOTE(review): target_link_directories() would be the target-scoped form,
  # but it needs CMake >= 3.13; confirm the project's minimum version first.
  link_directories(${MECAB_LIBRARY_DIRS})
  if(GRN_WITH_BUNDLED_MECAB)
    # The bundled MeCab needs to be told where its mecabrc lives; the paths
    # are handed to the plugin sources as string macros.
    set(GRN_BUNDLED_MECAB_RELATIVE_RC_PATH "${CONFIG_DIR}/mecabrc")
    set(MECAB_COMPILE_DEFINITIONS
      "GRN_WITH_BUNDLED_MECAB"
      "GRN_BUNDLED_MECAB_RELATIVE_RC_PATH=\"${GRN_BUNDLED_MECAB_RELATIVE_RC_PATH}\""
      "GRN_BUNDLED_MECAB_RC_PATH=\"${CMAKE_INSTALL_PREFIX}/${GRN_BUNDLED_MECAB_RELATIVE_RC_PATH}\"")
    set_source_files_properties(${MECAB_SOURCES}
      PROPERTIES
      COMPILE_DEFINITIONS
      "${MECAB_COMPILE_DEFINITIONS}")
  endif()
  set_source_files_properties(${MECAB_SOURCES}
    PROPERTIES
    COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}")
  if(GRN_EMBED)
    add_library(mecab_tokenizer STATIC ${MECAB_SOURCES})
    set_target_properties(
      mecab_tokenizer
      PROPERTIES
      POSITION_INDEPENDENT_CODE ON)
  else()
    add_library(mecab_tokenizer MODULE ${MECAB_SOURCES})
    set_target_properties(mecab_tokenizer PROPERTIES
      PREFIX ""
      OUTPUT_NAME "mecab")
    install(TARGETS mecab_tokenizer DESTINATION "${TOKENIZERS_DIR}")
  endif()
  target_include_directories(mecab_tokenizer PRIVATE
    "${TOKENIZERS_LIB_INCLUDE_DIR}"
    ${MECAB_INCLUDE_DIRS})
  target_link_libraries(mecab_tokenizer PRIVATE libgroonga ${MECAB_LIBRARIES})
endif()

if(GRN_WITH_KYTEA)
  read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/kytea_sources.am KYTEA_SOURCES)
  # See the MeCab branch: must precede add_library().
  link_directories(${KYTEA_LIBRARY_DIRS})
  set_source_files_properties(${KYTEA_SOURCES}
    PROPERTIES
    COMPILE_FLAGS "${GRN_CXX_COMPILE_FLAGS}")
  if(GRN_EMBED)
    add_library(kytea_tokenizer STATIC ${KYTEA_SOURCES})
    set_target_properties(
      kytea_tokenizer
      PROPERTIES
      POSITION_INDEPENDENT_CODE ON)
  else()
    add_library(kytea_tokenizer MODULE ${KYTEA_SOURCES})
    set_target_properties(kytea_tokenizer PROPERTIES
      PREFIX ""
      OUTPUT_NAME "kytea")
    install(TARGETS kytea_tokenizer DESTINATION "${TOKENIZERS_DIR}")
  endif()
  target_include_directories(kytea_tokenizer PRIVATE
    "${TOKENIZERS_LIB_INCLUDE_DIR}"
    ${KYTEA_INCLUDE_DIRS})
  target_link_libraries(kytea_tokenizer PRIVATE libgroonga ${KYTEA_LIBRARIES})
endif()
diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/Makefile.am b/storage/mroonga/vendor/groonga/plugins/tokenizers/Makefile.am
new file mode 100644
index 00000000..9e10612b
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/Makefile.am
@@ -0,0 +1,33 @@
EXTRA_DIST = \
  CMakeLists.txt

AM_CPPFLAGS = \
  -I$(top_builddir) \
  -I$(top_srcdir)/include \
  -I$(top_srcdir)/lib

AM_LDFLAGS = \
  -avoid-version \
  -module \
  -no-undefined

LIBS = \
  $(top_builddir)/lib/libgroonga.la

tokenizer_plugins_LTLIBRARIES =
if WITH_MECAB
tokenizer_plugins_LTLIBRARIES += mecab.la
endif
if WITH_KYTEA
tokenizer_plugins_LTLIBRARIES += kytea.la
endif

include mecab_sources.am
mecab_la_CPPFLAGS = $(AM_CPPFLAGS) $(MECAB_CPPFLAGS)
mecab_la_LIBADD = $(LIBS) $(MECAB_LIBS)
mecab_la_LDFLAGS = $(AM_LDFLAGS) $(MECAB_LDFLAGS)

include kytea_sources.am
kytea_la_CPPFLAGS = $(AM_CPPFLAGS) $(KYTEA_CFLAGS)
kytea_la_LIBADD = $(LIBS) $(KYTEA_LIBS)
kytea_la_LDFLAGS = $(AM_LDFLAGS) $(KYTEA_LDFLAGS)
diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea.cpp b/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea.cpp
new file mode 100644
index 00000000..76d827c0
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea.cpp
@@ -0,0 +1,358 @@
/* -*- c-basic-offset: 2 -*- */
/* Copyright(C) 2012 Brazil

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License version 2.1 as published by the Free Software Foundation.
+ + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA +*/ + +#ifdef GRN_EMBEDDED +# define GRN_PLUGIN_FUNCTION_TAG tokenizers_kytea +#endif + +#include + +#include +#include + +#include + +#include +#include + +namespace { + +grn_plugin_mutex *kytea_mutex = NULL; +kytea::KyteaConfig *kytea_config = NULL; +kytea::Kytea *kytea_tagger = NULL; +kytea::StringUtil *kytea_util = NULL; + +void kytea_init(grn_ctx *ctx); +void kytea_fin(grn_ctx *ctx); + +void kytea_init(grn_ctx *ctx) { + if (kytea_mutex || kytea_config || kytea_tagger || kytea_util) { + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "TokenKytea is already initialized"); + return; + } + + kytea_mutex = grn_plugin_mutex_open(ctx); + if (!kytea_mutex) { + kytea_fin(ctx); + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[tokenizer][kytea] " + "grn_plugin_mutex_open() failed"); + return; + } + + kytea::KyteaConfig * const config = static_cast( + GRN_PLUGIN_MALLOC(ctx, sizeof(kytea::KyteaConfig))); + if (!config) { + kytea_fin(ctx); + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[tokenizer][kytea] " + "memory allocation to kytea::KyteaConfig failed"); + return; + } + + try { + new (config) kytea::KyteaConfig; + kytea_config = config; + try { + kytea_config->setDebug(0); + kytea_config->setOnTraining(false); + kytea_config->parseRunCommandLine(0, NULL); + } catch (...) { + kytea_fin(ctx); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "kytea::KyteaConfig settings failed"); + return; + } + } catch (...) 
{ + GRN_PLUGIN_FREE(ctx, config); + kytea_fin(ctx); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "kytea::KyteaConfig initialization failed"); + return; + } + + kytea::Kytea * const tagger = static_cast( + GRN_PLUGIN_MALLOC(ctx, sizeof(kytea::Kytea))); + if (!tagger) { + kytea_fin(ctx); + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[tokenizer][kytea] " + "memory allocation to kytea::Kytea failed"); + return; + } + + try { + new (tagger) kytea::Kytea; + kytea_tagger = tagger; + try { + kytea_tagger->readModel(kytea_config->getModelFile().c_str()); + } catch (...) { + kytea_fin(ctx); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "kytea::Kytea::readModel() failed"); + return; + } + } catch (...) { + GRN_PLUGIN_FREE(ctx, tagger); + kytea_fin(ctx); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "kytea::Kytea initialization failed"); + return; + } + + try { + kytea_util = kytea_tagger->getStringUtil(); + } catch (...) 
{ + kytea_fin(ctx); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "kytea::Kytea::getStringUtil() failed"); + return; + } +} + +void kytea_fin(grn_ctx *ctx) { + kytea_util = NULL; + + if (kytea_tagger) { + kytea_tagger->~Kytea(); + GRN_PLUGIN_FREE(ctx, kytea_tagger); + kytea_tagger = NULL; + } + + if (kytea_config) { + kytea_config->~KyteaConfig(); + GRN_PLUGIN_FREE(ctx, kytea_config); + kytea_config = NULL; + } + + if (kytea_mutex) { + grn_plugin_mutex_close(ctx, kytea_mutex); + kytea_mutex = NULL; + } +} + +struct grn_tokenizer_kytea { + grn_tokenizer_query *query; + kytea::KyteaSentence sentence; + std::vector tokens; + std::size_t id; + grn_tokenizer_token token; + const char *rest_query_string; + unsigned int rest_query_string_length; + + grn_tokenizer_kytea() : + query(NULL), + sentence(), + tokens(), + id(0), + token(), + rest_query_string(NULL) + { + } + ~grn_tokenizer_kytea() {} +}; + +void grn_tokenizer_kytea_init(grn_ctx *ctx, grn_tokenizer_kytea *tokenizer) { + new (tokenizer) grn_tokenizer_kytea; + grn_tokenizer_token_init(ctx, &tokenizer->token); +} + +void grn_tokenizer_kytea_fin(grn_ctx *ctx, grn_tokenizer_kytea *tokenizer) { + grn_tokenizer_token_fin(ctx, &tokenizer->token); + if (tokenizer->query) { + grn_tokenizer_query_close(ctx, tokenizer->query); + } + tokenizer->~grn_tokenizer_kytea(); +} + +grn_obj *grn_kytea_init(grn_ctx *ctx, int num_args, grn_obj **args, + grn_user_data *user_data) { + unsigned int normalizer_flags = 0; + grn_tokenizer_query * const query = + grn_tokenizer_query_open(ctx, num_args, args, normalizer_flags); + if (!query) { + return NULL; + } + + grn_tokenizer_kytea * const tokenizer = static_cast( + GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_kytea))); + if (!tokenizer) { + grn_tokenizer_query_close(ctx, query); + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[tokenizer][kytea] " + "memory allocation to grn_tokenizer_kytea failed"); + return NULL; + } + + try { + grn_tokenizer_kytea_init(ctx, 
tokenizer); + } catch (...) { + grn_tokenizer_query_close(ctx, query); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "tokenizer initialization failed"); + return NULL; + } + + tokenizer->query = query; + + grn_obj *normalized_query = query->normalized_query; + const char *normalized_string; + unsigned int normalized_string_length; + grn_string_get_normalized(ctx, + normalized_query, + &normalized_string, + &normalized_string_length, + NULL); + if (tokenizer->query->have_tokenized_delimiter) { + tokenizer->rest_query_string = normalized_string; + tokenizer->rest_query_string_length = normalized_string_length; + } else { + grn_plugin_mutex_lock(ctx, kytea_mutex); + try { + const std::string str(normalized_string, normalized_string_length); + const kytea::KyteaString &surface_str = kytea_util->mapString(str); + const kytea::KyteaString &normalized_str = kytea_util->normalize(surface_str); + tokenizer->sentence = kytea::KyteaSentence(surface_str, normalized_str); + kytea_tagger->calculateWS(tokenizer->sentence); + } catch (...) { + grn_plugin_mutex_unlock(ctx, kytea_mutex); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "tokenization failed"); + return NULL; + } + grn_plugin_mutex_unlock(ctx, kytea_mutex); + + try { + for (std::size_t i = 0; i < tokenizer->sentence.words.size(); ++i) { + const std::string &token = + kytea_util->showString(tokenizer->sentence.words[i].surface); + const char *ptr = token.c_str(); + unsigned int left = static_cast(token.length()); + while (left > 0) { + const int char_length = + grn_tokenizer_charlen(ctx, ptr, left, query->encoding); + if ((char_length == 0) || + (grn_tokenizer_isspace(ctx, ptr, left, query->encoding) != 0)) { + break; + } + ptr += char_length; + left -= char_length; + } + if (left == 0) { + tokenizer->tokens.push_back(token); + } + } + } catch (...) 
{ + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "adjustment failed"); + return NULL; + } + } + + user_data->ptr = tokenizer; + return NULL; +} + +grn_obj *grn_kytea_next(grn_ctx *ctx, int num_args, grn_obj **args, + grn_user_data *user_data) { + grn_tokenizer_kytea * const tokenizer = + static_cast(user_data->ptr); + + if (tokenizer->query->have_tokenized_delimiter) { + unsigned int rest_query_string_length = + tokenizer->rest_query_string_length; + const char *rest_query_string = + grn_tokenizer_tokenized_delimiter_next(ctx, + &(tokenizer->token), + tokenizer->rest_query_string, + rest_query_string_length, + tokenizer->query->encoding); + if (rest_query_string) { + tokenizer->rest_query_string_length -= + rest_query_string - tokenizer->rest_query_string; + } + tokenizer->rest_query_string = rest_query_string; + } else { + const grn_tokenizer_status status = + ((tokenizer->id + 1) < tokenizer->tokens.size()) ? + GRN_TOKENIZER_CONTINUE : GRN_TOKENIZER_LAST; + if (tokenizer->id < tokenizer->tokens.size()) { + const std::string &token = tokenizer->tokens[tokenizer->id++]; + grn_tokenizer_token_push(ctx, &tokenizer->token, + token.c_str(), token.length(), status); + } else { + grn_tokenizer_token_push(ctx, &tokenizer->token, "", 0, status); + } + } + + return NULL; +} + +grn_obj *grn_kytea_fin(grn_ctx *ctx, int num_args, grn_obj **args, + grn_user_data *user_data) { + grn_tokenizer_kytea * const tokenizer = + static_cast(user_data->ptr); + if (tokenizer) { + grn_tokenizer_kytea_fin(ctx, tokenizer); + GRN_PLUGIN_FREE(ctx, tokenizer); + } + return NULL; +} + +} // namespace + +extern "C" { + +/* + GRN_PLUGIN_INIT() is called to initialize this plugin. Note that an error + code must be set in `ctx->rc' on failure. + */ +grn_rc GRN_PLUGIN_INIT(grn_ctx *ctx) { + kytea_init(ctx); + return ctx->rc; +} + +/* + GRN_PLUGIN_REGISTER() registers this plugin to the database associated with + `ctx'. 
The registration requires the plugin name and the functions to be + called for tokenization. + */ +grn_rc GRN_PLUGIN_REGISTER(grn_ctx *ctx) { + return grn_tokenizer_register(ctx, "TokenKytea", 10, grn_kytea_init, + grn_kytea_next, grn_kytea_fin); +} + +/* + GRN_PLUGIN_FIN() is called to finalize the plugin that was initialized by + GRN_PLUGIN_INIT(). + */ +grn_rc GRN_PLUGIN_FIN(grn_ctx *ctx) { + kytea_fin(ctx); + return GRN_SUCCESS; +} + +} // extern "C" diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea_sources.am b/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea_sources.am new file mode 100644 index 00000000..182f3857 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea_sources.am @@ -0,0 +1,2 @@ +kytea_la_SOURCES = \ + kytea.cpp diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c new file mode 100644 index 00000000..cabf2c94 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c @@ -0,0 +1,660 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2009-2016 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA +*/ + +#ifdef GRN_EMBEDDED +# define GRN_PLUGIN_FUNCTION_TAG tokenizers_mecab +#endif + +#include + +#include +#include + +#include + +#include +#include +#include + +static unsigned int sole_mecab_init_counter = 0; +static mecab_t *sole_mecab = NULL; +static grn_plugin_mutex *sole_mecab_mutex = NULL; +static grn_encoding sole_mecab_encoding = GRN_ENC_NONE; + +static grn_bool grn_mecab_chunked_tokenize_enabled = GRN_FALSE; +static int grn_mecab_chunk_size_threshold = 8192; + +typedef struct { + mecab_t *mecab; + grn_obj buf; + const char *next; + const char *end; + grn_tokenizer_query *query; + grn_tokenizer_token token; +} grn_mecab_tokenizer; + +static const char * +mecab_global_error_message(void) +{ + double version; + + version = atof(mecab_version()); + /* MeCab <= 0.993 doesn't support mecab_strerror(NULL). 
*/ + if (version <= 0.993) { + return "Unknown"; + } + + return mecab_strerror(NULL); +} + + +static grn_encoding +translate_mecab_charset_to_grn_encoding(const char *charset) +{ + if (grn_strcasecmp(charset, "euc-jp") == 0) { + return GRN_ENC_EUC_JP; + } else if (grn_strcasecmp(charset, "utf-8") == 0 || + grn_strcasecmp(charset, "utf8") == 0) { + return GRN_ENC_UTF8; + } else if (grn_strcasecmp(charset, "shift_jis") == 0 || + grn_strcasecmp(charset, "shift-jis") == 0 || + grn_strcasecmp(charset, "sjis") == 0) { + return GRN_ENC_SJIS; + } + return GRN_ENC_NONE; +} + +static grn_encoding +get_mecab_encoding(mecab_t *mecab) +{ + grn_encoding encoding = GRN_ENC_NONE; + const mecab_dictionary_info_t *dictionary_info; + dictionary_info = mecab_dictionary_info(mecab); + if (dictionary_info) { + const char *charset = dictionary_info->charset; + encoding = translate_mecab_charset_to_grn_encoding(charset); + } + return encoding; +} + +static inline grn_bool +is_delimiter_character(grn_ctx *ctx, const char *character, int character_bytes) +{ + switch (character_bytes) { + case 1 : + switch (character[0]) { + case ',' : + case '.' : + case '!' : + case '?' 
: + return GRN_TRUE; + default : + return GRN_FALSE; + } + case 3 : + switch ((unsigned char)(character[0])) { + case 0xE3 : + switch ((unsigned char)(character[1])) { + case 0x80 : + switch ((unsigned char)(character[2])) { + case 0x81 : /* U+3001 (0xE3 0x80 0x81 in UTF-8) IDEOGRAPHIC COMMA */ + case 0x82 : /* U+3002 (0xE3 0x80 0x82 in UTF-8) IDEOGRAPHIC FULL STOP */ + return GRN_TRUE; + default : + return GRN_FALSE; + } + default : + return GRN_FALSE; + } + return GRN_FALSE; + case 0xEF : + switch ((unsigned char)(character[1])) { + case 0xBC : + switch ((unsigned char)(character[2])) { + case 0x81 : + /* U+FF01 (0xEF 0xBC 0x81 in UTF-8) FULLWIDTH EXCLAMATION MARK */ + case 0x9F : + /* U+FF1F (0xEF 0xBC 0x9F in UTF-8) FULLWIDTH QUESTION MARK */ + return GRN_TRUE; + default : + return GRN_FALSE; + } + default : + return GRN_FALSE; + } + return GRN_FALSE; + default : + return GRN_FALSE; + } + default : + return GRN_FALSE; + } +} + +static grn_bool +chunked_tokenize_utf8_chunk(grn_ctx *ctx, + grn_mecab_tokenizer *tokenizer, + const char *chunk, + unsigned int chunk_bytes) +{ + const char *tokenized_chunk; + size_t tokenized_chunk_length; + + tokenized_chunk = mecab_sparse_tostr2(tokenizer->mecab, chunk, chunk_bytes); + if (!tokenized_chunk) { + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][mecab][chunk] " + "mecab_sparse_tostr2() failed len=%d err=%s", + chunk_bytes, + mecab_strerror(tokenizer->mecab)); + return GRN_FALSE; + } + + if (GRN_TEXT_LEN(&(tokenizer->buf)) > 0) { + GRN_TEXT_PUTS(ctx, &(tokenizer->buf), " "); + } + + tokenized_chunk_length = strlen(tokenized_chunk); + if (tokenized_chunk_length >= 1 && + isspace((unsigned char)tokenized_chunk[tokenized_chunk_length - 1])) { + GRN_TEXT_PUT(ctx, &(tokenizer->buf), + tokenized_chunk, tokenized_chunk_length - 1); + } else { + GRN_TEXT_PUT(ctx, &(tokenizer->buf), + tokenized_chunk, tokenized_chunk_length); + } + + return GRN_TRUE; +} + +static grn_bool +chunked_tokenize_utf8(grn_ctx *ctx, + 
grn_mecab_tokenizer *tokenizer, + const char *string, + unsigned int string_bytes) +{ + const char *chunk_start; + const char *current; + const char *last_delimiter; + const char *string_end = string + string_bytes; + grn_encoding encoding = tokenizer->query->encoding; + + if (string_bytes < grn_mecab_chunk_size_threshold) { + return chunked_tokenize_utf8_chunk(ctx, + tokenizer, + string, + string_bytes); + } + + chunk_start = current = string; + last_delimiter = NULL; + while (current < string_end) { + int space_bytes; + int character_bytes; + const char *current_character; + + space_bytes = grn_isspace(current, encoding); + if (space_bytes > 0) { + if (chunk_start != current) { + grn_bool succeeded; + succeeded = chunked_tokenize_utf8_chunk(ctx, + tokenizer, + chunk_start, + current - chunk_start); + if (!succeeded) { + return succeeded; + } + } + current += space_bytes; + chunk_start = current; + last_delimiter = NULL; + continue; + } + + character_bytes = grn_charlen_(ctx, current, string_end, encoding); + if (character_bytes == 0) { + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][mecab][chunk] " + "invalid byte sequence: position=%d", + (int)(current - string)); + return GRN_FALSE; + } + + current_character = current; + current += character_bytes; + if (is_delimiter_character(ctx, current_character, character_bytes)) { + last_delimiter = current; + } + + if ((current - chunk_start) >= grn_mecab_chunk_size_threshold) { + grn_bool succeeded; + if (last_delimiter) { + succeeded = chunked_tokenize_utf8_chunk(ctx, + tokenizer, + chunk_start, + last_delimiter - chunk_start); + chunk_start = last_delimiter; + } else { + succeeded = chunked_tokenize_utf8_chunk(ctx, + tokenizer, + chunk_start, + current - chunk_start); + chunk_start = current; + } + if (!succeeded) { + return succeeded; + } + last_delimiter = NULL; + } + } + + if (current == chunk_start) { + return GRN_TRUE; + } else { + return chunked_tokenize_utf8_chunk(ctx, + tokenizer, + chunk_start, + 
current - chunk_start); + } +} + +static mecab_t * +mecab_create(grn_ctx *ctx) +{ + mecab_t *mecab; + int argc = 0; + const char *argv[4]; + + argv[argc++] = "Groonga"; + argv[argc++] = "-Owakati"; +#ifdef GRN_WITH_BUNDLED_MECAB + argv[argc++] = "--rcfile"; +# ifdef WIN32 + { + static char windows_mecab_rc_file[PATH_MAX]; + + grn_strcpy(windows_mecab_rc_file, + PATH_MAX, + grn_plugin_windows_base_dir()); + grn_strcat(windows_mecab_rc_file, + PATH_MAX, + "/"); + grn_strcat(windows_mecab_rc_file, + PATH_MAX, + GRN_BUNDLED_MECAB_RELATIVE_RC_PATH); + { + char *c; + for (c = windows_mecab_rc_file; *c != '\0'; c++) { + if (*c == '/') { + *c = '\\'; + } + } + } + argv[argc++] = windows_mecab_rc_file; + } +# else /* WIN32 */ + argv[argc++] = GRN_BUNDLED_MECAB_RC_PATH; +# endif /* WIN32 */ +#endif /* GRN_WITH_BUNDLED_MECAB */ + mecab = mecab_new(argc, (char **)argv); + + if (!mecab) { +#ifdef GRN_WITH_BUNDLED_MECAB + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][mecab] failed to create mecab_t: %s: " + "mecab_new(\"%s\", \"%s\", \"%s\", \"%s\")", + mecab_global_error_message(), + argv[0], argv[1], argv[2], argv[3]); +#else /* GRN_WITH_BUNDLED_MECAB */ + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][mecab] failed to create mecab_t: %s: " + "mecab_new(\"%s\", \"%s\")", + mecab_global_error_message(), + argv[0], argv[1]); +#endif /* GRN_WITH_BUNDLED_MECAB */ + } + + return mecab; +} + +/* + This function is called for a full text search query or a document to be + indexed. This means that both short/long strings are given. + The return value of this function is ignored. When an error occurs in this + function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS). 
+ */ +static grn_obj * +mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +{ + grn_mecab_tokenizer *tokenizer; + unsigned int normalizer_flags = 0; + grn_tokenizer_query *query; + grn_obj *normalized_query; + const char *normalized_string; + unsigned int normalized_string_length; + + query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags); + if (!query) { + return NULL; + } + if (!sole_mecab) { + grn_plugin_mutex_lock(ctx, sole_mecab_mutex); + if (!sole_mecab) { + sole_mecab = mecab_create(ctx); + if (sole_mecab) { + sole_mecab_encoding = get_mecab_encoding(sole_mecab); + } + } + grn_plugin_mutex_unlock(ctx, sole_mecab_mutex); + } + if (!sole_mecab) { + grn_tokenizer_query_close(ctx, query); + return NULL; + } + + if (query->encoding != sole_mecab_encoding) { + grn_tokenizer_query_close(ctx, query); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][mecab] " + "MeCab dictionary charset (%s) does not match " + "the table encoding: <%s>", + grn_encoding_to_string(sole_mecab_encoding), + grn_encoding_to_string(query->encoding)); + return NULL; + } + + if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) { + grn_tokenizer_query_close(ctx, query); + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[tokenizer][mecab] " + "memory allocation to grn_mecab_tokenizer failed"); + return NULL; + } + tokenizer->mecab = sole_mecab; + tokenizer->query = query; + + normalized_query = query->normalized_query; + grn_string_get_normalized(ctx, + normalized_query, + &normalized_string, + &normalized_string_length, + NULL); + GRN_TEXT_INIT(&(tokenizer->buf), 0); + if (query->have_tokenized_delimiter) { + tokenizer->next = normalized_string; + tokenizer->end = tokenizer->next + normalized_string_length; + } else if (normalized_string_length == 0) { + tokenizer->next = ""; + tokenizer->end = tokenizer->next; + } else { + grn_bool succeeded; + grn_plugin_mutex_lock(ctx, sole_mecab_mutex); + if 
(grn_mecab_chunked_tokenize_enabled && + ctx->encoding == GRN_ENC_UTF8) { + succeeded = chunked_tokenize_utf8(ctx, + tokenizer, + normalized_string, + normalized_string_length); + } else { + const char *s; + s = mecab_sparse_tostr2(tokenizer->mecab, + normalized_string, + normalized_string_length); + if (!s) { + succeeded = GRN_FALSE; + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][mecab] " + "mecab_sparse_tostr() failed len=%d err=%s", + normalized_string_length, + mecab_strerror(tokenizer->mecab)); + } else { + succeeded = GRN_TRUE; + GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s); + } + } + grn_plugin_mutex_unlock(ctx, sole_mecab_mutex); + if (!succeeded) { + grn_tokenizer_query_close(ctx, tokenizer->query); + GRN_PLUGIN_FREE(ctx, tokenizer); + return NULL; + } + { + char *buf, *p; + unsigned int bufsize; + + buf = GRN_TEXT_VALUE(&(tokenizer->buf)); + bufsize = GRN_TEXT_LEN(&(tokenizer->buf)); + /* A certain version of mecab returns trailing lf or spaces. */ + for (p = buf + bufsize - 2; + buf <= p && isspace(*(unsigned char *)p); + p--) { *p = '\0'; } + tokenizer->next = buf; + tokenizer->end = p + 1; + } + } + user_data->ptr = tokenizer; + + grn_tokenizer_token_init(ctx, &(tokenizer->token)); + + return NULL; +} + +/* + This function returns tokens one by one. 
+ */ +static grn_obj * +mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +{ + /* grn_obj *table = args[0]; */ + grn_mecab_tokenizer *tokenizer = user_data->ptr; + grn_encoding encoding = tokenizer->query->encoding; + + if (tokenizer->query->have_tokenized_delimiter) { + tokenizer->next = + grn_tokenizer_tokenized_delimiter_next(ctx, + &(tokenizer->token), + tokenizer->next, + tokenizer->end - tokenizer->next, + encoding); + } else { + size_t cl; + const char *p = tokenizer->next, *r; + const char *e = tokenizer->end; + grn_tokenizer_status status; + + for (r = p; r < e; r += cl) { + int space_len; + + space_len = grn_isspace(r, encoding); + if (space_len > 0 && r == p) { + cl = space_len; + p = r + cl; + continue; + } + + if (!(cl = grn_charlen_(ctx, r, e, encoding))) { + tokenizer->next = e; + break; + } + + if (space_len > 0) { + const char *q = r + space_len; + while (q < e && (space_len = grn_isspace(q, encoding))) { + q += space_len; + } + tokenizer->next = q; + break; + } + } + + if (r == e || tokenizer->next == e) { + status = GRN_TOKENIZER_LAST; + } else { + status = GRN_TOKENIZER_CONTINUE; + } + grn_tokenizer_token_push(ctx, &(tokenizer->token), p, r - p, status); + } + + return NULL; +} + +/* + This function finalizes a tokenization. 
+ */ +static grn_obj * +mecab_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +{ + grn_mecab_tokenizer *tokenizer = user_data->ptr; + if (!tokenizer) { + return NULL; + } + grn_tokenizer_token_fin(ctx, &(tokenizer->token)); + grn_tokenizer_query_close(ctx, tokenizer->query); + grn_obj_unlink(ctx, &(tokenizer->buf)); + GRN_PLUGIN_FREE(ctx, tokenizer); + return NULL; +} + +static void +check_mecab_dictionary_encoding(grn_ctx *ctx) +{ +#ifdef HAVE_MECAB_DICTIONARY_INFO_T + mecab_t *mecab; + grn_encoding encoding; + grn_bool have_same_encoding_dictionary; + + mecab = mecab_create(ctx); + if (!mecab) { + return; + } + + encoding = GRN_CTX_GET_ENCODING(ctx); + have_same_encoding_dictionary = (encoding == get_mecab_encoding(mecab)); + mecab_destroy(mecab); + + if (!have_same_encoding_dictionary) { + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][mecab] " + "MeCab has no dictionary that uses the context encoding" + ": <%s>", + grn_encoding_to_string(encoding)); + } +#endif +} + +/* + This function initializes a plugin. This function fails if there is no + dictionary that uses the context encoding of groonga. 
+ */ +grn_rc +GRN_PLUGIN_INIT(grn_ctx *ctx) +{ + ++sole_mecab_init_counter; + if (sole_mecab_init_counter > 1) + { + return GRN_SUCCESS; + } + { + char env[GRN_ENV_BUFFER_SIZE]; + + grn_getenv("GRN_MECAB_CHUNKED_TOKENIZE_ENABLED", + env, + GRN_ENV_BUFFER_SIZE); + grn_mecab_chunked_tokenize_enabled = (env[0] && strcmp(env, "yes") == 0); + } + + { + char env[GRN_ENV_BUFFER_SIZE]; + + grn_getenv("GRN_MECAB_CHUNK_SIZE_THRESHOLD", + env, + GRN_ENV_BUFFER_SIZE); + if (env[0]) { + int threshold = -1; + const char *end; + const char *rest; + + end = env + strlen(env); + threshold = grn_atoi(env, end, &rest); + if (end > env && end == rest) { + grn_mecab_chunk_size_threshold = threshold; + } + } + } + + sole_mecab = NULL; + sole_mecab_mutex = grn_plugin_mutex_open(ctx); + if (!sole_mecab_mutex) { + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[tokenizer][mecab] grn_plugin_mutex_open() failed"); + return ctx->rc; + } + + check_mecab_dictionary_encoding(ctx); + if (ctx->rc != GRN_SUCCESS) { + grn_plugin_mutex_close(ctx, sole_mecab_mutex); + sole_mecab_mutex = NULL; + } + + return ctx->rc; +} + +/* + This function registers a plugin to a database. + */ +grn_rc +GRN_PLUGIN_REGISTER(grn_ctx *ctx) +{ + grn_rc rc; + + rc = grn_tokenizer_register(ctx, "TokenMecab", 10, + mecab_init, mecab_next, mecab_fin); + if (rc == GRN_SUCCESS) { + grn_obj *token_mecab; + token_mecab = grn_ctx_get(ctx, "TokenMecab", 10); + /* Just for backward compatibility. TokenMecab was built-in not plugin. */ + if (token_mecab && grn_obj_id(ctx, token_mecab) != GRN_DB_MECAB) { + rc = GRN_FILE_CORRUPT; + } + } + + return rc; +} + +/* + This function finalizes a plugin. 
+ */ +grn_rc +GRN_PLUGIN_FIN(grn_ctx *ctx) +{ + --sole_mecab_init_counter; + if (sole_mecab_init_counter > 0) + { + return GRN_SUCCESS; + } + if (sole_mecab) { + mecab_destroy(sole_mecab); + sole_mecab = NULL; + } + if (sole_mecab_mutex) { + grn_plugin_mutex_close(ctx, sole_mecab_mutex); + sole_mecab_mutex = NULL; + } + + return GRN_SUCCESS; +} diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab_sources.am b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab_sources.am new file mode 100644 index 00000000..56912727 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab_sources.am @@ -0,0 +1,2 @@ +mecab_la_SOURCES = \ + mecab.c -- cgit v1.2.3