diff options
Diffstat (limited to '')
4 files changed, 374 insertions, 0 deletions
diff --git a/storage/mroonga/vendor/groonga/plugins/query_expanders/CMakeLists.txt b/storage/mroonga/vendor/groonga/plugins/query_expanders/CMakeLists.txt
new file mode 100644
index 00000000..c2f04cb8
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/plugins/query_expanders/CMakeLists.txt
@@ -0,0 +1,43 @@
+# Copyright(C) 2012-2013 Brazil
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License version 2.1 as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+# Builds the TSV query expander plugin from the sources listed in
+# tsv_sources.am: a static library when GRN_EMBED is set, otherwise a
+# loadable module named "tsv" installed under QUERY_EXPANDERS_DIR.
+
+set(QUERY_EXPANDERS_DIR "${GRN_RELATIVE_PLUGINS_DIR}/query_expanders")
+read_file_list("${CMAKE_CURRENT_SOURCE_DIR}/tsv_sources.am" TSV_SOURCES)
+set_source_files_properties(${TSV_SOURCES}
+  PROPERTIES
+  COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}")
+if(GRN_EMBED)
+  add_library(tsv_query_expander STATIC ${TSV_SOURCES})
+  set_target_properties(
+    tsv_query_expander
+    PROPERTIES
+    POSITION_INDEPENDENT_CODE ON)
+else()
+  add_library(tsv_query_expander MODULE ${TSV_SOURCES})
+  set_target_properties(tsv_query_expander PROPERTIES
+    PREFIX ""
+    OUTPUT_NAME "tsv")
+  install(TARGETS tsv_query_expander DESTINATION "${QUERY_EXPANDERS_DIR}")
+endif()
+# Target-scoped (not directory-scoped) so the include path and the link
+# dependency apply to this plugin only.
+target_include_directories(tsv_query_expander
+  PRIVATE
+  "${CMAKE_CURRENT_SOURCE_DIR}/../../lib")
+target_link_libraries(tsv_query_expander PRIVATE libgroonga)
diff --git a/storage/mroonga/vendor/groonga/plugins/query_expanders/Makefile.am b/storage/mroonga/vendor/groonga/plugins/query_expanders/Makefile.am
new file mode 100644
index 00000000..96c0911a
--- /dev/null
+++ 
b/storage/mroonga/vendor/groonga/plugins/query_expanders/Makefile.am
@@ -0,0 +1,18 @@
+# Automake rules for the TSV query expander plugin (tsv.la).
+
+# Distribute the CMake build description alongside the Automake one.
+EXTRA_DIST = CMakeLists.txt
+
+# Header search path: build tree, then include/ and lib/ of the source tree.
+AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/include -I$(top_srcdir)/lib
+
+# Libtool link flags applied to every library built in this directory.
+AM_LDFLAGS = -avoid-version -module -no-undefined
+
+# Link against the libgroonga built in this tree.
+LIBS = $(top_builddir)/lib/libgroonga.la
+
+query_expander_plugins_LTLIBRARIES = tsv.la
+
+# tsv_la_SOURCES comes from tsv_sources.am.
+include tsv_sources.am
diff --git a/storage/mroonga/vendor/groonga/plugins/query_expanders/tsv.c b/storage/mroonga/vendor/groonga/plugins/query_expanders/tsv.c
new file mode 100644
index 00000000..5d5deec6
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/plugins/query_expanders/tsv.c
@@ -0,0 +1,314 @@
+/* -*- c-basic-offset: 2 -*- */
+/* Copyright(C) 2012-2015 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+*/
+
+#ifdef GRN_EMBEDDED
+#  define GRN_PLUGIN_FUNCTION_TAG query_expanders_tsv
+#endif
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#include <groonga/plugin.h>
+
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef WIN32
+#  include <windows.h>
+#  include <share.h>
+#endif /* WIN32 */
+
+/* Value size given to grn_hash_create() for the synonyms table. Each stored
+   expansion, including its terminating NUL, is truncated to this many
+   bytes. */
+#define MAX_SYNONYM_BYTES 4096
+
+/* Term -> NUL-terminated expansion. Created by GRN_PLUGIN_INIT and closed
+   by GRN_PLUGIN_FIN. */
+static grn_hash *synonyms = NULL;
+
+#ifdef WIN32
+/* Cache for the path built by get_system_synonyms_file(); empty until the
+   first call. */
+static char win32_synonyms_file[MAX_PATH] = "";
+
+/* Returns the default synonyms file path on Windows:
+   <grn_plugin_windows_base_dir()>/<GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE>.
+   The path is built on the first call and reused afterwards. */
+const char *
+get_system_synonyms_file(void)
+{
+  if (win32_synonyms_file[0] == '\0') {
+    const char *base_dir;
+    const char *relative_path = GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE;
+
+    base_dir = grn_plugin_windows_base_dir();
+    grn_strcpy(win32_synonyms_file, MAX_PATH, base_dir);
+    grn_strcat(win32_synonyms_file, MAX_PATH, "/");
+    grn_strcat(win32_synonyms_file, MAX_PATH, relative_path);
+  }
+  return win32_synonyms_file;
+}
+
+#else /* WIN32 */
+/* Returns the default synonyms file path chosen at build time. */
+const char *
+get_system_synonyms_file(void)
+{
+  return GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE;
+}
+#endif /* WIN32 */
+
+/* Returns true when `character` is '#', the comment marker of the
+   synonyms file. */
+static grn_bool
+is_comment_mark(char character)
+{
+  return character == '#';
+}
+
+/* Searches `line` (not necessarily NUL-terminated, `line_length` bytes) for
+   the keyword "coding: " and maps the name that follows it to an encoding.
+
+   Returns ctx->encoding when the keyword is absent and GRN_ENC_NONE when
+   the keyword is present but the name matches none of the known spellings.
+   Names are compared with grn_strncasecmp() over the length of each known
+   spelling, so trailing text after the name is ignored. */
+static grn_encoding
+detect_coding_part(grn_ctx *ctx, const char *line, size_t line_length)
+{
+  grn_encoding encoding = GRN_ENC_NONE;
+  grn_obj null_terminated_line_buffer;
+  const char *c_line;
+  const char *coding_part_keyword = "coding: ";
+  const char *coding_part;
+  const char *encoding_name;
+
+  /* strstr() needs a NUL-terminated string, so work on a terminated copy. */
+  GRN_TEXT_INIT(&null_terminated_line_buffer, 0);
+  GRN_TEXT_PUT(ctx, &null_terminated_line_buffer, line, line_length);
+  GRN_TEXT_PUTC(ctx, &null_terminated_line_buffer, '\0');
+
+  c_line = GRN_TEXT_VALUE(&null_terminated_line_buffer);
+  coding_part = strstr(c_line, coding_part_keyword);
+  if (coding_part) {
+    encoding_name = coding_part + strlen(coding_part_keyword);
+    if (grn_strncasecmp(encoding_name, "utf-8", strlen("utf-8")) == 0 ||
+        grn_strncasecmp(encoding_name, "utf8", strlen("utf8")) == 0) {
+      encoding = GRN_ENC_UTF8;
+    } else if (grn_strncasecmp(encoding_name, "sjis", strlen("sjis")) == 0 ||
+               grn_strncasecmp(encoding_name, "Shift_JIS", strlen("Shift_JIS")) == 0) {
+      encoding = GRN_ENC_SJIS;
+    } else if (grn_strncasecmp(encoding_name, "EUC-JP", strlen("EUC-JP")) == 0 ||
+               grn_strncasecmp(encoding_name, "euc_jp", strlen("euc_jp")) == 0) {
+      encoding = GRN_ENC_EUC_JP;
+    } else if (grn_strncasecmp(encoding_name, "latin1", strlen("latin1")) == 0) {
+      encoding = GRN_ENC_LATIN1;
+    } else if (grn_strncasecmp(encoding_name, "KOI8-R", strlen("KOI8-R")) == 0 ||
+               grn_strncasecmp(encoding_name, "koi8r", strlen("koi8r")) == 0) {
+      encoding = GRN_ENC_KOI8R;
+    }
+  } else {
+    encoding = ctx->encoding;
+  }
+  GRN_OBJ_FIN(ctx, &null_terminated_line_buffer);
+
+  return encoding;
+}
+
+/* Guesses the encoding of the synonyms file from its first line.
+
+   - If the line starts with the UTF-8 byte order mark, `*line` and
+     `*line_length` are advanced past it and GRN_ENC_UTF8 is returned.
+   - If the line is a comment ('#' first), the rest of the line is handed to
+     detect_coding_part().
+   - Otherwise (including an empty line) ctx->encoding is returned. */
+static grn_encoding
+guess_encoding(grn_ctx *ctx, const char **line, size_t *line_length)
+{
+  /* unsigned char: 0xef/0xbb/0xbf do not fit a signed char. memcmp()
+     compares bytes as unsigned char anyway. */
+  const unsigned char bom[] = {0xef, 0xbb, 0xbf};
+  size_t bom_length = sizeof(bom);
+
+  if (*line_length >= bom_length && memcmp(*line, bom, bom_length) == 0) {
+    *line += bom_length;
+    *line_length -= bom_length;
+    return GRN_ENC_UTF8;
+  }
+
+  /* Check the length first: an empty line has no first byte to inspect, and
+     `*line_length - 1` below would wrap around for a size_t of 0. */
+  if (*line_length == 0 || !is_comment_mark((*line)[0])) {
+    return ctx->encoding;
+  }
+
+  return detect_coding_part(ctx, (*line) + 1, (*line_length) - 1);
+}
+
+/* Parses one line of the synonyms file and registers it in `synonyms`.
+
+   The text before the first TAB is the key; every following TAB-separated
+   field is a synonym. The stored value has the form
+   "((syn1) OR (syn2) OR ...)". Empty lines, comment lines ('#' first) and
+   lines without any synonym field are ignored.
+
+   `key` and `value` are caller-owned scratch buffers that must be empty on
+   entry; they are filled here. The value is cut to MAX_SYNONYM_BYTES - 1
+   bytes plus a terminating NUL before being copied into the hash. */
+static void
+parse_synonyms_file_line(grn_ctx *ctx, const char *line, size_t line_length,
+                         grn_obj *key, grn_obj *value)
+{
+  size_t i = 0;
+
+  /* Length check first so that line[0] is only read when it exists. */
+  if (line_length == 0 || is_comment_mark(line[0])) {
+    return;
+  }
+
+  /* Key: everything up to (not including) the first TAB. */
+  while (i < line_length) {
+    char character = line[i];
+    i++;
+    if (character == '\t') {
+      break;
+    }
+    GRN_TEXT_PUTC(ctx, key, character);
+  }
+
+  /* Nothing after the key: no synonyms to register. */
+  if (i == line_length) {
+    return;
+  }
+
+  /* Value: each TAB between synonyms becomes ") OR (". */
+  GRN_TEXT_PUTS(ctx, value, "((");
+  while (i < line_length) {
+    char character = line[i];
+    i++;
+    if (character == '\t') {
+      GRN_TEXT_PUTS(ctx, value, ") OR (");
+    } else {
+      GRN_TEXT_PUTC(ctx, value, character);
+    }
+  }
+  GRN_TEXT_PUTS(ctx, value, "))");
+
+  {
+    grn_id id;
+    void *value_location = NULL;
+
+    id = grn_hash_add(ctx, synonyms, GRN_TEXT_VALUE(key), GRN_TEXT_LEN(key),
+                      &value_location, NULL);
+    if (id == GRN_ID_NIL) {
+      GRN_PLUGIN_LOG(ctx, GRN_LOG_WARNING,
+                     "[plugin][query-expander][tsv] "
+                     "failed to register key: <%.*s>",
+                     (int)GRN_TEXT_LEN(key), GRN_TEXT_VALUE(key));
+      return;
+    }
+
+    /* Keep room for the terminating NUL inside the fixed-size value slot. */
+    if (GRN_TEXT_LEN(value) > MAX_SYNONYM_BYTES - 1) {
+      grn_bulk_truncate(ctx, value, MAX_SYNONYM_BYTES - 1);
+    }
+    GRN_TEXT_PUTC(ctx, value, '\0');
+    grn_memcpy(value_location, GRN_TEXT_VALUE(value), GRN_TEXT_LEN(value));
+  }
+}
+
+/* Loads the synonyms file into `synonyms`.
+
+   The path is taken from the GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE
+   environment variable when it is set to a non-empty value, otherwise from
+   get_system_synonyms_file(). If the file cannot be opened a warning is
+   logged and nothing is loaded. A trailing "\n" or "\r\n" is stripped from
+   each line before it is parsed. */
+static void
+load_synonyms(grn_ctx *ctx)
+{
+  static char path_env[GRN_ENV_BUFFER_SIZE];
+  const char *path;
+  grn_file_reader *file_reader;
+  int number_of_lines;
+  grn_obj line, key, value;
+
+  grn_getenv("GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE",
+             path_env,
+             GRN_ENV_BUFFER_SIZE);
+  if (path_env[0]) {
+    path = path_env;
+  } else {
+    path = get_system_synonyms_file();
+  }
+  file_reader = grn_file_reader_open(ctx, path);
+  if (!file_reader) {
+    GRN_LOG(ctx, GRN_LOG_WARNING,
+            "[plugin][query-expander][tsv] "
+            "synonyms file doesn't exist: <%s>",
+            path);
+    return;
+  }
+
+  GRN_TEXT_INIT(&line, 0);
+  GRN_TEXT_INIT(&key, 0);
+  GRN_TEXT_INIT(&value, 0);
+  grn_bulk_reserve(ctx, &value, MAX_SYNONYM_BYTES);
+  number_of_lines = 0;
+  while (grn_file_reader_read_line(ctx, file_reader, &line) == GRN_SUCCESS) {
+    const char *line_value = GRN_TEXT_VALUE(&line);
+    size_t line_length = GRN_TEXT_LEN(&line);
+
+    if (line_length > 0 && line_value[line_length - 1] == '\n') {
+      if (line_length > 1 && line_value[line_length - 2] == '\r') {
+        line_length -= 2;
+      } else {
+        line_length -= 1;
+      }
+    }
+    number_of_lines++;
+    if (number_of_lines == 1) {
+      /* Called for its side effect of skipping a leading UTF-8 BOM.
+         NOTE(review): the detected encoding is not applied anywhere in
+         this file, so the result is deliberately discarded. */
+      (void)guess_encoding(ctx, &line_value, &line_length);
+    }
+    GRN_BULK_REWIND(&key);
+    GRN_BULK_REWIND(&value);
+    parse_synonyms_file_line(ctx, line_value, line_length, &key, &value);
+    GRN_BULK_REWIND(&line);
+  }
+  GRN_OBJ_FIN(ctx, &line);
+  GRN_OBJ_FIN(ctx, &key);
+  GRN_OBJ_FIN(ctx, &value);
+
+  grn_file_reader_close(ctx, file_reader);
+}
+
+/* Body of the "QueryExpanderTSV" proc.
+
+   args[0] is the term to expand and args[1] is the buffer the expansion is
+   appended to. Returns an Int32 object holding GRN_SUCCESS when the term
+   was found in `synonyms`, and GRN_END_OF_DATA otherwise (term not found,
+   fewer than two arguments, or no synonyms table). The return value is NULL
+   if the result object cannot be allocated. */
+static grn_obj *
+func_query_expander_tsv(grn_ctx *ctx, int nargs, grn_obj **args,
+                        grn_user_data *user_data)
+{
+  grn_rc rc = GRN_END_OF_DATA;
+  grn_obj *rc_object;
+
+  /* Do not touch args[] or the table unless both are really there. */
+  if (nargs >= 2 && synonyms) {
+    grn_obj *term = args[0];
+    grn_obj *expanded_term = args[1];
+    void *value;
+    grn_id id;
+
+    id = grn_hash_get(ctx, synonyms,
+                      GRN_TEXT_VALUE(term), GRN_TEXT_LEN(term),
+                      &value);
+    if (id != GRN_ID_NIL) {
+      /* Stored values are NUL-terminated by parse_synonyms_file_line(). */
+      const char *query = value;
+      GRN_TEXT_PUTS(ctx, expanded_term, query);
+      rc = GRN_SUCCESS;
+    }
+  }
+
+  rc_object = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_INT32, 0);
+  if (rc_object) {
+    GRN_INT32_SET(ctx, rc_object, rc);
+  }
+
+  return rc_object;
+}
+
+/* Plugin initialization: creates the synonyms table on first use and fills
+   it from the synonyms file. Returns ctx->rc. */
+grn_rc
+GRN_PLUGIN_INIT(grn_ctx *ctx)
+{
+  if (!synonyms) {
+    synonyms = grn_hash_create(ctx, NULL,
+                               GRN_TABLE_MAX_KEY_SIZE,
+                               MAX_SYNONYM_BYTES,
+                               GRN_OBJ_TABLE_HASH_KEY | GRN_OBJ_KEY_VAR_SIZE);
+    if (!synonyms) {
+      return ctx->rc;
+    }
+    load_synonyms(ctx);
+  }
+  return ctx->rc;
+}
+
+/* Plugin registration: creates the "QueryExpanderTSV" function proc.
+   Always returns GRN_SUCCESS. */
+grn_rc
+GRN_PLUGIN_REGISTER(grn_ctx *ctx)
+{
+  grn_proc_create(ctx, "QueryExpanderTSV", strlen("QueryExpanderTSV"),
+                  GRN_PROC_FUNCTION,
+                  func_query_expander_tsv, NULL, NULL,
+                  0, NULL);
+  return GRN_SUCCESS;
+}
+
+/* Plugin finalization: closes the synonyms table if it exists. */
+grn_rc
+GRN_PLUGIN_FIN(grn_ctx *ctx)
+{
+  if (synonyms) {
+    grn_hash_close(ctx, synonyms);
+    synonyms = NULL;
+  }
+  return GRN_SUCCESS;
+}
diff --git a/storage/mroonga/vendor/groonga/plugins/query_expanders/tsv_sources.am b/storage/mroonga/vendor/groonga/plugins/query_expanders/tsv_sources.am
new file mode 100644
index 00000000..f1bdabed
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/plugins/query_expanders/tsv_sources.am
@@ -0,0 +1,2 @@
+tsv_la_SOURCES = \
+	tsv.c