diff options
Diffstat (limited to 'storage/mroonga/vendor/groonga/plugins')
58 files changed, 7970 insertions, 0 deletions
# Plugin sub-projects that are always configured, in the order they are
# added to the build.
foreach(_grn_plugin_subdirectory
    suggest
    tokenizers
    query_expanders
    ruby
    token_filters
    sharding
    functions)
  add_subdirectory(${_grn_plugin_subdirectory})
endforeach()

# The Ruby scripts listed in ruby_scripts.am are installed only for a
# non-embedded build that has mruby enabled.
if((NOT GRN_EMBED) AND GRN_WITH_MRUBY)
  read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/ruby_scripts.am RUBY_SCRIPTS)
  install(FILES ${RUBY_SCRIPTS}
    DESTINATION "${GRN_RELATIVE_PLUGINS_DIR}")
endif()
# Installs the expression rewriter scripts listed in sources.am.
# Nothing is installed for an embedded build or a build without mruby.
if(NOT GRN_EMBED)
  if(GRN_WITH_MRUBY)
    set(GRN_RELATIVE_EXPRESSION_REWRITER_PLUGINS_DIR
      "${GRN_RELATIVE_PLUGINS_DIR}/expression_rewriters")

    read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/sources.am
      EXPRESSION_REWRITERS)
    # The destination must name the variable set above. The previous
    # spelling (GRN_RELATIVE_SEXPRESSION_REWRITER_PLUGINS_DIR) is not set
    # anywhere in this file, so it expanded to an empty destination.
    install(FILES ${EXPRESSION_REWRITERS}
      DESTINATION "${GRN_RELATIVE_EXPRESSION_REWRITER_PLUGINS_DIR}")
  endif()
endif()
module Groonga
  module ExpressionRewriters
    # Expression rewriter registered under the name "optimizer".
    #
    # It rebuilds the expression tree of the target expression and:
    #   * normalizes "constant OP variable" comparisons to
    #     "variable OP' constant",
    #   * sorts the operands of AND by their estimated size, and
    #   * merges one lower-bound and one upper-bound comparison against
    #     the same column into a single between() call.
    class Optimizer < ExpressionRewriter
      register "optimizer"

      # Builds and returns a new expression created from the optimized
      # tree of @expression.
      def rewrite
        builder = ExpressionTreeBuilder.new(@expression)
        root_node = builder.build

        variable = @expression[0]
        table = context[variable.domain]
        optimized_root_node = optimize_node(table, root_node)

        rewritten = Expression.create(table)
        optimized_root_node.build(rewritten)
        rewritten
      end

      private
      # Returns the optimized version of node. Nodes that are neither a
      # logical operation nor a binary operation are returned unchanged.
      def optimize_node(table, node)
        case node
        when ExpressionTree::LogicalOperation
          optimized_sub_nodes = node.nodes.collect do |sub_node|
            optimize_node(table, sub_node)
          end
          case node.operator
          when Operator::AND
            optimized_sub_nodes =
              optimize_and_sub_nodes(table, optimized_sub_nodes)
          end
          ExpressionTree::LogicalOperation.new(node.operator,
                                               optimized_sub_nodes)
        when ExpressionTree::BinaryOperation
          optimized_left = optimize_node(table, node.left)
          optimized_right = optimize_node(table, node.right)
          swapped_operator = nil
          if optimized_left.is_a?(ExpressionTree::Constant) and
              optimized_right.is_a?(ExpressionTree::Variable)
            swapped_operator = swap_operands_operator(node.operator)
          end
          if swapped_operator
            # Moving the variable to the left is only valid when the
            # operator is mirrored too: "1 < x" means "x > 1", not "x < 1".
            ExpressionTree::BinaryOperation.new(swapped_operator,
                                                optimized_right,
                                                optimized_left)
          elsif node.left == optimized_left and node.right == optimized_right
            node
          else
            ExpressionTree::BinaryOperation.new(node.operator,
                                                optimized_left,
                                                optimized_right)
          end
        else
          node
        end
      end

      # Returns the operator that keeps "left OP right" equivalent after
      # the operands are exchanged, or nil when the operator is not one
      # of the comparison operators handled here (the operands are then
      # left in their original order).
      def swap_operands_operator(operator)
        case operator
        when Operator::EQUAL, Operator::NOT_EQUAL
          operator
        when Operator::LESS
          Operator::GREATER
        when Operator::GREATER
          Operator::LESS
        when Operator::LESS_EQUAL
          Operator::GREATER_EQUAL
        when Operator::GREATER_EQUAL
          Operator::LESS_EQUAL
        else
          nil
        end
      end

      # Optimizes the operands of an AND: operands whose left side is a
      # variable are grouped by column and optimized per group, then all
      # operands are sorted by node.estimate_size(table).
      def optimize_and_sub_nodes(table, sub_nodes)
        grouped_sub_nodes = sub_nodes.group_by do |sub_node|
          case sub_node
          when ExpressionTree::BinaryOperation
            if sub_node.left.is_a?(ExpressionTree::Variable)
              sub_node.left.column
            else
              nil
            end
          else
            nil
          end
        end

        optimized_nodes = []
        grouped_sub_nodes.each do |column, grouped_nodes|
          # The nil group holds operands that are not "column OP ...".
          if column
            grouped_nodes = optimize_grouped_nodes(column, grouped_nodes)
          end
          optimized_nodes.concat(grouped_nodes)
        end

        optimized_nodes.sort_by do |node|
          node.estimate_size(table)
        end
      end

      COMPARISON_OPERATORS = [
        Operator::EQUAL,
        Operator::NOT_EQUAL,
        Operator::LESS,
        Operator::GREATER,
        Operator::LESS_EQUAL,
        Operator::GREATER_EQUAL,
      ]
      # Optimizes the operands that target the same column. When exactly
      # two of them are "column COMPARISON constant", they are replaced by
      # a between() call if possible. Operands that are not such
      # comparisons come first in the returned array.
      def optimize_grouped_nodes(column, grouped_nodes)
        target_nodes, done_nodes = grouped_nodes.partition do |node|
          node.is_a?(ExpressionTree::BinaryOperation) and
            COMPARISON_OPERATORS.include?(node.operator) and
            node.right.is_a?(ExpressionTree::Constant)
        end

        # TODO: target_nodes = remove_needless_nodes(target_nodes)
        # e.g.: x < 1 && x < 3 -> x < 1: (x < 3) is meaningless

        if target_nodes.size == 2
          between_node = try_optimize_between(column, target_nodes)
          if between_node
            done_nodes << between_node
          else
            done_nodes.concat(target_nodes)
          end
        else
          done_nodes.concat(target_nodes)
        end

        done_nodes
      end

      # Returns a between(column, min, min_border, max, max_border) call
      # built from target_nodes, or nil unless target_nodes contains both
      # a ">"/">=" comparison and a "<"/"<=" comparison.
      def try_optimize_between(column, target_nodes)
        greater_node = nil
        less_node = nil
        target_nodes.each do |node|
          case node.operator
          when Operator::GREATER, Operator::GREATER_EQUAL
            greater_node = node
          when Operator::LESS, Operator::LESS_EQUAL
            less_node = node
          end
        end
        return nil if greater_node.nil? or less_node.nil?

        between = ExpressionTree::Procedure.new(context["between"])
        # A strict comparison excludes its border value.
        if greater_node.operator == Operator::GREATER
          greater_border = "exclude"
        else
          greater_border = "include"
        end
        if less_node.operator == Operator::LESS
          less_border = "exclude"
        else
          less_border = "include"
        end
        arguments = [
          ExpressionTree::Variable.new(column),
          greater_node.right,
          ExpressionTree::Constant.new(greater_border),
          less_node.right,
          ExpressionTree::Constant.new(less_border),
        ]
        ExpressionTree::FunctionCall.new(between, arguments)
      end
    end
  end
end
# Builds one plugin per *_sources.am list in this directory.
# For an embedded build (GRN_EMBED) each plugin is a position independent
# static library; otherwise it is a loadable module without the "lib"
# prefix that is installed into ${GRN_FUNCTIONS_PLUGIN_DIR}.

include_directories(
  ${CMAKE_CURRENT_SOURCE_DIR}/../../lib
  )

set(GRN_FUNCTIONS_PLUGIN_DIR "${GRN_RELATIVE_PLUGINS_DIR}/functions")

read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/vector_sources.am
  VECTOR_SOURCES)
set_source_files_properties(${VECTOR_SOURCES}
  PROPERTIES
  COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}")
if(GRN_EMBED)
  add_library(vector_functions STATIC ${VECTOR_SOURCES})
  set_target_properties(
    vector_functions
    PROPERTIES
    POSITION_INDEPENDENT_CODE ON)
else()
  add_library(vector_functions MODULE ${VECTOR_SOURCES})
  set_target_properties(vector_functions PROPERTIES
    PREFIX ""
    OUTPUT_NAME "vector")
  install(TARGETS vector_functions DESTINATION "${GRN_FUNCTIONS_PLUGIN_DIR}")
endif()
target_link_libraries(vector_functions libgroonga)

read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/string_sources.am
  STRING_SOURCES)
set_source_files_properties(${STRING_SOURCES}
  PROPERTIES
  COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}")
if(GRN_EMBED)
  add_library(string_functions STATIC ${STRING_SOURCES})
  set_target_properties(
    string_functions
    PROPERTIES
    POSITION_INDEPENDENT_CODE ON)
else()
  add_library(string_functions MODULE ${STRING_SOURCES})
  set_target_properties(string_functions PROPERTIES
    PREFIX ""
    OUTPUT_NAME "string")
  install(TARGETS string_functions DESTINATION "${GRN_FUNCTIONS_PLUGIN_DIR}")
endif()
target_link_libraries(string_functions libgroonga)

read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/number_sources.am
  NUMBER_SOURCES)
set_source_files_properties(${NUMBER_SOURCES}
  PROPERTIES
  COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}")
if(GRN_EMBED)
  add_library(number_functions STATIC ${NUMBER_SOURCES})
  set_target_properties(
    number_functions
    PROPERTIES
    POSITION_INDEPENDENT_CODE ON)
else()
  add_library(number_functions MODULE ${NUMBER_SOURCES})
  set_target_properties(number_functions PROPERTIES
    PREFIX ""
    OUTPUT_NAME "number")
  install(TARGETS number_functions DESTINATION "${GRN_FUNCTIONS_PLUGIN_DIR}")
endif()
# number.c calls floor(), so the math library is linked.
target_link_libraries(number_functions libgroonga "${M_LIBS}")

read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/time_sources.am
  TIME_SOURCES)
set_source_files_properties(${TIME_SOURCES}
  PROPERTIES
  COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}")
if(GRN_EMBED)
  add_library(time_functions STATIC ${TIME_SOURCES})
  set_target_properties(
    time_functions
    PROPERTIES
    POSITION_INDEPENDENT_CODE ON)
else()
  add_library(time_functions MODULE ${TIME_SOURCES})
  set_target_properties(time_functions PROPERTIES
    PREFIX ""
    OUTPUT_NAME "time")
  install(TARGETS time_functions DESTINATION "${GRN_FUNCTIONS_PLUGIN_DIR}")
endif()
target_link_libraries(time_functions libgroonga)

read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/index_column_sources.am
  INDEX_COLUMN_SOURCES)
set_source_files_properties(${INDEX_COLUMN_SOURCES}
  PROPERTIES
  COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}")
if(GRN_EMBED)
  add_library(index_column_functions STATIC ${INDEX_COLUMN_SOURCES})
  set_target_properties(
    index_column_functions
    PROPERTIES
    POSITION_INDEPENDENT_CODE ON)
else()
  add_library(index_column_functions MODULE ${INDEX_COLUMN_SOURCES})
  set_target_properties(index_column_functions PROPERTIES
    PREFIX ""
    OUTPUT_NAME "index_column")
  install(TARGETS index_column_functions
    DESTINATION "${GRN_FUNCTIONS_PLUGIN_DIR}")
endif()
target_link_libraries(index_column_functions libgroonga)

read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/math_sources.am
  MATH_SOURCES)
set_source_files_properties(${MATH_SOURCES}
  PROPERTIES
  COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}")
if(GRN_EMBED)
  add_library(math_functions STATIC ${MATH_SOURCES})
  set_target_properties(
    math_functions
    PROPERTIES
    POSITION_INDEPENDENT_CODE ON)
else()
  add_library(math_functions MODULE ${MATH_SOURCES})
  set_target_properties(math_functions PROPERTIES
    PREFIX ""
    OUTPUT_NAME "math")
  install(TARGETS math_functions DESTINATION "${GRN_FUNCTIONS_PLUGIN_DIR}")
endif()
# math.c calls fabs(); link the math library as the Automake build does
# (math_la_LIBADD = -lm) and as number_functions does above.
target_link_libraries(math_functions libgroonga "${M_LIBS}")
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA +*/ + +#ifdef GRN_EMBEDDED +# define GRN_PLUGIN_FUNCTION_TAG functions_time +#endif + +#include <groonga/plugin.h> + +static grn_rc +selector_index_column_df_ratio_between(grn_ctx *ctx, + grn_obj *table, + grn_obj *index, + int n_args, + grn_obj **args, + grn_obj *res, + grn_operator op) +{ + grn_rc rc = GRN_SUCCESS; + grn_obj *index_column; + grn_ii *ii; + double min; + double max; + grn_obj *source_table; + unsigned int n_documents; + grn_posting posting; + + if ((n_args - 1) != 3) { + GRN_PLUGIN_ERROR(ctx, + GRN_INVALID_ARGUMENT, + "index_column_df_ratio_between(): " + "wrong number of arguments (%d for 3)", n_args - 1); + rc = ctx->rc; + goto exit; + } + + index_column = args[1]; + ii = (grn_ii *)index_column; + min = GRN_FLOAT_VALUE(args[2]); + max = GRN_FLOAT_VALUE(args[3]); + + source_table = grn_ctx_at(ctx, grn_obj_get_range(ctx, index_column)); + n_documents = grn_table_size(ctx, source_table); + memset(&posting, 0, sizeof(grn_posting)); + posting.sid = 1; + + if (op == GRN_OP_AND) { + GRN_TABLE_EACH_BEGIN(ctx, res, cursor, record_id) { + void *key; + grn_id term_id; + uint32_t n_match_documents; + double df_ratio; + + grn_table_cursor_get_key(ctx, cursor, &key); + term_id = *(grn_id *)key; + n_match_documents = grn_ii_estimate_size(ctx, ii, term_id); + if (n_match_documents > n_documents) { + n_match_documents = n_documents; + } + df_ratio = (double)n_match_documents / (double)n_documents; + if (min <= df_ratio && df_ratio <= max) { + posting.rid = term_id; + grn_ii_posting_add(ctx, &posting, (grn_hash *)res, op); + } + } GRN_TABLE_EACH_END(ctx, cursor); + grn_ii_resolve_sel_and(ctx, (grn_hash *)res, op); + } else { + GRN_TABLE_EACH_BEGIN(ctx, table, cursor, term_id) { + uint32_t 
n_match_documents; + double df_ratio; + + n_match_documents = grn_ii_estimate_size(ctx, ii, term_id); + if (n_match_documents > n_documents) { + n_match_documents = n_documents; + } + df_ratio = (double)n_match_documents / (double)n_documents; + { + void *key; + int key_size; + key_size = grn_table_cursor_get_key(ctx, cursor, &key); + } + if (min <= df_ratio && df_ratio <= max) { + posting.rid = term_id; + grn_ii_posting_add(ctx, &posting, (grn_hash *)res, op); + } + } GRN_TABLE_EACH_END(ctx, cursor); + } + +exit : + return rc; +} + +static grn_obj * +func_index_column_df_ratio(grn_ctx *ctx, + int n_args, + grn_obj **args, + grn_user_data *user_data) +{ + grn_obj *term_table; + grn_obj *index_column_name; + grn_obj *index_column; + grn_ii *ii; + grn_id term_id; + + if (n_args != 1) { + GRN_PLUGIN_ERROR(ctx, + GRN_INVALID_ARGUMENT, + "index_column_df_ratio(): " + "wrong number of arguments (%d for 1)", n_args - 1); + return NULL; + } + + { + grn_obj *expr; + grn_obj *variable; + + expr = grn_plugin_proc_get_caller(ctx, user_data); + if (!expr) { + GRN_PLUGIN_ERROR(ctx, + GRN_INVALID_ARGUMENT, + "index_column_df_ratio(): " + "called directly"); + return NULL; + } + + variable = grn_expr_get_var_by_offset(ctx, expr, 0); + if (!variable) { + GRN_PLUGIN_ERROR(ctx, + GRN_INVALID_ARGUMENT, + "index_column_df_ratio(): " + "caller expression must have target record information"); + return NULL; + } + + term_table = grn_ctx_at(ctx, variable->header.domain); + term_id = GRN_RECORD_VALUE(variable); + while (GRN_TRUE) { + grn_obj *key_type; + + key_type = grn_ctx_at(ctx, term_table->header.domain); + if (!grn_obj_is_table(ctx, key_type)) { + break; + } + + grn_table_get_key(ctx, term_table, term_id, &term_id, sizeof(grn_id)); + term_table = key_type; + } + } + + index_column_name = args[0]; + if (!grn_obj_is_text_family_bulk(ctx, index_column_name)) { + grn_obj inspected; + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, index_column_name); + GRN_PLUGIN_ERROR(ctx, 
+ GRN_INVALID_ARGUMENT, + "index_column_df_ratio(): " + "the first argument must be index column name: %.*s", + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + return NULL; + } + + index_column = grn_obj_column(ctx, + term_table, + GRN_TEXT_VALUE(index_column_name), + GRN_TEXT_LEN(index_column_name)); + if (!index_column) { + GRN_PLUGIN_ERROR(ctx, + GRN_INVALID_ARGUMENT, + "index_column_df_ratio(): " + "nonexistent object: <%.*s>", + (int)GRN_TEXT_LEN(index_column_name), + GRN_TEXT_VALUE(index_column_name)); + return NULL; + } + + if (!grn_obj_is_index_column(ctx, index_column)) { + grn_obj inspected; + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, index_column); + GRN_PLUGIN_ERROR(ctx, + GRN_INVALID_ARGUMENT, + "index_column_df_ratio(): " + "the first argument must be index column: %.*s", + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + if (grn_obj_is_accessor(ctx, index_column)) { + grn_obj_unlink(ctx, index_column); + } + return NULL; + } + + ii = (grn_ii *)index_column; + + { + grn_obj *source_table; + unsigned int n_documents; + uint32_t n_match_documents; + double df_ratio; + grn_obj *df_ratio_value; + + source_table = grn_ctx_at(ctx, grn_obj_get_range(ctx, index_column)); + n_documents = grn_table_size(ctx, source_table); + n_match_documents = grn_ii_estimate_size(ctx, ii, term_id); + if (n_match_documents > n_documents) { + n_match_documents = n_documents; + } + df_ratio = (double)n_match_documents / (double)n_documents; + + df_ratio_value = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_FLOAT, 0); + if (!df_ratio_value) { + return NULL; + } + GRN_FLOAT_SET(ctx, df_ratio_value, df_ratio); + return df_ratio_value; + } +} + +grn_rc +GRN_PLUGIN_INIT(grn_ctx *ctx) +{ + return ctx->rc; +} + +grn_rc +GRN_PLUGIN_REGISTER(grn_ctx *ctx) +{ + grn_obj *selector_proc; + + selector_proc = grn_proc_create(ctx, "index_column_df_ratio_between", -1, + 
GRN_PROC_FUNCTION, + NULL, NULL, NULL, 0, NULL); + grn_proc_set_selector(ctx, selector_proc, + selector_index_column_df_ratio_between); + grn_proc_set_selector_operator(ctx, selector_proc, GRN_OP_NOP); + + grn_proc_create(ctx, "index_column_df_ratio", -1, + GRN_PROC_FUNCTION, + func_index_column_df_ratio, NULL, NULL, 0, NULL); + + return ctx->rc; +} + +grn_rc +GRN_PLUGIN_FIN(grn_ctx *ctx) +{ + return GRN_SUCCESS; +} diff --git a/storage/mroonga/vendor/groonga/plugins/functions/index_column_sources.am b/storage/mroonga/vendor/groonga/plugins/functions/index_column_sources.am new file mode 100644 index 00000000..261907bc --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/functions/index_column_sources.am @@ -0,0 +1,2 @@ +index_column_la_SOURCES = \ + index_column.c diff --git a/storage/mroonga/vendor/groonga/plugins/functions/math.c b/storage/mroonga/vendor/groonga/plugins/functions/math.c new file mode 100644 index 00000000..a6a9e260 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/functions/math.c @@ -0,0 +1,142 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2017 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA +*/ + +#ifdef GRN_EMBEDDED +# define GRN_PLUGIN_FUNCTION_TAG functions_math +#endif + +#include <groonga/plugin.h> + +#include <math.h> +#include <stdlib.h> + +static grn_obj * +func_math_abs(grn_ctx *ctx, int n_args, grn_obj **args, + grn_user_data *user_data) +{ + grn_obj *number; + grn_obj *grn_abs_number = NULL; + + if (n_args != 1) { + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "math_abs(): wrong number of arguments (%d for 1)", + n_args); + return NULL; + } + + number = args[0]; + if (!(number->header.type == GRN_BULK && + grn_type_id_is_number_family(ctx, number->header.domain))) { + grn_obj inspected; + + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, number); + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "math_abs(): the first argument must be a number: " + "<%.*s>", + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + return NULL; + } + +#define ABS_AS_IS(return_type, to_type, getter, setter) { \ + grn_abs_number = grn_plugin_proc_alloc(ctx, \ + user_data, \ + (return_type), \ + 0); \ + if (!grn_abs_number) { \ + return NULL; \ + } \ + setter(ctx, grn_abs_number, getter(number)); \ + } +#define ABS_CONVERT_TYPE(func, return_type, to_type, getter, setter) { \ + grn_abs_number = grn_plugin_proc_alloc(ctx, \ + user_data, \ + (return_type), \ + 0); \ + if (!grn_abs_number) { \ + return NULL; \ + } else { \ + to_type abs_number_raw = (to_type)(func)(getter(number)); \ + setter(ctx, grn_abs_number, abs_number_raw); \ + } \ + } + + switch (number->header.domain) { + case GRN_DB_INT8: + ABS_CONVERT_TYPE(abs, GRN_DB_UINT8, uint8_t, GRN_INT8_VALUE, GRN_UINT8_SET); + break; + case GRN_DB_UINT8: + ABS_AS_IS(GRN_DB_UINT8, uint8_t, GRN_UINT8_VALUE, GRN_UINT8_SET); + break; + case 
GRN_DB_INT16: + ABS_CONVERT_TYPE(abs, GRN_DB_UINT16, uint16_t, GRN_INT16_VALUE, GRN_UINT16_SET); + break; + case GRN_DB_UINT16: + ABS_AS_IS(GRN_DB_UINT16, uint16_t, GRN_UINT16_VALUE, GRN_UINT16_SET); + break; + case GRN_DB_INT32: + ABS_CONVERT_TYPE(labs, GRN_DB_UINT32, uint32_t, GRN_INT32_VALUE, GRN_UINT32_SET); + break; + case GRN_DB_UINT32: + ABS_AS_IS(GRN_DB_UINT32, uint32_t, GRN_UINT32_VALUE, GRN_UINT32_SET); + break; + case GRN_DB_INT64: + ABS_CONVERT_TYPE(llabs, GRN_DB_UINT64, uint64_t, GRN_INT64_VALUE, GRN_UINT64_SET); + break; + case GRN_DB_UINT64: + ABS_AS_IS(GRN_DB_UINT64, uint64_t, GRN_UINT64_VALUE, GRN_UINT64_SET); + break; + case GRN_DB_FLOAT: + ABS_CONVERT_TYPE(fabs, GRN_DB_FLOAT, double, GRN_FLOAT_VALUE, GRN_FLOAT_SET); + break; + default : + break; + } +#undef ABS_CONVERT_TYPE +#undef ABS_AS_IS + + return grn_abs_number; +} + +grn_rc +GRN_PLUGIN_INIT(grn_ctx *ctx) +{ + return ctx->rc; +} + +grn_rc +GRN_PLUGIN_REGISTER(grn_ctx *ctx) +{ + grn_rc rc = GRN_SUCCESS; + + grn_proc_create(ctx, + "math_abs", -1, + GRN_PROC_FUNCTION, + func_math_abs, + NULL, NULL, 0, NULL); + + return rc; +} + +grn_rc +GRN_PLUGIN_FIN(grn_ctx *ctx) +{ + return GRN_SUCCESS; +} diff --git a/storage/mroonga/vendor/groonga/plugins/functions/math_sources.am b/storage/mroonga/vendor/groonga/plugins/functions/math_sources.am new file mode 100644 index 00000000..8c14ca74 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/functions/math_sources.am @@ -0,0 +1,2 @@ +math_la_SOURCES = \ + math.c diff --git a/storage/mroonga/vendor/groonga/plugins/functions/number.c b/storage/mroonga/vendor/groonga/plugins/functions/number.c new file mode 100644 index 00000000..7cdfc0e1 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/functions/number.c @@ -0,0 +1,187 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2016 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as 
published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA +*/ + +#ifdef GRN_EMBEDDED +# define GRN_PLUGIN_FUNCTION_TAG functions_number +#endif + +#include <groonga/plugin.h> + +#include <math.h> + +static grn_obj * +func_number_classify(grn_ctx *ctx, int n_args, grn_obj **args, + grn_user_data *user_data) +{ + grn_obj *number; + grn_obj *interval; + grn_obj casted_interval; + grn_obj *classed_number; + + if (n_args != 2) { + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "number_classify(): wrong number of arguments (%d for 2)", + n_args); + return NULL; + } + + number = args[0]; + if (!(number->header.type == GRN_BULK && + grn_type_id_is_number_family(ctx, number->header.domain))) { + grn_obj inspected; + + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, number); + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "number_classify(): the first argument must be a number: " + "<%.*s>", + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + return NULL; + } + + interval = args[1]; + if (!(interval->header.type == GRN_BULK && + grn_type_id_is_number_family(ctx, interval->header.domain))) { + grn_obj inspected; + + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, interval); + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "number_classify(): the second argument must be a number: " + "<%.*s>", + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + return NULL; + } + + classed_number = grn_plugin_proc_alloc(ctx, + 
user_data, + number->header.domain, + 0); + if (!classed_number) { + return NULL; + } + + GRN_VALUE_FIX_SIZE_INIT(&casted_interval, 0, number->header.domain); + grn_obj_cast(ctx, interval, &casted_interval, GRN_FALSE); + +#define CLASSIFY_RAW(type, getter, setter, classifier) { \ + type number_raw; \ + type interval_raw; \ + type class_raw; \ + type classed_number_raw; \ + \ + number_raw = getter(number); \ + interval_raw = getter(&casted_interval); \ + class_raw = classifier(number_raw, interval_raw); \ + classed_number_raw = class_raw * interval_raw; \ + setter(ctx, classed_number, classed_number_raw); \ + } + +#define CLASSIFIER_INT(number_raw, interval_raw) \ + (number_raw) < 0 ? \ + ((((number_raw) + 1) / (interval_raw)) - 1) : \ + (((number_raw) / (interval_raw))) + +#define CLASSIFY_INT(type, getter, setter) \ + CLASSIFY_RAW(type, getter, setter, CLASSIFIER_INT) + +#define CLASSIFIER_UINT(number_raw, interval_raw) \ + ((number_raw) / (interval_raw)) + +#define CLASSIFY_UINT(type, getter, setter) \ + CLASSIFY_RAW(type, getter, setter, CLASSIFIER_UINT) + +#define CLASSIFIER_FLOAT(number_raw, interval_raw) \ + floor((number_raw) / (interval_raw)) + +#define CLASSIFY_FLOAT(getter, setter) \ + CLASSIFY_RAW(double, getter, setter, CLASSIFIER_FLOAT) + + switch (number->header.domain) { + case GRN_DB_INT8 : + CLASSIFY_INT(int8_t, GRN_INT8_VALUE, GRN_INT8_SET); + break; + case GRN_DB_UINT8 : + CLASSIFY_UINT(uint8_t, GRN_UINT8_VALUE, GRN_UINT8_SET); + break; + case GRN_DB_INT16 : + CLASSIFY_INT(int16_t, GRN_INT16_VALUE, GRN_INT16_SET); + break; + case GRN_DB_UINT16 : + CLASSIFY_UINT(uint16_t, GRN_UINT16_VALUE, GRN_UINT16_SET); + break; + case GRN_DB_INT32 : + CLASSIFY_INT(int32_t, GRN_INT32_VALUE, GRN_INT32_SET); + break; + case GRN_DB_UINT32 : + CLASSIFY_UINT(uint32_t, GRN_UINT32_VALUE, GRN_UINT32_SET); + break; + case GRN_DB_INT64 : + CLASSIFY_INT(int64_t, GRN_INT64_VALUE, GRN_INT64_SET); + break; + case GRN_DB_UINT64 : + CLASSIFY_UINT(uint64_t, GRN_UINT64_VALUE, 
GRN_UINT64_SET); + break; + case GRN_DB_FLOAT : + CLASSIFY_FLOAT(GRN_FLOAT_VALUE, GRN_FLOAT_SET); + break; + default : + break; + } +#undef CLASSIFY_FLOAT +#undef CLASSIFIER_FLAOT +#undef CLASSIFY_UINT +#undef CLASSIFIER_UINT +#undef CLASSIFY_INT +#undef CLASSIFIER_INT +#undef CLASSIFY_RAW + + GRN_OBJ_FIN(ctx, &casted_interval); + + return classed_number; +} + +grn_rc +GRN_PLUGIN_INIT(grn_ctx *ctx) +{ + return ctx->rc; +} + +grn_rc +GRN_PLUGIN_REGISTER(grn_ctx *ctx) +{ + grn_rc rc = GRN_SUCCESS; + + grn_proc_create(ctx, + "number_classify", -1, + GRN_PROC_FUNCTION, + func_number_classify, + NULL, NULL, 0, NULL); + + return rc; +} + +grn_rc +GRN_PLUGIN_FIN(grn_ctx *ctx) +{ + return GRN_SUCCESS; +} diff --git a/storage/mroonga/vendor/groonga/plugins/functions/number_sources.am b/storage/mroonga/vendor/groonga/plugins/functions/number_sources.am new file mode 100644 index 00000000..b3d9483b --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/functions/number_sources.am @@ -0,0 +1,2 @@ +number_la_SOURCES = \ + number.c diff --git a/storage/mroonga/vendor/groonga/plugins/functions/string.c b/storage/mroonga/vendor/groonga/plugins/functions/string.c new file mode 100644 index 00000000..0af2d6ab --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/functions/string.c @@ -0,0 +1,299 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2016 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA +*/ + +#ifdef GRN_EMBEDDED +# define GRN_PLUGIN_FUNCTION_TAG functions_string +#endif + +#include <groonga/plugin.h> + +/* + * func_string_length() returns the number of characters in a string. + * If the string contains an invalid byte sequence, this function returns the + * number of characters before the invalid byte sequence. + */ +static grn_obj * +func_string_length(grn_ctx *ctx, int n_args, grn_obj **args, + grn_user_data *user_data) +{ + grn_obj *target; + unsigned int length = 0; + grn_obj *grn_length; + + if (n_args != 1) { + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "string_length(): wrong number of arguments (%d for 1)", + n_args); + return NULL; + } + + target = args[0]; + if (!(target->header.type == GRN_BULK && + ((target->header.domain == GRN_DB_SHORT_TEXT) || + (target->header.domain == GRN_DB_TEXT) || + (target->header.domain == GRN_DB_LONG_TEXT)))) { + grn_obj inspected; + + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, target); + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "string_length(): target object must be a text bulk: " + "<%.*s>", + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + return NULL; + } + + { + const char *s = GRN_TEXT_VALUE(target); + const char *e = GRN_TEXT_VALUE(target) + GRN_TEXT_LEN(target); + const char *p; + unsigned int cl = 0; + for (p = s; p < e && (cl = grn_charlen(ctx, p, e)); p += cl) { + length++; + } + } + + grn_length = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_UINT32, 0); + if (!grn_length) { + return NULL; + } + + GRN_UINT32_SET(ctx, grn_length, length); + + return grn_length; +} + +static grn_obj * +func_string_substring(grn_ctx *ctx, int n_args, grn_obj **args, + grn_user_data *user_data) +{ + grn_obj *target; + grn_obj 
*from_raw; + grn_obj *length_raw = NULL; + int64_t from = 0; + int64_t length = -1; + const char *start = NULL; + const char *end = NULL; + grn_obj *substring; + + if (n_args < 2) { + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "string_substring(): wrong number of arguments (%d for 2..3)", + n_args); + return NULL; + } + + target = args[0]; + from_raw = args[1]; + if (n_args == 3) { + length_raw = args[2]; + } + + if (!(target->header.type == GRN_BULK && + grn_type_id_is_text_family(ctx, target->header.domain))) { + grn_obj inspected; + + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, target); + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "string_substring(): target object must be a text bulk: " + "<%.*s>", + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + return NULL; + } + + /* TODO: extract as grn_func_arg_int64() */ + if (!grn_type_id_is_number_family(ctx, from_raw->header.domain)) { + grn_obj inspected; + + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, from_raw); + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "string_substring(): from must be a number: <%.*s>", + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + return NULL; + } + if (from_raw->header.domain == GRN_DB_INT32) { + from = GRN_INT32_VALUE(from_raw); + } else if (from_raw->header.domain == GRN_DB_INT64) { + from = GRN_INT64_VALUE(from_raw); + } else { + grn_obj buffer; + grn_rc rc; + + GRN_INT64_INIT(&buffer, 0); + rc = grn_obj_cast(ctx, from_raw, &buffer, GRN_FALSE); + if (rc == GRN_SUCCESS) { + from = GRN_INT64_VALUE(&buffer); + } + GRN_OBJ_FIN(ctx, &buffer); + + if (rc != GRN_SUCCESS) { + grn_obj inspected; + + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, from_raw); + GRN_PLUGIN_ERROR(ctx, rc, + "string_substring(): " + "failed to cast from value to number: <%.*s>", + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, 
&inspected); + return NULL; + } + } + + if (length_raw) { + /* TODO: extract as grn_func_arg_int64() */ + if (!grn_type_id_is_number_family(ctx, length_raw->header.domain)) { + grn_obj inspected; + + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, length_raw); + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "string_substring(): length must be a number: <%.*s>", + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + return NULL; + } + if (length_raw->header.domain == GRN_DB_INT32) { + length = GRN_INT32_VALUE(length_raw); + } else if (length_raw->header.domain == GRN_DB_INT64) { + length = GRN_INT64_VALUE(length_raw); + } else { + grn_obj buffer; + grn_rc rc; + + GRN_INT64_INIT(&buffer, 0); + rc = grn_obj_cast(ctx, length_raw, &buffer, GRN_FALSE); + if (rc == GRN_SUCCESS) { + length = GRN_INT64_VALUE(&buffer); + } + GRN_OBJ_FIN(ctx, &buffer); + + if (rc != GRN_SUCCESS) { + grn_obj inspected; + + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, length_raw); + GRN_PLUGIN_ERROR(ctx, rc, + "string_substring(): " + "failed to cast length value to number: <%.*s>", + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + return NULL; + } + } + } + + substring = grn_plugin_proc_alloc(ctx, user_data, target->header.domain, 0); + if (!substring) { + return NULL; + } + + GRN_BULK_REWIND(substring); + + if (GRN_TEXT_LEN(target) == 0) { + return substring; + } + if (length == 0) { + return substring; + } + + while (from < 0) { + from += GRN_TEXT_LEN(target); + } + + { + const char *p; + + start = NULL; + p = GRN_TEXT_VALUE(target); + end = p + GRN_TEXT_LEN(target); + + if (from == 0) { + start = p; + } else { + unsigned int char_length = 0; + int64_t n_chars = 0; + + for (; + p < end && (char_length = grn_charlen(ctx, p, end)); + p += char_length, n_chars++) { + if (n_chars == from) { + start = p; + break; + } + } + } + + if (start && length > 0) { + unsigned int 
char_length = 0; + int64_t n_chars = 0; + + for (; + p < end && (char_length = grn_charlen(ctx, p, end)); + p += char_length, n_chars++) { + if (n_chars == length) { + end = p; + break; + } + } + } + } + + if (start) { + GRN_TEXT_SET(ctx, substring, start, end - start); + } + + return substring; +} + +grn_rc +GRN_PLUGIN_INIT(grn_ctx *ctx) +{ + return ctx->rc; +} + +grn_rc +GRN_PLUGIN_REGISTER(grn_ctx *ctx) +{ + grn_rc rc = GRN_SUCCESS; + + grn_proc_create(ctx, "string_length", -1, GRN_PROC_FUNCTION, func_string_length, + NULL, NULL, 0, NULL); + + grn_proc_create(ctx, "string_substring", -1, GRN_PROC_FUNCTION, func_string_substring, + NULL, NULL, 0, NULL); + + return rc; +} + +grn_rc +GRN_PLUGIN_FIN(grn_ctx *ctx) +{ + return GRN_SUCCESS; +} diff --git a/storage/mroonga/vendor/groonga/plugins/functions/string_sources.am b/storage/mroonga/vendor/groonga/plugins/functions/string_sources.am new file mode 100644 index 00000000..3477e58a --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/functions/string_sources.am @@ -0,0 +1,2 @@ +string_la_SOURCES = \ + string.c diff --git a/storage/mroonga/vendor/groonga/plugins/functions/time.c b/storage/mroonga/vendor/groonga/plugins/functions/time.c new file mode 100644 index 00000000..f82ea872 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/functions/time.c @@ -0,0 +1,376 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2016 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA +*/ + +#ifdef GRN_EMBEDDED +# define GRN_PLUGIN_FUNCTION_TAG functions_time +#endif + +#include <groonga/plugin.h> + +#include <math.h> + +typedef enum { + GRN_TIME_CLASSIFY_UNIT_SECOND, + GRN_TIME_CLASSIFY_UNIT_MINUTE, + GRN_TIME_CLASSIFY_UNIT_HOUR, + GRN_TIME_CLASSIFY_UNIT_DAY, + GRN_TIME_CLASSIFY_UNIT_WEEK, + GRN_TIME_CLASSIFY_UNIT_MONTH, + GRN_TIME_CLASSIFY_UNIT_YEAR +} grn_time_classify_unit; + +static grn_obj * +func_time_classify_raw(grn_ctx *ctx, + int n_args, + grn_obj **args, + grn_user_data *user_data, + const char *function_name, + grn_time_classify_unit unit) +{ + grn_obj *time; + uint32_t interval_raw = 1; + grn_obj *classed_time; + grn_bool accept_interval = GRN_TRUE; + + switch (unit) { + case GRN_TIME_CLASSIFY_UNIT_SECOND : + case GRN_TIME_CLASSIFY_UNIT_MINUTE : + case GRN_TIME_CLASSIFY_UNIT_HOUR : + accept_interval = GRN_TRUE; + break; + case GRN_TIME_CLASSIFY_UNIT_DAY : + case GRN_TIME_CLASSIFY_UNIT_WEEK : + accept_interval = GRN_FALSE; + break; + case GRN_TIME_CLASSIFY_UNIT_MONTH : + case GRN_TIME_CLASSIFY_UNIT_YEAR : + accept_interval = GRN_TRUE; + break; + } + + if (accept_interval) { + if (!(n_args == 1 || n_args == 2)) { + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "%s(): " + "wrong number of arguments (%d for 1..2)", + function_name, + n_args); + return NULL; + } + } else { + if (n_args != 1) { + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "%s(): " + "wrong number of arguments (%d for 1)", + function_name, + n_args); + return NULL; + } + } + + time = args[0]; + if (!(time->header.type == GRN_BULK && + time->header.domain == GRN_DB_TIME)) { + grn_obj inspected; + + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, time); + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "%s(): " + "the first argument must be a 
time: " + "<%.*s>", + function_name, + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + return NULL; + } + + if (n_args == 2) { + grn_obj *interval; + grn_obj casted_interval; + + interval = args[1]; + if (!(interval->header.type == GRN_BULK && + grn_type_id_is_number_family(ctx, interval->header.domain))) { + grn_obj inspected; + + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, interval); + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "%s(): " + "the second argument must be a number: " + "<%.*s>", + function_name, + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + return NULL; + } + + GRN_VALUE_FIX_SIZE_INIT(&casted_interval, 0, GRN_DB_UINT32); + grn_obj_cast(ctx, interval, &casted_interval, GRN_FALSE); + interval_raw = GRN_UINT32_VALUE(&casted_interval); + GRN_OBJ_FIN(ctx, &casted_interval); + + if (interval_raw == 0) { + grn_obj inspected; + + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, interval); + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "%s(): " + "the second argument must not be zero: " + "<%.*s>", + function_name, + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + return NULL; + } + } + + { + int64_t time_raw; + struct tm tm; + int64_t classed_time_raw; + + time_raw = GRN_TIME_VALUE(time); + if (!grn_time_to_tm(ctx, time_raw, &tm)) { + return NULL; + } + + switch (unit) { + case GRN_TIME_CLASSIFY_UNIT_SECOND : + tm.tm_sec = (tm.tm_sec / interval_raw) * interval_raw; + break; + case GRN_TIME_CLASSIFY_UNIT_MINUTE : + tm.tm_min = (tm.tm_min / interval_raw) * interval_raw; + tm.tm_sec = 0; + break; + case GRN_TIME_CLASSIFY_UNIT_HOUR : + tm.tm_hour = (tm.tm_hour / interval_raw) * interval_raw; + tm.tm_min = 0; + tm.tm_sec = 0; + break; + case GRN_TIME_CLASSIFY_UNIT_DAY : + tm.tm_hour = 0; + tm.tm_min = 0; + tm.tm_sec = 0; + break; + case GRN_TIME_CLASSIFY_UNIT_WEEK : + if ((tm.tm_mday - 
tm.tm_wday) >= 0) { + tm.tm_mday -= tm.tm_wday; + } else { + int n_underflowed_mday = -(tm.tm_mday - tm.tm_wday); + int mday; + int max_mday = 31; + + if (tm.tm_mon == 0) { + tm.tm_year--; + tm.tm_mon = 11; + } else { + tm.tm_mon--; + } + + for (mday = max_mday; mday > n_underflowed_mday; mday--) { + int64_t unused; + tm.tm_mday = mday; + if (grn_time_from_tm(ctx, &unused, &tm)) { + break; + } + } + tm.tm_mday -= n_underflowed_mday; + } + tm.tm_hour = 0; + tm.tm_min = 0; + tm.tm_sec = 0; + break; + case GRN_TIME_CLASSIFY_UNIT_MONTH : + tm.tm_mon = (tm.tm_mon / interval_raw) * interval_raw; + tm.tm_mday = 1; + tm.tm_hour = 0; + tm.tm_min = 0; + tm.tm_sec = 0; + break; + case GRN_TIME_CLASSIFY_UNIT_YEAR : + tm.tm_year = (((1900 + tm.tm_year) / interval_raw) * interval_raw) - 1900; + tm.tm_mon = 0; + tm.tm_mday = 1; + tm.tm_hour = 0; + tm.tm_min = 0; + tm.tm_sec = 0; + break; + } + + if (!grn_time_from_tm(ctx, &classed_time_raw, &tm)) { + return NULL; + } + + classed_time = grn_plugin_proc_alloc(ctx, + user_data, + time->header.domain, + 0); + if (!classed_time) { + return NULL; + } + GRN_TIME_SET(ctx, classed_time, classed_time_raw); + + return classed_time; + } +} + +static grn_obj * +func_time_classify_second(grn_ctx *ctx, int n_args, grn_obj **args, + grn_user_data *user_data) +{ + return func_time_classify_raw(ctx, + n_args, + args, + user_data, + "time_classify_second", + GRN_TIME_CLASSIFY_UNIT_SECOND); +} + +static grn_obj * +func_time_classify_minute(grn_ctx *ctx, int n_args, grn_obj **args, + grn_user_data *user_data) +{ + return func_time_classify_raw(ctx, + n_args, + args, + user_data, + "time_classify_minute", + GRN_TIME_CLASSIFY_UNIT_MINUTE); +} + +static grn_obj * +func_time_classify_hour(grn_ctx *ctx, int n_args, grn_obj **args, + grn_user_data *user_data) +{ + return func_time_classify_raw(ctx, + n_args, + args, + user_data, + "time_classify_hour", + GRN_TIME_CLASSIFY_UNIT_HOUR); +} + +static grn_obj * +func_time_classify_day(grn_ctx *ctx, int n_args, 
grn_obj **args, + grn_user_data *user_data) +{ + return func_time_classify_raw(ctx, + n_args, + args, + user_data, + "time_classify_day", + GRN_TIME_CLASSIFY_UNIT_DAY); +} + +static grn_obj * +func_time_classify_week(grn_ctx *ctx, int n_args, grn_obj **args, + grn_user_data *user_data) +{ + return func_time_classify_raw(ctx, + n_args, + args, + user_data, + "time_classify_week", + GRN_TIME_CLASSIFY_UNIT_WEEK); +} + +static grn_obj * +func_time_classify_month(grn_ctx *ctx, int n_args, grn_obj **args, + grn_user_data *user_data) +{ + return func_time_classify_raw(ctx, + n_args, + args, + user_data, + "time_classify_month", + GRN_TIME_CLASSIFY_UNIT_MONTH); +} + +static grn_obj * +func_time_classify_year(grn_ctx *ctx, int n_args, grn_obj **args, + grn_user_data *user_data) +{ + return func_time_classify_raw(ctx, + n_args, + args, + user_data, + "time_classify_year", + GRN_TIME_CLASSIFY_UNIT_YEAR); +} + +grn_rc +GRN_PLUGIN_INIT(grn_ctx *ctx) +{ + return ctx->rc; +} + +grn_rc +GRN_PLUGIN_REGISTER(grn_ctx *ctx) +{ + grn_rc rc = GRN_SUCCESS; + + grn_proc_create(ctx, + "time_classify_second", -1, + GRN_PROC_FUNCTION, + func_time_classify_second, + NULL, NULL, 0, NULL); + grn_proc_create(ctx, + "time_classify_minute", -1, + GRN_PROC_FUNCTION, + func_time_classify_minute, + NULL, NULL, 0, NULL); + grn_proc_create(ctx, + "time_classify_hour", -1, + GRN_PROC_FUNCTION, + func_time_classify_hour, + NULL, NULL, 0, NULL); + grn_proc_create(ctx, + "time_classify_day", -1, + GRN_PROC_FUNCTION, + func_time_classify_day, + NULL, NULL, 0, NULL); + grn_proc_create(ctx, + "time_classify_week", -1, + GRN_PROC_FUNCTION, + func_time_classify_week, + NULL, NULL, 0, NULL); + grn_proc_create(ctx, + "time_classify_month", -1, + GRN_PROC_FUNCTION, + func_time_classify_month, + NULL, NULL, 0, NULL); + grn_proc_create(ctx, + "time_classify_year", -1, + GRN_PROC_FUNCTION, + func_time_classify_year, + NULL, NULL, 0, NULL); + + return rc; +} + +grn_rc +GRN_PLUGIN_FIN(grn_ctx *ctx) +{ + return 
GRN_SUCCESS; +} diff --git a/storage/mroonga/vendor/groonga/plugins/functions/time_sources.am b/storage/mroonga/vendor/groonga/plugins/functions/time_sources.am new file mode 100644 index 00000000..2c55a570 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/functions/time_sources.am @@ -0,0 +1,2 @@ +time_la_SOURCES = \ + time.c diff --git a/storage/mroonga/vendor/groonga/plugins/functions/vector.c b/storage/mroonga/vendor/groonga/plugins/functions/vector.c new file mode 100644 index 00000000..1104b313 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/functions/vector.c @@ -0,0 +1,401 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2015-2017 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA +*/ + +#ifdef GRN_EMBEDDED +# define GRN_PLUGIN_FUNCTION_TAG functions_vector +#endif + +#include <groonga/plugin.h> + +static grn_obj * +func_vector_size(grn_ctx *ctx, int n_args, grn_obj **args, + grn_user_data *user_data) +{ + grn_obj *target; + unsigned int size; + grn_obj *grn_size; + + if (n_args != 1) { + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "vector_size(): wrong number of arguments (%d for 1)", + n_args); + return NULL; + } + + target = args[0]; + switch (target->header.type) { + case GRN_VECTOR : + case GRN_PVECTOR : + case GRN_UVECTOR : + size = grn_vector_size(ctx, target); + break; + default : + { + grn_obj inspected; + + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, target, &inspected); + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "vector_size(): target object must be vector: <%.*s>", + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + return NULL; + } + break; + } + + grn_size = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_UINT32, 0); + if (!grn_size) { + return NULL; + } + + GRN_UINT32_SET(ctx, grn_size, size); + + return grn_size; +} + +static grn_obj * +func_vector_slice(grn_ctx *ctx, int n_args, grn_obj **args, + grn_user_data *user_data) +{ + grn_obj *target; + grn_obj *from_raw = NULL; + grn_obj *length_raw = NULL; + int64_t from = 0; + int64_t length = -1; + uint32_t to = 0; + uint32_t size = 0; + grn_obj *slice; + + if (n_args < 2 || n_args > 3) { + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "vector_slice(): wrong number of arguments (%d for 2..3)", + n_args); + return NULL; + } + + target = args[0]; + from_raw = args[1]; + if (n_args == 3) { + length_raw = args[2]; + } + switch (target->header.type) { + case GRN_VECTOR : + case GRN_PVECTOR : + case GRN_UVECTOR : + 
size = grn_vector_size(ctx, target); + break; + default : + { + grn_obj inspected; + + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, target, &inspected); + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "vector_slice(): target object must be vector: <%.*s>", + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + return NULL; + } + break; + } + + if (!grn_type_id_is_number_family(ctx, from_raw->header.domain)) { + grn_obj inspected; + + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, from_raw); + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "vector_slice(): from must be a number: <%.*s>", + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + return NULL; + } + if (from_raw->header.domain == GRN_DB_INT32) { + from = GRN_INT32_VALUE(from_raw); + } else if (from_raw->header.domain == GRN_DB_INT64) { + from = GRN_INT64_VALUE(from_raw); + } else { + grn_obj buffer; + grn_rc rc; + + GRN_INT64_INIT(&buffer, 0); + rc = grn_obj_cast(ctx, from_raw, &buffer, GRN_FALSE); + if (rc == GRN_SUCCESS) { + from = GRN_INT64_VALUE(&buffer); + } + GRN_OBJ_FIN(ctx, &buffer); + + if (rc != GRN_SUCCESS) { + grn_obj inspected; + + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, from_raw); + GRN_PLUGIN_ERROR(ctx, rc, + "vector_slice(): " + "failed to cast from value to number: <%.*s>", + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + return NULL; + } + } + + if (length_raw) { + if (!grn_type_id_is_number_family(ctx, length_raw->header.domain)) { + grn_obj inspected; + + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, length_raw); + GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, + "vector_slice(): length must be a number: <%.*s>", + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + return NULL; + } + if (length_raw->header.domain == GRN_DB_INT32) { + length = 
GRN_INT32_VALUE(length_raw); + } else if (length_raw->header.domain == GRN_DB_INT64) { + length = GRN_INT64_VALUE(length_raw); + } else { + grn_obj buffer; + grn_rc rc; + + GRN_INT64_INIT(&buffer, 0); + rc = grn_obj_cast(ctx, length_raw, &buffer, GRN_FALSE); + if (rc == GRN_SUCCESS) { + length = GRN_INT64_VALUE(&buffer); + } + GRN_OBJ_FIN(ctx, &buffer); + + if (rc != GRN_SUCCESS) { + grn_obj inspected; + + GRN_TEXT_INIT(&inspected, 0); + grn_inspect(ctx, &inspected, length_raw); + GRN_PLUGIN_ERROR(ctx, rc, + "vector_slice(): " + "failed to cast length value to number: <%.*s>", + (int)GRN_TEXT_LEN(&inspected), + GRN_TEXT_VALUE(&inspected)); + GRN_OBJ_FIN(ctx, &inspected); + return NULL; + } + } + } + + slice = grn_plugin_proc_alloc(ctx, user_data, target->header.domain, GRN_OBJ_VECTOR); + if (!slice) { + return NULL; + } + + if (target->header.flags & GRN_OBJ_WITH_WEIGHT) { + slice->header.flags |= GRN_OBJ_WITH_WEIGHT; + } + + if (length < 0) { + length = size + length + 1; + } + + if (length > size) { + length = size; + } + + if (length <= 0) { + return slice; + } + + while (from < 0) { + from += size; + } + + to = from + length; + if (to > size) { + to = size; + } + + switch (target->header.type) { + case GRN_VECTOR : + { + unsigned int i; + for (i = from; i < to; i++) { + const char *content; + unsigned int content_length; + unsigned int weight; + grn_id domain; + content_length = grn_vector_get_element(ctx, target, i, + &content, &weight, &domain); + grn_vector_add_element(ctx, slice, + content, content_length, weight, domain); + } + } + break; + case GRN_PVECTOR : + { + unsigned int i; + for (i = from; i < to; i++) { + grn_obj *element = GRN_PTR_VALUE_AT(target, i); + GRN_PTR_PUT(ctx, slice, element); + } + } + break; + case GRN_UVECTOR : + { + grn_obj *domain; + + domain = grn_ctx_at(ctx, target->header.domain); + if (grn_obj_is_table(ctx, domain)) { + unsigned int i; + for (i = from; i < to; i++) { + grn_id id; + unsigned int weight; + id = 
grn_uvector_get_element(ctx, target, i, &weight); + grn_uvector_add_element(ctx, slice, id, weight); + } + } else { +#define PUT_SLICE_VALUES(type) do { \ + unsigned int i; \ + for (i = from; i < to; i++) { \ + GRN_ ## type ## _PUT(ctx, \ + slice, \ + GRN_ ## type ## _VALUE_AT(target, i)); \ + } \ + } while (GRN_FALSE) + switch (target->header.domain) { + case GRN_DB_BOOL : + PUT_SLICE_VALUES(BOOL); + break; + case GRN_DB_INT8 : + PUT_SLICE_VALUES(INT8); + break; + case GRN_DB_UINT8 : + PUT_SLICE_VALUES(UINT8); + break; + case GRN_DB_INT16 : + PUT_SLICE_VALUES(INT16); + break; + case GRN_DB_UINT16 : + PUT_SLICE_VALUES(UINT16); + break; + case GRN_DB_INT32 : + PUT_SLICE_VALUES(INT32); + break; + case GRN_DB_UINT32 : + PUT_SLICE_VALUES(UINT32); + break; + case GRN_DB_INT64 : + PUT_SLICE_VALUES(INT64); + break; + case GRN_DB_UINT64 : + PUT_SLICE_VALUES(UINT64); + break; + case GRN_DB_FLOAT : + PUT_SLICE_VALUES(FLOAT); + break; + case GRN_DB_TIME : + PUT_SLICE_VALUES(TIME); + break; + } + } + } + break; +#undef PUT_SLICE_VALUES + } + + return slice; +} + +static grn_obj * +func_vector_new(grn_ctx *ctx, int n_args, grn_obj **args, + grn_user_data *user_data) +{ + grn_obj *vector = NULL; + int i; + + if (n_args == 0) { + return grn_plugin_proc_alloc(ctx, user_data, GRN_DB_UINT32, GRN_OBJ_VECTOR); + } + + vector = grn_plugin_proc_alloc(ctx, + user_data, + args[0]->header.domain, + GRN_OBJ_VECTOR); + if (!vector) { + return NULL; + } + + for (i = 0; i < n_args; i++) { + grn_obj *element = args[i]; + switch (vector->header.type) { + case GRN_VECTOR : + grn_vector_add_element(ctx, + vector, + GRN_BULK_HEAD(element), + GRN_BULK_VSIZE(element), + 0, + element->header.domain); + break; + case GRN_UVECTOR : + grn_bulk_write(ctx, + vector, + GRN_BULK_HEAD(element), + GRN_BULK_VSIZE(element)); + break; + case GRN_PVECTOR : + GRN_PTR_PUT(ctx, vector, element); + break; + default : + break; + } + } + + return vector; +} + +grn_rc +GRN_PLUGIN_INIT(grn_ctx *ctx) +{ + return ctx->rc; 
+} + +grn_rc +GRN_PLUGIN_REGISTER(grn_ctx *ctx) +{ + grn_rc rc = GRN_SUCCESS; + + grn_proc_create(ctx, "vector_size", -1, GRN_PROC_FUNCTION, func_vector_size, + NULL, NULL, 0, NULL); + + grn_proc_create(ctx, "vector_slice", -1, GRN_PROC_FUNCTION, func_vector_slice, + NULL, NULL, 0, NULL); + + grn_proc_create(ctx, "vector_new", -1, GRN_PROC_FUNCTION, func_vector_new, + NULL, NULL, 0, NULL); + + return rc; +} + +grn_rc +GRN_PLUGIN_FIN(grn_ctx *ctx) +{ + return GRN_SUCCESS; +} diff --git a/storage/mroonga/vendor/groonga/plugins/functions/vector_sources.am b/storage/mroonga/vendor/groonga/plugins/functions/vector_sources.am new file mode 100644 index 00000000..1d98e651 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/functions/vector_sources.am @@ -0,0 +1,2 @@ +vector_la_SOURCES = \ + vector.c diff --git a/storage/mroonga/vendor/groonga/plugins/query_expanders/CMakeLists.txt b/storage/mroonga/vendor/groonga/plugins/query_expanders/CMakeLists.txt new file mode 100644 index 00000000..c2f04cb8 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/query_expanders/CMakeLists.txt @@ -0,0 +1,38 @@ +# Copyright(C) 2012-2013 Brazil +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License version 2.1 as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/../../lib + ) + +set(QUERY_EXPANDERS_DIR "${GRN_RELATIVE_PLUGINS_DIR}/query_expanders") +read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/tsv_sources.am TSV_SOURCES) +set_source_files_properties(${TSV_SOURCES} + PROPERTIES + COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}") +if(GRN_EMBED) + add_library(tsv_query_expander STATIC ${TSV_SOURCES}) + set_target_properties( + tsv_query_expander + PROPERTIES + POSITION_INDEPENDENT_CODE ON) +else() + add_library(tsv_query_expander MODULE ${TSV_SOURCES}) + set_target_properties(tsv_query_expander PROPERTIES + PREFIX "" + OUTPUT_NAME "tsv") + install(TARGETS tsv_query_expander DESTINATION "${QUERY_EXPANDERS_DIR}") +endif() +target_link_libraries(tsv_query_expander libgroonga) diff --git a/storage/mroonga/vendor/groonga/plugins/query_expanders/Makefile.am b/storage/mroonga/vendor/groonga/plugins/query_expanders/Makefile.am new file mode 100644 index 00000000..96c0911a --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/query_expanders/Makefile.am @@ -0,0 +1,20 @@ +EXTRA_DIST = \ + CMakeLists.txt + +AM_CPPFLAGS = \ + -I$(top_builddir) \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/lib + +AM_LDFLAGS = \ + -avoid-version \ + -module \ + -no-undefined + +LIBS = \ + $(top_builddir)/lib/libgroonga.la + +query_expander_plugins_LTLIBRARIES = +query_expander_plugins_LTLIBRARIES += tsv.la + +include tsv_sources.am diff --git a/storage/mroonga/vendor/groonga/plugins/query_expanders/tsv.c b/storage/mroonga/vendor/groonga/plugins/query_expanders/tsv.c new file mode 100644 index 00000000..5d5deec6 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/query_expanders/tsv.c @@ -0,0 +1,314 @@ +/* -*- c-basic-offset: 2 -*- */ +/* Copyright(C) 2012-2015 Brazil + + This library 
is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA +*/ + +#ifdef GRN_EMBEDDED +# define GRN_PLUGIN_FUNCTION_TAG query_expanders_tsv +#endif + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif /* HAVE_CONFIG_H */ + +#include <groonga/plugin.h> + +#include <stdlib.h> +#include <string.h> + +#ifdef WIN32 +# include <windows.h> +# include <share.h> +#endif /* WIN32 */ + +#define MAX_SYNONYM_BYTES 4096 + +static grn_hash *synonyms = NULL; + +#ifdef WIN32 +static char win32_synonyms_file[MAX_PATH] = ""; +const char * +get_system_synonyms_file(void) +{ + if (win32_synonyms_file[0] == '\0') { + const char *base_dir; + const char *relative_path = GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE; + size_t base_dir_length; + + base_dir = grn_plugin_windows_base_dir(); + base_dir_length = strlen(base_dir); + grn_strcpy(win32_synonyms_file, MAX_PATH, base_dir); + grn_strcat(win32_synonyms_file, MAX_PATH, "/"); + grn_strcat(win32_synonyms_file, MAX_PATH, relative_path); + } + return win32_synonyms_file; +} + +#else /* WIN32 */ +const char * +get_system_synonyms_file(void) +{ + return GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE; +} +#endif /* WIN32 */ + +static grn_bool +is_comment_mark(char character) +{ + return character == '#'; +} + +static grn_encoding +detect_coding_part(grn_ctx *ctx, const char *line, size_t line_length) +{ + grn_encoding encoding = GRN_ENC_NONE; + grn_obj 
null_terminated_line_buffer; + const char *c_line; + const char *coding_part_keyword = "coding: "; + const char *coding_part; + const char *encoding_name; + + GRN_TEXT_INIT(&null_terminated_line_buffer, 0); + GRN_TEXT_PUT(ctx, &null_terminated_line_buffer, line, line_length); + GRN_TEXT_PUTC(ctx, &null_terminated_line_buffer, '\0'); + + c_line = GRN_TEXT_VALUE(&null_terminated_line_buffer); + coding_part = strstr(c_line, coding_part_keyword); + if (coding_part) { + encoding_name = coding_part + strlen(coding_part_keyword); + if (grn_strncasecmp(encoding_name, "utf-8", strlen("utf-8")) == 0 || + grn_strncasecmp(encoding_name, "utf8", strlen("utf8")) == 0) { + encoding = GRN_ENC_UTF8; + } else if (grn_strncasecmp(encoding_name, "sjis", strlen("sjis")) == 0 || + grn_strncasecmp(encoding_name, "Shift_JIS", strlen("Shift_JIS")) == 0) { + encoding = GRN_ENC_SJIS; + } else if (grn_strncasecmp(encoding_name, "EUC-JP", strlen("EUC-JP")) == 0 || + grn_strncasecmp(encoding_name, "euc_jp", strlen("euc_jp")) == 0) { + encoding = GRN_ENC_EUC_JP; + } else if (grn_strncasecmp(encoding_name, "latin1", strlen("latin1")) == 0) { + encoding = GRN_ENC_LATIN1; + } else if (grn_strncasecmp(encoding_name, "KOI8-R", strlen("KOI8-R")) == 0 || + grn_strncasecmp(encoding_name, "koi8r", strlen("koi8r")) == 0) { + encoding = GRN_ENC_KOI8R; + } + } else { + encoding = ctx->encoding; + } + GRN_OBJ_FIN(ctx, &null_terminated_line_buffer); + + return encoding; +} + +static grn_encoding +guess_encoding(grn_ctx *ctx, const char **line, size_t *line_length) +{ + const char bom[] = {0xef, 0xbb, 0xbf}; + size_t bom_length = sizeof(bom); + + if (*line_length >= bom_length && memcmp(*line, bom, bom_length) == 0) { + *line += bom_length; + *line_length -= bom_length; + return GRN_ENC_UTF8; + } + + if (!is_comment_mark((*line)[0])) { + return ctx->encoding; + } + + return detect_coding_part(ctx, (*line) + 1, (*line_length) - 1); +} + +static void +parse_synonyms_file_line(grn_ctx *ctx, const char *line, 
size_t line_length, + grn_obj *key, grn_obj *value) +{ + size_t i = 0; + + if (is_comment_mark(line[i])) { + return; + } + + while (i < line_length) { + char character = line[i]; + i++; + if (character == '\t') { + break; + } + GRN_TEXT_PUTC(ctx, key, character); + } + + if (i == line_length) { + return; + } + + GRN_TEXT_PUTS(ctx, value, "(("); + while (i < line_length) { + char character = line[i]; + i++; + if (character == '\t') { + GRN_TEXT_PUTS(ctx, value, ") OR ("); + } else { + GRN_TEXT_PUTC(ctx, value, character); + } + } + GRN_TEXT_PUTS(ctx, value, "))"); + + { + grn_id id; + void *value_location = NULL; + + id = grn_hash_add(ctx, synonyms, GRN_TEXT_VALUE(key), GRN_TEXT_LEN(key), + &value_location, NULL); + if (id == GRN_ID_NIL) { + GRN_PLUGIN_LOG(ctx, GRN_LOG_WARNING, + "[plugin][query-expander][tsv] " + "failed to register key: <%.*s>", + (int)GRN_TEXT_LEN(key), GRN_TEXT_VALUE(key)); + return; + } + + if (GRN_TEXT_LEN(value) <= MAX_SYNONYM_BYTES - 1) { + GRN_TEXT_PUTC(ctx, value, '\0'); + } else { + grn_bulk_truncate(ctx, value, MAX_SYNONYM_BYTES - 1); + GRN_TEXT_PUTC(ctx, value, '\0'); + } + grn_memcpy(value_location, GRN_TEXT_VALUE(value), GRN_TEXT_LEN(value)); + } +} + +static void +load_synonyms(grn_ctx *ctx) +{ + static char path_env[GRN_ENV_BUFFER_SIZE]; + const char *path; + grn_file_reader *file_reader; + int number_of_lines; + grn_encoding encoding; + grn_obj line, key, value; + + grn_getenv("GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE", + path_env, + GRN_ENV_BUFFER_SIZE); + if (path_env[0]) { + path = path_env; + } else { + path = get_system_synonyms_file(); + } + file_reader = grn_file_reader_open(ctx, path); + if (!file_reader) { + GRN_LOG(ctx, GRN_LOG_WARNING, + "[plugin][query-expander][tsv] " + "synonyms file doesn't exist: <%s>", + path); + return; + } + + GRN_TEXT_INIT(&line, 0); + GRN_TEXT_INIT(&key, 0); + GRN_TEXT_INIT(&value, 0); + grn_bulk_reserve(ctx, &value, MAX_SYNONYM_BYTES); + number_of_lines = 0; + while (grn_file_reader_read_line(ctx, 
file_reader, &line) == GRN_SUCCESS) { + const char *line_value = GRN_TEXT_VALUE(&line); + size_t line_length = GRN_TEXT_LEN(&line); + + if (line_length > 0 && line_value[line_length - 1] == '\n') { + if (line_length > 1 && line_value[line_length - 2] == '\r') { + line_length -= 2; + } else { + line_length -= 1; + } + } + number_of_lines++; + if (number_of_lines == 1) { + encoding = guess_encoding(ctx, &line_value, &line_length); + } + GRN_BULK_REWIND(&key); + GRN_BULK_REWIND(&value); + parse_synonyms_file_line(ctx, line_value, line_length, &key, &value); + GRN_BULK_REWIND(&line); + } + GRN_OBJ_FIN(ctx, &line); + GRN_OBJ_FIN(ctx, &key); + GRN_OBJ_FIN(ctx, &value); + + grn_file_reader_close(ctx, file_reader); +} + +static grn_obj * +func_query_expander_tsv(grn_ctx *ctx, int nargs, grn_obj **args, + grn_user_data *user_data) +{ + grn_rc rc = GRN_END_OF_DATA; + grn_id id; + grn_obj *term, *expanded_term; + void *value; + grn_obj *rc_object; + + term = args[0]; + expanded_term = args[1]; + id = grn_hash_get(ctx, synonyms, + GRN_TEXT_VALUE(term), GRN_TEXT_LEN(term), + &value); + if (id != GRN_ID_NIL) { + const char *query = value; + GRN_TEXT_PUTS(ctx, expanded_term, query); + rc = GRN_SUCCESS; + } + + rc_object = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_INT32, 0); + if (rc_object) { + GRN_INT32_SET(ctx, rc_object, rc); + } + + return rc_object; +} + +grn_rc +GRN_PLUGIN_INIT(grn_ctx *ctx) +{ + if (!synonyms) { + synonyms = grn_hash_create(ctx, NULL, + GRN_TABLE_MAX_KEY_SIZE, + MAX_SYNONYM_BYTES, + GRN_OBJ_TABLE_HASH_KEY | GRN_OBJ_KEY_VAR_SIZE); + if (!synonyms) { + return ctx->rc; + } + load_synonyms(ctx); + } + return ctx->rc; +} + +grn_rc +GRN_PLUGIN_REGISTER(grn_ctx *ctx) +{ + grn_proc_create(ctx, "QueryExpanderTSV", strlen("QueryExpanderTSV"), + GRN_PROC_FUNCTION, + func_query_expander_tsv, NULL, NULL, + 0, NULL); + return GRN_SUCCESS; +} + +grn_rc +GRN_PLUGIN_FIN(grn_ctx *ctx) +{ + if (synonyms) { + grn_hash_close(ctx, synonyms); + synonyms = NULL; + } + return 
GRN_SUCCESS; +} diff --git a/storage/mroonga/vendor/groonga/plugins/query_expanders/tsv_sources.am b/storage/mroonga/vendor/groonga/plugins/query_expanders/tsv_sources.am new file mode 100644 index 00000000..f1bdabed --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/query_expanders/tsv_sources.am @@ -0,0 +1,2 @@ +tsv_la_SOURCES = \ + tsv.c diff --git a/storage/mroonga/vendor/groonga/plugins/ruby/CMakeLists.txt b/storage/mroonga/vendor/groonga/plugins/ruby/CMakeLists.txt new file mode 100644 index 00000000..a2bcccd1 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/ruby/CMakeLists.txt @@ -0,0 +1,24 @@ +# Copyright(C) 2013-2016 Brazil +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License version 2.1 as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +if(NOT GRN_EMBED) + if(GRN_WITH_MRUBY) + set(GRN_RELATIVE_RUBY_PLUGINS_DIR "${GRN_RELATIVE_PLUGINS_DIR}/ruby") + + read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/sources.am RUBY_SCRIPTS) + install(FILES ${RUBY_SCRIPTS} + DESTINATION "${GRN_RELATIVE_RUBY_PLUGINS_DIR}") + endif() +endif() diff --git a/storage/mroonga/vendor/groonga/plugins/ruby/Makefile.am b/storage/mroonga/vendor/groonga/plugins/ruby/Makefile.am new file mode 100644 index 00000000..a4949727 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/ruby/Makefile.am @@ -0,0 +1,9 @@ +EXTRA_DIST = \ + CMakeLists.txt + +if WITH_MRUBY +dist_ruby_plugins_DATA = \ + $(ruby_scripts) +endif + +include sources.am diff --git a/storage/mroonga/vendor/groonga/plugins/ruby/eval.rb b/storage/mroonga/vendor/groonga/plugins/ruby/eval.rb new file mode 100644 index 00000000..e7619cf2 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/ruby/eval.rb @@ -0,0 +1,36 @@ +module Groonga + module Ruby + class EvalCommand < Command + register("ruby_eval", + [ + "script", + ]) + + def run_body(input) + script = input[:script] + unless script.is_a?(String) + message = "script must be a string: <#{script.inspect}>" + raise Groonga::InvalidArgument, message + end + + eval_context = EvalContext.new + begin + result = eval_context.eval(script) + rescue Exception => error + writer.map("result", 1) do + writer.write("exception") + writer.map("exception", 1) do + writer.write("message") + writer.write(error.message) + end + end + else + writer.map("result", 1) do + writer.write("value") + writer.write(result) + end + end + end + end + end +end diff --git a/storage/mroonga/vendor/groonga/plugins/ruby/sources.am b/storage/mroonga/vendor/groonga/plugins/ruby/sources.am new file mode 100644 index 00000000..f8938291 
--- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/ruby/sources.am @@ -0,0 +1,2 @@ +ruby_scripts = \ + eval.rb diff --git a/storage/mroonga/vendor/groonga/plugins/ruby_scripts.am b/storage/mroonga/vendor/groonga/plugins/ruby_scripts.am new file mode 100644 index 00000000..0262dbb9 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/ruby_scripts.am @@ -0,0 +1,2 @@ +ruby_scripts = \ + sharding.rb diff --git a/storage/mroonga/vendor/groonga/plugins/sharding.rb b/storage/mroonga/vendor/groonga/plugins/sharding.rb new file mode 100644 index 00000000..86401c1f --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/sharding.rb @@ -0,0 +1,11 @@ +require "sharding/parameters" +require "sharding/range_expression_builder" +require "sharding/logical_enumerator" + +require "sharding/logical_parameters" + +require "sharding/logical_count" +require "sharding/logical_range_filter" +require "sharding/logical_select" +require "sharding/logical_shard_list" +require "sharding/logical_table_remove" diff --git a/storage/mroonga/vendor/groonga/plugins/sharding/CMakeLists.txt b/storage/mroonga/vendor/groonga/plugins/sharding/CMakeLists.txt new file mode 100644 index 00000000..1131520f --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/sharding/CMakeLists.txt @@ -0,0 +1,24 @@ +# Copyright(C) 2015 Brazil +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License version 2.1 as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +if(NOT GRN_EMBED) + if(GRN_WITH_MRUBY) + set(GRN_RELATIVE_SHARDING_PLUGINS_DIR "${GRN_RELATIVE_PLUGINS_DIR}/sharding") + + read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/sources.am SHARDING_SCRIPTS) + install(FILES ${SHARDING_SCRIPTS} + DESTINATION "${GRN_RELATIVE_SHARDING_PLUGINS_DIR}") + endif() +endif() diff --git a/storage/mroonga/vendor/groonga/plugins/sharding/Makefile.am b/storage/mroonga/vendor/groonga/plugins/sharding/Makefile.am new file mode 100644 index 00000000..8104ab6d --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/sharding/Makefile.am @@ -0,0 +1,9 @@ +EXTRA_DIST = \ + CMakeLists.txt + +if WITH_MRUBY +dist_sharding_plugins_DATA = \ + $(sharding_scripts) +endif + +include sources.am diff --git a/storage/mroonga/vendor/groonga/plugins/sharding/logical_count.rb b/storage/mroonga/vendor/groonga/plugins/sharding/logical_count.rb new file mode 100644 index 00000000..8bdd77ef --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/sharding/logical_count.rb @@ -0,0 +1,169 @@ +module Groonga + module Sharding + class LogicalCountCommand < Command + register("logical_count", + [ + "logical_table", + "shard_key", + "min", + "min_border", + "max", + "max_border", + "filter", + ]) + + def run_body(input) + enumerator = LogicalEnumerator.new("logical_count", input) + filter = input[:filter] + + total = 0 + enumerator.each do |shard, shard_range| + total += count_n_records(filter, shard, shard_range, + enumerator.target_range) + end + writer.write(total) + end + + private + def cache_key(input) + key = "logical_count\0" + key << "#{input[:logical_table]}\0" + key << "#{input[:shard_key]}\0" + key << "#{input[:min]}\0" + key << "#{input[:min_border]}\0" + key << "#{input[:max]}\0" + key << "#{input[:max_border]}\0" + key << 
"#{input[:filter]}\0" + key + end + + def log_use_range_index(use, table_name, line, method) + message = "[logical_count]" + if use + message << "[range-index]" + else + message << "[select]" + end + message << " <#{table_name}>" + Context.instance.logger.log(Logger::Level::DEBUG, + __FILE__, + line, + method.to_s, + message) + end + + def count_n_records(filter, shard, shard_range, target_range) + cover_type = target_range.cover_type(shard_range) + return 0 if cover_type == :none + + shard_key = shard.key + if shard_key.nil? + message = "[logical_count] shard_key doesn't exist: " + + "<#{shard.key_name}>" + raise InvalidArgument, message + end + table = shard.table + table_name = shard.table_name + + expression_builder = RangeExpressionBuilder.new(shard_key, + target_range) + expression_builder.filter = filter + if cover_type == :all + log_use_range_index(false, table_name, __LINE__, __method__) + if filter.nil? + return table.size + else + return filtered_count_n_records(table) do |expression| + expression_builder.build_all(expression) + end + end + end + + range_index = nil + if filter.nil? + index_info = shard_key.find_index(Operator::LESS) + if index_info + range_index = index_info.index + end + end + + use_range_index = (!range_index.nil?) 
+ log_use_range_index(use_range_index, table_name, __LINE__, __method__) + + case cover_type + when :partial_min + if range_index + count_n_records_in_range(range_index, + target_range.min, target_range.min_border, + nil, nil) + else + filtered_count_n_records(table) do |expression| + expression_builder.build_partial_min(expression) + end + end + when :partial_max + if range_index + count_n_records_in_range(range_index, + nil, nil, + target_range.max, target_range.max_border) + else + filtered_count_n_records(table) do |expression| + expression_builder.build_partial_max(expression) + end + end + when :partial_min_and_max + if range_index + count_n_records_in_range(range_index, + target_range.min, target_range.min_border, + target_range.max, target_range.max_border) + else + filtered_count_n_records(table) do |expression| + expression_builder.build_partial_min_and_max(expression) + end + end + end + end + + def filtered_count_n_records(table) + expression = nil + filtered_table = nil + + begin + expression = Expression.create(table) + yield(expression) + filtered_table = table.select(expression) + filtered_table.size + ensure + filtered_table.close if filtered_table + expression.close if expression + end + end + + def count_n_records_in_range(range_index, + min, min_border, max, max_border) + flags = TableCursorFlags::BY_KEY + case min_border + when :include + flags |= TableCursorFlags::GE + when :exclude + flags |= TableCursorFlags::GT + end + case max_border + when :include + flags |= TableCursorFlags::LE + when :exclude + flags |= TableCursorFlags::LT + end + + TableCursor.open(range_index.table, + :min => min, + :max => max, + :flags => flags) do |table_cursor| + IndexCursor.open(table_cursor, range_index) do |index_cursor| + index_cursor.count + end + end + end + end + end +end diff --git a/storage/mroonga/vendor/groonga/plugins/sharding/logical_enumerator.rb b/storage/mroonga/vendor/groonga/plugins/sharding/logical_enumerator.rb new file mode 100644 index 
00000000..d05a220f --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/sharding/logical_enumerator.rb @@ -0,0 +1,317 @@ +module Groonga + module Sharding + class LogicalEnumerator + include Enumerable + + attr_reader :target_range + attr_reader :logical_table + attr_reader :shard_key_name + def initialize(command_name, input, options={}) + @command_name = command_name + @input = input + @options = options + initialize_parameters + end + + def each(&block) + each_internal(:ascending, &block) + end + + def reverse_each(&block) + each_internal(:descending, &block) + end + + private + def each_internal(order) + context = Context.instance + each_shard_with_around(order) do |prev_shard, current_shard, next_shard| + shard_range_data = current_shard.range_data + shard_range = nil + + if shard_range_data.day.nil? + if order == :ascending + if next_shard + next_shard_range_data = next_shard.range_data + else + next_shard_range_data = nil + end + else + if prev_shard + next_shard_range_data = prev_shard.range_data + else + next_shard_range_data = nil + end + end + max_day = compute_month_shard_max_day(shard_range_data.year, + shard_range_data.month, + next_shard_range_data) + shard_range = MonthShardRange.new(shard_range_data.year, + shard_range_data.month, + max_day) + else + shard_range = DayShardRange.new(shard_range_data.year, + shard_range_data.month, + shard_range_data.day) + end + + yield(current_shard, shard_range) + end + end + + def each_shard_with_around(order) + context = Context.instance + prefix = "#{@logical_table}_" + + shards = [nil] + context.database.each_name(:prefix => prefix, + :order_by => :key, + :order => order) do |name| + shard_range_raw = name[prefix.size..-1] + + case shard_range_raw + when /\A(\d{4})(\d{2})\z/ + shard_range_data = ShardRangeData.new($1.to_i, $2.to_i, nil) + when /\A(\d{4})(\d{2})(\d{2})\z/ + shard_range_data = ShardRangeData.new($1.to_i, $2.to_i, $3.to_i) + else + next + end + + shards << Shard.new(name, @shard_key_name, 
shard_range_data) + next if shards.size < 3 + yield(*shards) + shards.shift + end + + if shards.size == 2 + yield(shards[0], shards[1], nil) + end + end + + private + def initialize_parameters + @logical_table = @input[:logical_table] + if @logical_table.nil? + raise InvalidArgument, "[#{@command_name}] logical_table is missing" + end + + @shard_key_name = @input[:shard_key] + if @shard_key_name.nil? + require_shard_key = @options[:require_shard_key] + require_shard_key = true if require_shard_key.nil? + if require_shard_key + raise InvalidArgument, "[#{@command_name}] shard_key is missing" + end + end + + @target_range = TargetRange.new(@command_name, @input) + end + + def compute_month_shard_max_day(year, month, next_shard_range) + return nil if next_shard_range.nil? + + return nil if month != next_shard_range.month + + next_shard_range.day + end + + class Shard + attr_reader :table_name, :key_name, :range_data + def initialize(table_name, key_name, range_data) + @table_name = table_name + @key_name = key_name + @range_data = range_data + end + + def table + @table ||= Context.instance[@table_name] + end + + def full_key_name + "#{@table_name}.#{@key_name}" + end + + def key + @key ||= Context.instance[full_key_name] + end + end + + class ShardRangeData + attr_reader :year, :month, :day + def initialize(year, month, day) + @year = year + @month = month + @day = day + end + + def to_suffix + if @day.nil? 
+ "_%04d%02d" % [@year, @month] + else + "_%04d%02d%02d" % [@year, @month, @day] + end + end + end + + class DayShardRange + attr_reader :year, :month, :day + def initialize(year, month, day) + @year = year + @month = month + @day = day + end + + def least_over_time + next_day = Time.local(@year, @month, @day) + (60 * 60 * 24) + while next_day.day == @day # For leap second + next_day += 1 + end + next_day + end + + def min_time + Time.local(@year, @month, @day) + end + + def include?(time) + @year == time.year and + @month == time.month and + @day == time.day + end + end + + class MonthShardRange + attr_reader :year, :month, :max_day + def initialize(year, month, max_day) + @year = year + @month = month + @max_day = max_day + end + + def least_over_time + if @max_day.nil? + if @month == 12 + Time.local(@year + 1, 1, 1) + else + Time.local(@year, @month + 1, 1) + end + else + Time.local(@year, @month, @max_day) + end + end + + def min_time + Time.local(@year, @month, 1) + end + + def include?(time) + return false unless @year == time.year + return false unless @month == time.month + + if @max_day.nil? + true + else + time.day <= @max_day + end + end + end + + class TargetRange + attr_reader :min, :min_border + attr_reader :max, :max_border + def initialize(command_name, input) + @command_name = command_name + @input = input + @min = parse_value(:min) + @min_border = parse_border(:min_border) + @max = parse_value(:max) + @max_border = parse_border(:max_border) + end + + def cover_type(shard_range) + return :all if @min.nil? and @max.nil? 
+ + if @min and @max + return :none unless in_min?(shard_range) + return :none unless in_max?(shard_range) + min_partial_p = in_min_partial?(shard_range) + max_partial_p = in_max_partial?(shard_range) + if min_partial_p and max_partial_p + :partial_min_and_max + elsif min_partial_p + :partial_min + elsif max_partial_p + :partial_max + else + :all + end + elsif @min + return :none unless in_min?(shard_range) + if in_min_partial?(shard_range) + :partial_min + else + :all + end + else + return :none unless in_max?(shard_range) + if in_max_partial?(shard_range) + :partial_max + else + :all + end + end + end + + private + def parse_value(name) + value = @input[name] + return nil if value.nil? + + Converter.convert(value, Time) + end + + def parse_border(name) + border = @input[name] + return :include if border.nil? + + case border + when "include" + :include + when "exclude" + :exclude + else + message = + "[#{@command_name}] #{name} must be \"include\" or \"exclude\": " + + "<#{border}>" + raise InvalidArgument, message + end + end + + def in_min?(shard_range) + @min < shard_range.least_over_time + end + + def in_min_partial?(shard_range) + return false unless shard_range.include?(@min) + + return true if @min_border == :exclude + + shard_range.min_time != @min + end + + def in_max?(shard_range) + max_base_time = shard_range.min_time + if @max_border == :include + @max >= max_base_time + else + @max > max_base_time + end + end + + def in_max_partial?(shard_range) + shard_range.include?(@max) + end + end + end + end +end diff --git a/storage/mroonga/vendor/groonga/plugins/sharding/logical_parameters.rb b/storage/mroonga/vendor/groonga/plugins/sharding/logical_parameters.rb new file mode 100644 index 00000000..75ff569b --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/sharding/logical_parameters.rb @@ -0,0 +1,44 @@ +module Groonga + module Sharding + class LogicalParametersCommand < Command + register("logical_parameters", + [ + "range_index", + ]) + + def 
run_body(input) + range_index = parse_range_index(input[:range_index]) + + parameters = [ + :range_index, + ] + writer.map("parameters", parameters.size) do + parameters.each do |name| + writer.write(name.to_s) + writer.write(Parameters.__send__(name)) + end + end + + Parameters.range_index = range_index if range_index + end + + private + def parse_range_index(value) + case value + when nil + nil + when "auto" + :auto + when "always" + :always + when "never" + :never + else + message = "[logical_parameters][range_index] " + message << "must be auto, always or never: <#{value}>" + raise InvalidArgument, message + end + end + end + end +end diff --git a/storage/mroonga/vendor/groonga/plugins/sharding/logical_range_filter.rb b/storage/mroonga/vendor/groonga/plugins/sharding/logical_range_filter.rb new file mode 100644 index 00000000..1c8f8644 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/sharding/logical_range_filter.rb @@ -0,0 +1,642 @@ +module Groonga + module Sharding + class LogicalRangeFilterCommand < Command + register("logical_range_filter", + [ + "logical_table", + "shard_key", + "min", + "min_border", + "max", + "max_border", + "order", + "filter", + "offset", + "limit", + "output_columns", + "use_range_index", + ]) + + def run_body(input) + output_columns = input[:output_columns] || "_key, *" + + context = ExecuteContext.new(input) + begin + executor = Executor.new(context) + executor.execute + + result_sets = context.result_sets + n_elements = 1 # for columns + result_sets.each do |result_set| + n_elements += result_set.size + end + + writer.array("RESULTSET", n_elements) do + first_result_set = result_sets.first + if first_result_set + writer.write_table_columns(first_result_set, output_columns) + end + limit = context.limit + if limit < 0 + n_records = result_sets.inject(0) do |n, result_set| + n + result_set.size + end + limit = n_records + limit + 1 + end + options = {} + result_sets.each do |result_set| + options[:limit] = limit + 
writer.write_table_records(result_set, output_columns, options) + limit -= result_set.size + break if limit <= 0 + end + end + ensure + context.close + end + end + + private + def cache_key(input) + key = "logical_range_filter\0" + key << "#{input[:logical_table]}\0" + key << "#{input[:shard_key]}\0" + key << "#{input[:min]}\0" + key << "#{input[:min_border]}\0" + key << "#{input[:max]}\0" + key << "#{input[:max_border]}\0" + key << "#{input[:order]}\0" + key << "#{input[:filter]}\0" + key << "#{input[:offset]}\0" + key << "#{input[:limit]}\0" + key << "#{input[:output_columns]}\0" + key << "#{input[:use_range_index]}\0" + key + end + + class ExecuteContext + attr_reader :use_range_index + attr_reader :enumerator + attr_reader :order + attr_reader :filter + attr_reader :offset + attr_reader :limit + attr_accessor :current_offset + attr_accessor :current_limit + attr_reader :result_sets + attr_reader :unsorted_result_sets + attr_reader :threshold + def initialize(input) + @input = input + @use_range_index = parse_use_range_index(@input[:use_range_index]) + @enumerator = LogicalEnumerator.new("logical_range_filter", @input) + @order = parse_order(@input, :order) + @filter = @input[:filter] + @offset = (@input[:offset] || 0).to_i + @limit = (@input[:limit] || 10).to_i + + @current_offset = @offset + @current_limit = @limit + + @result_sets = [] + @unsorted_result_sets = [] + + @threshold = compute_threshold + end + + def close + @unsorted_result_sets.each do |result_set| + result_set.close if result_set.temporary? + end + @result_sets.each do |result_set| + result_set.close if result_set.temporary? + end + end + + private + def parse_use_range_index(use_range_index) + case use_range_index + when "yes" + true + when "no" + false + else + nil + end + end + + def parse_order(input, name) + order = input[name] + return :ascending if order.nil? 
+ + case order + when "ascending" + :ascending + when "descending" + :descending + else + message = + "[logical_range_filter] #{name} must be " + + "\"ascending\" or \"descending\": <#{order}>" + raise InvalidArgument, message + end + end + + def compute_threshold + threshold_env = ENV["GRN_LOGICAL_RANGE_FILTER_THRESHOLD"] + default_threshold = 0.2 + (threshold_env || default_threshold).to_f + end + end + + class Executor + def initialize(context) + @context = context + end + + def execute + first_shard = nil + enumerator = @context.enumerator + target_range = enumerator.target_range + if @context.order == :descending + each_method = :reverse_each + else + each_method = :each + end + enumerator.send(each_method) do |shard, shard_range| + first_shard ||= shard + shard_executor = ShardExecutor.new(@context, shard, shard_range) + shard_executor.execute + break if @context.current_limit == 0 + end + if first_shard.nil? + message = + "[logical_range_filter] no shard exists: " + + "logical_table: <#{enumerator.logical_table}>: " + + "shard_key: <#{enumerator.shard_key_name}>" + raise InvalidArgument, message + end + if @context.result_sets.empty? + result_set = HashTable.create(:flags => ObjectFlags::WITH_SUBREC, + :key_type => first_shard.table) + @context.result_sets << result_set + end + end + end + + class ShardExecutor + def initialize(context, shard, shard_range) + @context = context + @shard = shard + @shard_range = shard_range + + @filter = @context.filter + @result_sets = @context.result_sets + @unsorted_result_sets = @context.unsorted_result_sets + + @target_range = @context.enumerator.target_range + + @cover_type = @target_range.cover_type(@shard_range) + end + + def execute + return if @cover_type == :none + return if @shard.table.empty? + + shard_key = @shard.key + if shard_key.nil? 
+ message = "[logical_range_filter] shard_key doesn't exist: " + + "<#{@shard.key_name}>" + raise InvalidArgument, message + end + + expression_builder = RangeExpressionBuilder.new(shard_key, + @target_range) + expression_builder.filter = @filter + + index_info = shard_key.find_index(Operator::LESS) + if index_info + range_index = index_info.index + unless use_range_index?(range_index, expression_builder) + range_index = nil + end + else + range_index = nil + end + + execute_filter(range_index, expression_builder) + end + + private + def decide_use_range_index(use, reason, line, method) + message = "[logical_range_filter]" + if use + message << "[range-index] " + else + message << "[select] " + end + message << "<#{@shard.table_name}>: " + message << reason + Context.instance.logger.log(Logger::Level::DEBUG, + __FILE__, + line, + method.to_s, + message) + + use + end + + def use_range_index?(range_index, expression_builder) + use_range_index_parameter_message = + "force by use_range_index parameter" + case @context.use_range_index + when true + return decide_use_range_index(true, + use_range_index_parameter_message, + __LINE__, __method__) + when false + return decide_use_range_index(false, + use_range_index_parameter_message, + __LINE__, __method__) + end + + range_index_logical_parameter_message = + "force by range_index logical parameter" + case Parameters.range_index + when :always + return decide_use_range_index(true, + range_index_logical_parameter_message, + __LINE__, __method__) + when :never + return decide_use_range_index(false, + range_index_logical_parameter_message, + __LINE__, __method__) + end + + current_limit = @context.current_limit + if current_limit < 0 + reason = "limit is negative: <#{current_limit}>" + return decide_use_range_index(false, reason, + __LINE__, __method__) + end + + required_n_records = @context.current_offset + current_limit + max_n_records = @shard.table.size + if max_n_records <= required_n_records + reason = "the number of 
required records (#{required_n_records}) " + reason << ">= " + reason << "the number of records in shard (#{max_n_records})" + return decide_use_range_index(false, reason, + __LINE__, __method__) + end + + threshold = @context.threshold + if threshold <= 0.0 + reason = "threshold is negative: <#{threshold}>" + return decide_use_range_index(true, reason, + __LINE__, __method__) + end + if threshold >= 1.0 + reason = "threshold (#{threshold}) >= 1.0" + return decide_use_range_index(false, reason, + __LINE__, __method__) + end + + table = @shard.table + estimated_n_records = 0 + case @cover_type + when :all + if @filter + create_expression(table) do |expression| + expression_builder.build_all(expression) + unless range_index_available_expression?(expression, + __LINE__, __method__) + return false + end + estimated_n_records = expression.estimate_size(table) + end + else + estimated_n_records = max_n_records + end + when :partial_min + create_expression(table) do |expression| + expression_builder.build_partial_min(expression) + unless range_index_available_expression?(expression, + __LINE__, __method__) + return false + end + estimated_n_records = expression.estimate_size(table) + end + when :partial_max + create_expression(table) do |expression| + expression_builder.build_partial_max(expression) + unless range_index_available_expression?(expression, + __LINE__, __method__) + return false + end + estimated_n_records = expression.estimate_size(table) + end + when :partial_min_and_max + create_expression(table) do |expression| + expression_builder.build_partial_min_and_max(expression) + unless range_index_available_expression?(expression, + __LINE__, __method__) + return false + end + estimated_n_records = expression.estimate_size(table) + end + end + + if estimated_n_records <= required_n_records + reason = "the number of required records (#{required_n_records}) " + reason << ">= " + reason << "the number of estimated records (#{estimated_n_records})" + return 
decide_use_range_index(false, reason, + __LINE__, __method__) + end + + hit_ratio = estimated_n_records / max_n_records.to_f + use_range_index_by_hit_ratio = (hit_ratio >= threshold) + if use_range_index_by_hit_ratio + relation = ">=" + else + relation = "<" + end + reason = "hit ratio " + reason << "(#{hit_ratio}=#{estimated_n_records}/#{max_n_records}) " + reason << "#{relation} threshold (#{threshold})" + decide_use_range_index(use_range_index_by_hit_ratio, reason, + __LINE__, __method__) + end + + def range_index_available_expression?(expression, line, method_name) + nested_reference_vector_column_accessor = + find_nested_reference_vector_column_accessor(expression) + if nested_reference_vector_column_accessor + reason = "nested reference vector column accessor can't be used: " + reason << "<#{nested_reference_vector_column_accessor.name}>" + return decide_use_range_index(false, reason, line, method_name) + end + + selector_only_procedure = find_selector_only_procedure(expression) + if selector_only_procedure + reason = "selector only procedure can't be used: " + reason << "<#{selector_only_procedure.name}>" + return decide_use_range_index(false, reason, line, method_name) + end + + true + end + + def find_nested_reference_vector_column_accessor(expression) + expression.codes.each do |code| + value = code.value + next unless value.is_a?(Accessor) + + sub_accessor = value + while sub_accessor.have_next? + object = sub_accessor.object + return value if object.is_a?(Column) and object.vector? + sub_accessor = sub_accessor.next + end + end + nil + end + + def find_selector_only_procedure(expression) + expression.codes.each do |code| + value = code.value + return value if value.is_a?(Procedure) and value.selector_only? 
+          end
+          nil
+        end
+
+        # Dispatches on @cover_type. For the partial cover types the range
+        # index is searched with the matching min/max bounds when
+        # range_index is given; otherwise the shard table is filtered with
+        # the expression built by expression_builder.
+        def execute_filter(range_index, expression_builder)
+          case @cover_type
+          when :all
+            filter_shard_all(range_index, expression_builder)
+          when :partial_min
+            if range_index
+              filter_by_range(range_index, expression_builder,
+                              @target_range.min, @target_range.min_border,
+                              nil, nil)
+            else
+              filter_table do |expression|
+                expression_builder.build_partial_min(expression)
+              end
+            end
+          when :partial_max
+            if range_index
+              filter_by_range(range_index, expression_builder,
+                              nil, nil,
+                              @target_range.max, @target_range.max_border)
+            else
+              filter_table do |expression|
+                expression_builder.build_partial_max(expression)
+              end
+            end
+          when :partial_min_and_max
+            if range_index
+              filter_by_range(range_index, expression_builder,
+                              @target_range.min, @target_range.min_border,
+                              @target_range.max, @target_range.max_border)
+            else
+              filter_table do |expression|
+                expression_builder.build_partial_min_and_max(expression)
+              end
+            end
+          end
+        end
+
+        # Handles a shard that is entirely inside the target range.
+        def filter_shard_all(range_index, expression_builder)
+          table = @shard.table
+          if @filter.nil?
+            # Without a filter every record matches, so a shard that is
+            # not larger than the remaining offset is skipped as a whole.
+            if table.size <= @context.current_offset
+              @context.current_offset -= table.size
+              return
+            end
+            if range_index
+              filter_by_range(range_index, expression_builder,
+                              nil, nil,
+                              nil, nil)
+            else
+              sort_result_set(table)
+            end
+          else
+            if range_index
+              filter_by_range(range_index, expression_builder,
+                              nil, nil,
+                              nil, nil)
+            else
+              filter_table do |expression|
+                expression_builder.build_all(expression)
+              end
+            end
+          end
+        end
+
+        # Yields a new Expression bound to table and closes it when the
+        # block finishes, including when the block raises.
+        def create_expression(table)
+          expression = Expression.create(table)
+          begin
+            yield(expression)
+          ensure
+            expression.close
+          end
+        end
+
+        # Collects matching records by walking range_index between min
+        # and max. Records are selected into a new hash table whose key
+        # type is the index's range table.
+        def filter_by_range(range_index, expression_builder,
+                            min, min_border, max, max_border)
+          lexicon = range_index.domain
+          data_table = range_index.range
+          flags = build_range_search_flags(min_border, max_border)
+
+          result_set = HashTable.create(:flags => ObjectFlags::WITH_SUBREC,
+                                        :key_type => data_table)
+          n_matched_records = 0
+          begin
+            TableCursor.open(lexicon,
+                             :min => min,
+                             :max => max,
+                             :flags => flags) do |table_cursor|
+              options = {
+                :offset => @context.current_offset,
+              }
+              current_limit = @context.current_limit
+              # A negative limit is replaced by the data table size.
+              if current_limit < 0
+                options[:limit] = data_table.size
+              else
+                options[:limit] = current_limit
+              end
+              max_n_unmatched_records =
+                compute_max_n_unmatched_records(data_table.size,
+                                                options[:limit])
+              options[:max_n_unmatched_records] = max_n_unmatched_records
+              if @filter
+                create_expression(data_table) do |expression|
+                  expression.parse(@filter)
+                  options[:expression] = expression
+                  IndexCursor.open(table_cursor, range_index) do |index_cursor|
+                    n_matched_records = index_cursor.select(result_set, options)
+                  end
+                end
+              else
+                IndexCursor.open(table_cursor, range_index) do |index_cursor|
+                  n_matched_records = index_cursor.select(result_set, options)
+                end
+              end
+              # -1 is treated as "too many unmatched records": the partial
+              # result is discarded and the non-index path is used instead.
+              if n_matched_records == -1
+                result_set.close
+                fallback_message =
+                  "fallback because there are too much unmatched records: "
+                fallback_message << "<#{max_n_unmatched_records}>"
+                decide_use_range_index(false,
+                                       fallback_message,
+                                       __LINE__, __method__)
+                execute_filter(nil, expression_builder)
+                return
+              end
+            end
+          rescue
+            result_set.close
+            raise
+          end
+
+          # Everything that matched is consumed by the remaining offset.
+          if n_matched_records <= @context.current_offset
+            @context.current_offset -= n_matched_records
+            result_set.close
+            return
+          end
+
+          if @context.current_offset > 0
+            @context.current_offset = 0
+          end
+          if @context.current_limit > 0
+            @context.current_limit -= result_set.size
+          end
+          @result_sets << result_set
+        end
+
+        # Builds the TableCursorFlags value for a key-ordered cursor from
+        # @context.order and the :include/:exclude borders. Any other
+        # border value adds no comparison flag.
+        def build_range_search_flags(min_border, max_border)
+          flags = TableCursorFlags::BY_KEY
+          case @context.order
+          when :ascending
+            flags |= TableCursorFlags::ASCENDING
+          when :descending
+            flags |= TableCursorFlags::DESCENDING
+          end
+          case min_border
+          when :include
+            flags |= TableCursorFlags::GE
+          when :exclude
+            flags |= TableCursorFlags::GT
+          end
+          case max_border
+          when :include
+            flags |= TableCursorFlags::LE
+          when :exclude
+            flags |= TableCursorFlags::LT
+          end
+          flags
+        end
+
+        # Returns the smaller of limit * 100 and a sample size. The sample
+        # size is data_table_size itself up to 10000 records; above that
+        # it is data_table_size / (ln(data_table_size) ** 2), rounded up.
+        def compute_max_n_unmatched_records(data_table_size, limit)
+          max_n_unmatched_records = limit * 100
+          max_n_sample_records = data_table_size
+          if max_n_sample_records > 10000
+            sample_ratio = 1 / (Math.log(data_table_size) ** 2)
+            max_n_sample_records = (max_n_sample_records * sample_ratio).ceil
+          end
+          if max_n_unmatched_records > max_n_sample_records
+            max_n_unmatched_records = max_n_sample_records
+          end
+          max_n_unmatched_records
+        end
+
+        # Yields an expression for the shard table so the caller can build
+        # the condition, then selects with it and hands the result to
+        # sort_result_set.
+        def filter_table
+          table = @shard.table
+          create_expression(table) do |expression|
+            yield(expression)
+            result_set = table.select(expression)
+            sort_result_set(result_set)
+          end
+        end
+
+        # Sorts result_set by the shard key and appends the sorted table
+        # to @result_sets, consuming the context's offset and limit.
+        def sort_result_set(result_set)
+          if result_set.empty?
+            result_set.close if result_set.temporary?
+            return
+          end
+
+          if result_set.size <= @context.current_offset
+            @context.current_offset -= result_set.size
+            result_set.close if result_set.temporary?
+            return
+          end
+
+          @unsorted_result_sets << result_set if result_set.temporary?
+          sort_keys = [
+            {
+              :key => @context.enumerator.shard_key_name,
+              :order => @context.order,
+            },
+          ]
+          # A non-positive current limit means the whole result set is
+          # sorted and kept.
+          if @context.current_limit > 0
+            limit = @context.current_limit
+          else
+            limit = result_set.size
+          end
+          sorted_result_set = result_set.sort(sort_keys,
+                                              :offset => @context.current_offset,
+                                              :limit => limit)
+          @result_sets << sorted_result_set
+          # The offset has been applied by the sort above.
+          if @context.current_offset > 0
+            @context.current_offset = 0
+          end
+          if @context.current_limit > 0
+            @context.current_limit -= sorted_result_set.size
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/storage/mroonga/vendor/groonga/plugins/sharding/logical_select.rb b/storage/mroonga/vendor/groonga/plugins/sharding/logical_select.rb
new file mode 100644
index 00000000..07ebf9e8
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/plugins/sharding/logical_select.rb
@@ -0,0 +1,975 @@
+module Groonga
+  module Sharding
+    class LogicalSelectCommand < Command
+      register("logical_select",
+               [
+                 "logical_table",
+                 "shard_key",
+                 "min",
+                 "min_border",
+                 "max",
+                 "max_border",
+                 "filter",
+                 # Deprecated since 6.1.5. Use sort_keys instead.
+                 "sortby",
+                 "output_columns",
+                 "offset",
+                 "limit",
+                 "drilldown",
+                 # Deprecated since 6.1.5. Use drilldown_sort_keys instead.
+ "drilldown_sortby", + "drilldown_output_columns", + "drilldown_offset", + "drilldown_limit", + "drilldown_calc_types", + "drilldown_calc_target", + "sort_keys", + "drilldown_sort_keys", + "match_columns", + "query", + "drilldown_filter", + ]) + + def run_body(input) + context = ExecuteContext.new(input) + begin + executor = Executor.new(context) + executor.execute + + n_results = 1 + n_plain_drilldowns = context.plain_drilldown.n_result_sets + n_labeled_drilldowns = context.labeled_drilldowns.n_result_sets + if n_plain_drilldowns > 0 + n_results += n_plain_drilldowns + elsif + if n_labeled_drilldowns > 0 + n_results += 1 + end + end + + writer.array("RESULT", n_results) do + write_records(writer, context) + if n_plain_drilldowns > 0 + write_plain_drilldowns(writer, context) + elsif n_labeled_drilldowns > 0 + write_labeled_drilldowns(writer, context) + end + end + ensure + context.close + end + end + + private + def cache_key(input) + sort_keys = input[:sort_keys] || input[:sortby] + drilldown_sort_keys = + input[:drilldown_sort_keys] || input[:drilldown_sortby] + key = "logical_select\0" + key << "#{input[:logical_table]}\0" + key << "#{input[:shard_key]}\0" + key << "#{input[:min]}\0" + key << "#{input[:min_border]}\0" + key << "#{input[:max]}\0" + key << "#{input[:max_border]}\0" + key << "#{input[:filter]}\0" + key << "#{sort_keys}\0" + key << "#{input[:output_columns]}\0" + key << "#{input[:offset]}\0" + key << "#{input[:limit]}\0" + key << "#{input[:drilldown]}\0" + key << "#{drilldown_sort_keys}\0" + key << "#{input[:match_columns]}\0" + key << "#{input[:query]}\0" + key << "#{input[:drilldown_output_columns]}\0" + key << "#{input[:drilldown_offset]}\0" + key << "#{input[:drilldown_limit]}\0" + key << "#{input[:drilldown_calc_types]}\0" + key << "#{input[:drilldown_calc_target]}\0" + key << "#{input[:drilldown_filter]}\0" + labeled_drilldowns = LabeledDrilldowns.parse(input).sort_by(&:label) + labeled_drilldowns.each do |drilldown| + key << 
"#{drilldown.label}\0" + key << "#{drilldown.keys.join(',')}\0" + key << "#{drilldown.output_columns}\0" + key << "#{drilldown.offset}\0" + key << "#{drilldown.limit}\0" + key << "#{drilldown.calc_types}\0" + key << "#{drilldown.calc_target_name}\0" + key << "#{drilldown.filter}\0" + cache_key_dynamic_columns(key, drilldown.dynamic_columns) + end + dynamic_columns = DynamicColumns.parse(input) + cache_key_dynamic_columns(key, dynamic_columns) + key + end + + def cache_key_dynamic_columns(key, dynamic_columns) + [ + :initial, + :filtered, + :output + ].each do |stage| + target_dynamic_columns = dynamic_columns.__send__("each_#{stage}").to_a + target_dynamic_columns.sort_by(&:label).each do |dynamic_column| + key << "#{dynamic_column.label}\0" + key << "#{dynamic_column.stage}\0" + key << "#{dynamic_column.type}\0" + key << "#{dynamic_column.flags}\0" + key << "#{dynamic_column.value}\0" + key << "#{dynamic_column.window_sort_keys.join(',')}\0" + key << "#{dynamic_column.window_group_keys.join(',')}\0" + end + end + end + + def write_records(writer, context) + result_sets = context.result_sets + + n_hits = 0 + n_elements = 2 # for N hits and columns + result_sets.each do |result_set| + n_hits += result_set.size + n_elements += result_set.size + end + + output_columns = context.output_columns + + writer.array("RESULTSET", n_elements) do + writer.array("NHITS", 1) do + writer.write(n_hits) + end + first_result_set = result_sets.first + if first_result_set + writer.write_table_columns(first_result_set, output_columns) + end + + current_offset = context.offset + current_offset += n_hits if current_offset < 0 + current_limit = context.limit + current_limit += n_hits + 1 if current_limit < 0 + options = { + :offset => current_offset, + :limit => current_limit, + } + result_sets.each do |result_set| + if result_set.size > current_offset + writer.write_table_records(result_set, output_columns, options) + current_limit -= result_set.size + end + if current_offset > 0 + 
current_offset = [current_offset - result_set.size, 0].max + end + break if current_limit <= 0 + options[:offset] = current_offset + options[:limit] = current_limit + end + end + end + + def write_plain_drilldowns(writer, execute_context) + plain_drilldown = execute_context.plain_drilldown + + drilldowns = plain_drilldown.result_sets + output_columns = plain_drilldown.output_columns + options = { + :offset => plain_drilldown.offset, + :limit => plain_drilldown.limit, + } + + drilldowns.each do |drilldown| + n_elements = 2 # for N hits and columns + n_elements += drilldown.size + writer.array("RESULTSET", n_elements) do + writer.array("NHITS", 1) do + writer.write(drilldown.size) + end + writer.write_table_columns(drilldown, output_columns) + writer.write_table_records(drilldown, output_columns, + options) + end + end + end + + def write_labeled_drilldowns(writer, execute_context) + labeled_drilldowns = execute_context.labeled_drilldowns + is_command_version1 = (context.command_version == 1) + + writer.map("DRILLDOWNS", labeled_drilldowns.n_result_sets) do + labeled_drilldowns.each do |drilldown| + writer.write(drilldown.label) + + result_set = drilldown.result_set + n_elements = 2 # for N hits and columns + n_elements += result_set.size + output_columns = drilldown.output_columns + options = { + :offset => drilldown.offset, + :limit => drilldown.limit, + } + + writer.array("RESULTSET", n_elements) do + writer.array("NHITS", 1) do + writer.write(result_set.size) + end + writer.write_table_columns(result_set, output_columns) + if is_command_version1 and drilldown.need_command_version2? 
+                context.with_command_version(2) do
+                  writer.write_table_records(result_set,
+                                             drilldown.output_columns_v2,
+                                             options)
+                end
+              else
+                writer.write_table_records(result_set, output_columns, options)
+              end
+            end
+          end
+        end
+      end
+
+      # Groups parameters named "PREFIX[LABEL].NAME" by their label.
+      class LabeledArgumentParser
+        def initialize(parameters)
+          @parameters = parameters
+        end
+
+        # Returns a hash mapping each LABEL to a hash of NAME => value for
+        # every parameter whose key matches prefix_pattern followed by
+        # "[LABEL].NAME". Parameters that do not match are ignored.
+        def parse(prefix_pattern)
+          pattern = /\A#{prefix_pattern}\[(.+?)\]\.(.+)\z/
+          labeled_arguments = {}
+          @parameters.each do |key, value|
+            match_data = pattern.match(key)
+            next if match_data.nil?
+            labeled_argument = (labeled_arguments[match_data[1]] ||= {})
+            labeled_argument[match_data[2]] = value
+          end
+          labeled_arguments
+        end
+      end
+
+      # Mixin providing parse_keys for comma separated key lists.
+      module KeysParsable
+        private
+        # Returns [] for nil; otherwise the stripped string split on
+        # commas, ignoring spaces around each comma.
+        def parse_keys(raw_keys)
+          return [] if raw_keys.nil?
+
+          raw_keys.strip.split(/ *, */)
+        end
+      end
+
+      # Mixin for contexts that carry @calc_target_name and calc types.
+      module Calculatable
+        # Looks up the calc target column in table; nil when no calc
+        # target name was given.
+        def calc_target(table)
+          return nil if @calc_target_name.nil?
+          table.find_column(@calc_target_name)
+        end
+
+        private
+        # Converts a comma separated list of calc type names into
+        # TableGroupFlags bits. nil means CALC_COUNT.
+        def parse_calc_types(raw_types)
+          return TableGroupFlags::CALC_COUNT if raw_types.nil?
+
+          types = 0
+          raw_types.strip.split(/ *, */).each do |name|
+            case name
+            when "COUNT"
+              types |= TableGroupFlags::CALC_COUNT
+            when "MAX"
+              types |= TableGroupFlags::CALC_MAX
+            when "MIN"
+              types |= TableGroupFlags::CALC_MIN
+            when "SUM"
+              types |= TableGroupFlags::CALC_SUM
+            when "AVG"
+              types |= TableGroupFlags::CALC_AVG
+            when "NONE"
+              # Do nothing
+            else
+              raise InvalidArgument, "invalid drilldown calc type: <#{name}>"
+            end
+          end
+          types
+        end
+      end
+
+      # Holds the parsed request parameters and every object created
+      # while executing one logical_select request.
+      class ExecuteContext
+        include KeysParsable
+
+        attr_reader :enumerator
+        attr_reader :match_columns
+        attr_reader :query
+        attr_reader :filter
+        attr_reader :offset
+        attr_reader :limit
+        attr_reader :sort_keys
+        attr_reader :output_columns
+        attr_reader :dynamic_columns
+        attr_reader :result_sets
+        attr_reader :unsorted_result_sets
+        attr_reader :plain_drilldown
+        attr_reader :labeled_drilldowns
+        attr_reader :temporary_tables
+        attr_reader :expressions
+        # Defaults applied here: offset 0, limit 10 and output columns
+        # "_id, _key, *". sort_keys falls back to the deprecated sortby.
+        def initialize(input)
+          @input = input
+          @enumerator = LogicalEnumerator.new("logical_select", @input)
+          @match_columns = @input[:match_columns]
+          @query = @input[:query]
+          @filter = @input[:filter]
+          @offset = (@input[:offset] || 0).to_i
+          @limit = (@input[:limit] || 10).to_i
+          @sort_keys = parse_keys(@input[:sort_keys] || @input[:sortby])
+          @output_columns = @input[:output_columns] || "_id, _key, *"
+
+          @dynamic_columns = DynamicColumns.parse(@input)
+
+          @result_sets = []
+          @unsorted_result_sets = []
+
+          @plain_drilldown = PlainDrilldownExecuteContext.new(@input)
+          @labeled_drilldowns = LabeledDrilldowns.parse(@input)
+
+          @temporary_tables = []
+
+          @expressions = []
+        end
+
+        # Closes the collected objects; result sets are closed only when
+        # they report temporary?.
+        def close
+          @result_sets.each do |result_set|
+            result_set.close if result_set.temporary?
+          end
+          @unsorted_result_sets.each do |result_set|
+            result_set.close if result_set.temporary?
+          end
+
+          @plain_drilldown.close
+          @labeled_drilldowns.close
+
+          @dynamic_columns.close
+
+          @temporary_tables.each do |table|
+            table.close
+          end
+
+          @expressions.each do |expression|
+            expression.close
+          end
+        end
+      end
+
+      # The dynamic columns of a request, grouped by the stage at which
+      # they are applied.
+      class DynamicColumns
+        class << self
+          # Builds a DynamicColumns from the "column[LABEL].NAME" /
+          # "columns[LABEL].NAME" parameters in input. A column whose
+          # "stage" is not "initial", "filtered" or "output" is skipped.
+          def parse(input)
+            parser = LabeledArgumentParser.new(input)
+            columns = parser.parse(/columns?/)
+
+            initial_contexts = []
+            filtered_contexts = []
+            output_contexts = []
+            columns.each do |label, parameters|
+              contexts = nil
+              case parameters["stage"]
+              when "initial"
+                contexts = initial_contexts
+              when "filtered"
+                contexts = filtered_contexts
+              when "output"
+                contexts = output_contexts
+              else
+                next
+              end
+              contexts << DynamicColumnExecuteContext.new(label, parameters)
+            end
+
+            new(initial_contexts,
+                filtered_contexts,
+                output_contexts)
+          end
+        end
+
+        def initialize(initial_contexts,
+                       filtered_contexts,
+                       output_contexts)
+          @initial_contexts = initial_contexts
+          @filtered_contexts = filtered_contexts
+          @output_contexts = output_contexts
+        end
+
+        # Each each_* method forwards the block to Array#each of the
+        # contexts of that stage.
+        def each_initial(&block)
+          @initial_contexts.each(&block)
+        end
+
+        def each_filtered(&block)
+          @filtered_contexts.each(&block)
+        end
+
+        def each_output(&block)
+          @output_contexts.each(&block)
+        end
+
+        # Closes the contexts of all three stages.
+        def close
+          @initial_contexts.each do |context|
+            context.close
+          end
+          @filtered_contexts.each do |context|
+            context.close
+          end
+          @output_contexts.each do |context|
+            context.close
+          end
+        end
+      end
+
+      # One dynamic column: its label, stage, type, flags, value
+      # expression and window function keys.
+      class DynamicColumnExecuteContext
+        include KeysParsable
+
+        attr_reader :label
+        attr_reader :stage
+        attr_reader :type
+        attr_reader :flags
+        attr_reader :value
+        attr_reader :window_sort_keys
+        attr_reader :window_group_keys
+        # @label and @stage are assigned first because the error messages
+        # built while parsing type and flags include them.
+        def initialize(label, parameters)
+          @label = label
+          @stage = parameters["stage"]
+          @type = parse_type(parameters["type"])
+          @flags = parse_flags(parameters["flags"] || "COLUMN_SCALAR")
+          @value = parameters["value"]
+          @window_sort_keys = parse_keys(parameters["window.sort_keys"])
+          @window_group_keys = 
parse_keys(parameters["window.group_keys"])
+        end
+
+        # Nothing to release; present so callers can close every context
+        # uniformly.
+        def close
+        end
+
+        # Creates the column on table and fills it from the value
+        # expression. Without window keys the expression is applied
+        # directly (with condition attached when given); otherwise it is
+        # applied as a window function. An empty table only gets the
+        # column created.
+        def apply(table, condition=nil)
+          column = table.create_column(@label, @flags, @type)
+          return if table.empty?
+
+          expression = Expression.create(table)
+          begin
+            expression.parse(@value)
+            if @window_sort_keys.empty? and @window_group_keys.empty?
+              expression.condition = condition if condition
+              table.apply_expression(column, expression)
+            else
+              table.apply_window_function(column, expression,
+                                          :sort_keys => @window_sort_keys,
+                                          :group_keys => @window_group_keys)
+            end
+          ensure
+            expression.close
+          end
+        end
+
+        private
+        # Resolves type_raw through Context.instance. Returns nil for nil
+        # input; raises InvalidArgument when the name is unknown or the
+        # resolved object is neither a Type nor a Table.
+        def parse_type(type_raw)
+          return nil if type_raw.nil?
+
+          type = Context.instance[type_raw]
+          if type.nil?
+            message = "#{error_message_tag} unknown type: <#{type_raw}>"
+            raise InvalidArgument, message
+          end
+
+          case type
+          when Type, Table
+            type
+          else
+            message = "#{error_message_tag} invalid type: #{type.grn_inspect}"
+            raise InvalidArgument, message
+          end
+        end
+
+        def parse_flags(flags_raw)
+          Column.parse_flags(error_message_tag, flags_raw)
+        end
+
+        # Prefix used in error messages raised while parsing this column.
+        def error_message_tag
+          "[logical_select][columns][#{@stage}][#{@label}]"
+        end
+      end
+
+      # Parameters and results of the plain (unlabeled) drilldown.
+      class PlainDrilldownExecuteContext
+        include KeysParsable
+        include Calculatable
+
+        attr_reader :keys
+        attr_reader :offset
+        attr_reader :limit
+        attr_reader :sort_keys
+        attr_reader :output_columns
+        attr_reader :calc_target_name
+        attr_reader :calc_types
+        attr_reader :filter
+        attr_reader :result_sets
+        attr_reader :unsorted_result_sets
+        attr_reader :temporary_tables
+        attr_reader :expressions
+        # Defaults applied here: offset 0, limit 10 and output columns
+        # "_key, _nsubrecs". sort_keys falls back to drilldown_sortby.
+        def initialize(input)
+          @input = input
+          @keys = parse_keys(@input[:drilldown])
+          @offset = (@input[:drilldown_offset] || 0).to_i
+          @limit = (@input[:drilldown_limit] || 10).to_i
+          @sort_keys = parse_keys(@input[:drilldown_sort_keys] ||
+                                  @input[:drilldown_sortby])
+          @output_columns = @input[:drilldown_output_columns]
+          @output_columns ||= "_key, _nsubrecs"
+          @calc_target_name = @input[:drilldown_calc_target]
+          @calc_types = 
parse_calc_types(@input[:drilldown_calc_types])
+          @filter = @input[:drilldown_filter]
+
+          @result_sets = []
+          @unsorted_result_sets = []
+
+          @temporary_tables = []
+
+          @expressions = []
+        end
+
+        # Closes every collected result set, temporary table and
+        # expression.
+        def close
+          @result_sets.each do |result_set|
+            result_set.close
+          end
+          @unsorted_result_sets.each do |result_set|
+            result_set.close
+          end
+
+          @temporary_tables.each do |table|
+            table.close
+          end
+
+          @expressions.each do |expression|
+            expression.close
+          end
+        end
+
+        def have_keys?
+          @keys.size > 0
+        end
+
+        def n_result_sets
+          @result_sets.size
+        end
+      end
+
+      # The labeled drilldowns of a request, keyed by label. TSort is
+      # included so they can be visited in dependency order.
+      class LabeledDrilldowns
+        include Enumerable
+        include TSort
+
+        class << self
+          # Builds a LabeledDrilldowns from the "drilldown[LABEL].NAME" /
+          # "drilldowns[LABEL].NAME" parameters in input. Labels without
+          # a "keys" parameter are ignored.
+          def parse(input)
+            parser = LabeledArgumentParser.new(input)
+            drilldowns = parser.parse(/drilldowns?/)
+
+            contexts = {}
+            drilldowns.each do |label, parameters|
+              next if parameters["keys"].nil?
+              context = LabeledDrilldownExecuteContext.new(label, parameters)
+              contexts[label] = context
+            end
+
+            new(contexts)
+          end
+        end
+
+        # Records, per label, the drilldown named by its "table"
+        # parameter as its only dependency (none when "table" is absent).
+        # Raises when "table" names a label that is not in contexts.
+        def initialize(contexts)
+          @contexts = contexts
+          @dependencies = {}
+          @contexts.each do |label, context|
+            if context.table
+              depended_context = @contexts[context.table]
+              if depended_context.nil?
+                raise "Unknown drilldown: <#{context.table}>"
+              end
+              @dependencies[label] = [depended_context]
+            else
+              @dependencies[label] = []
+            end
+          end
+        end
+
+        def close
+          @contexts.each_value do |context|
+            context.close
+          end
+        end
+
+        # Returns the drilldown context registered under label.
+        def [](label)
+          @contexts[label]
+        end
+
+        def have_keys?
+          not @contexts.empty?
+        end
+
+        # One result set is reported per labeled drilldown context.
+        def n_result_sets
+          @contexts.size
+        end
+
+        def each(&block)
+          @contexts.each_value(&block)
+        end
+
+        # TSort hooks: the nodes are the contexts, the children of a
+        # context are the contexts recorded as its dependencies.
+        def tsort_each_node(&block)
+          @contexts.each_value(&block)
+        end
+
+        def tsort_each_child(context, &block)
+          @dependencies[context.label].each(&block)
+        end
+      end
+
+      # Parameters and results of one labeled drilldown.
+      class LabeledDrilldownExecuteContext
+        include KeysParsable
+        include Calculatable
+
+        attr_reader :label
+        attr_reader :keys
+        attr_reader :offset
+        attr_reader :limit
+        attr_reader :sort_keys
+        attr_reader :output_columns
+        attr_reader :calc_target_name
+        attr_reader :calc_types
+        attr_reader :filter
+        attr_reader :table
+        attr_reader :dynamic_columns
+        attr_accessor :result_set
+        attr_accessor :unsorted_result_set
+        attr_reader :temporary_tables
+        attr_reader :expressions
+        # Defaults applied here: offset 0, limit 10 and output columns
+        # "_key, _nsubrecs". sort_keys falls back to "sortby".
+        def initialize(label, parameters)
+          @label = label
+          @keys = parse_keys(parameters["keys"])
+          @offset = (parameters["offset"] || 0).to_i
+          @limit = (parameters["limit"] || 10).to_i
+          @sort_keys = parse_keys(parameters["sort_keys"] ||
+                                  parameters["sortby"])
+          @output_columns = parameters["output_columns"]
+          @output_columns ||= "_key, _nsubrecs"
+          @calc_target_name = parameters["calc_target"]
+          @calc_types = parse_calc_types(parameters["calc_types"])
+          @filter = parameters["filter"]
+          @table = parameters["table"]
+
+          @dynamic_columns = DynamicColumns.parse(parameters)
+
+          @result_set = nil
+          @unsorted_result_set = nil
+
+          @temporary_tables = []
+
+          @expressions = []
+        end
+
+        # Closes the result sets when they were set, then the dynamic
+        # columns, temporary tables and expressions.
+        def close
+          @result_set.close if @result_set
+          @unsorted_result_set.close if @unsorted_result_set
+
+          @dynamic_columns.close
+
+          @temporary_tables.each do |table|
+            table.close
+          end
+
+          @expressions.each do |expression|
+            expression.close
+          end
+        end
+
+        # True when the output columns contain a "." or a "[".
+        def need_command_version2?
+          /[.\[]/ === @output_columns
+        end
+
+        # Returns the output columns with each "_value.NAME" replaced by
+        # "_key[N]" when NAME is the N-th drilldown key; other columns are
+        # kept as they are. The result is joined with ",".
+        def output_columns_v2
+          columns = @output_columns.strip.split(/ *, */)
+          converted_columns = columns.collect do |column|
+            match_data = /\A_value\.(.+)\z/.match(column)
+            if match_data.nil?
+              column
+            else
+              nth_key = keys.index(match_data[1])
+              if nth_key
+                "_key[#{nth_key}]"
+              else
+                column
+              end
+            end
+          end
+          converted_columns.join(",")
+        end
+      end
+
+      # Runs the search over all shards and then the drilldowns.
+      class Executor
+        def initialize(context)
+          @context = context
+        end
+
+        # Searches first; afterwards runs the plain drilldown when it has
+        # keys, otherwise the labeled drilldowns when there are any.
+        def execute
+          execute_search
+          if @context.plain_drilldown.have_keys?
+            execute_plain_drilldown
+          elsif @context.labeled_drilldowns.have_keys?
+            execute_labeled_drilldowns
+          end
+        end
+
+        private
+        # Executes every shard yielded by the enumerator. Raises
+        # InvalidArgument when the enumerator yields no shard at all.
+        def execute_search
+          first_shard = nil
+          enumerator = @context.enumerator
+          enumerator.each do |shard, shard_range|
+            first_shard ||= shard
+            shard_executor = ShardExecutor.new(@context, shard, shard_range)
+            shard_executor.execute
+          end
+          if first_shard.nil?
+            message =
+              "[logical_select] no shard exists: " +
+              "logical_table: <#{enumerator.logical_table}>: " +
+              "shard_key: <#{enumerator.shard_key_name}>"
+            raise InvalidArgument, message
+          end
+          # When no shard produced a result set, an empty table keyed by
+          # the first shard's table is registered instead, with the
+          # initial and filtered dynamic columns created on it.
+          if @context.result_sets.empty?
+            result_set = HashTable.create(:flags => ObjectFlags::WITH_SUBREC,
+                                          :key_type => first_shard.table)
+            @context.dynamic_columns.each_initial do |dynamic_column|
+              dynamic_column.apply(result_set)
+            end
+            @context.dynamic_columns.each_filtered do |dynamic_column|
+              dynamic_column.apply(result_set)
+            end
+            @context.result_sets << result_set
+          end
+        end
+
+        # Groups all result sets by each drilldown key in turn; the same
+        # group_result is reused and its table detached after every key.
+        def execute_plain_drilldown
+          drilldown = @context.plain_drilldown
+          group_result = TableGroupResult.new
+          begin
+            group_result.key_begin = 0
+            group_result.key_end = 0
+            group_result.limit = 1
+            group_result.flags = drilldown.calc_types
+            drilldown.keys.each do |key|
+              @context.result_sets.each do |result_set|
+                with_calc_target(group_result,
+                                 drilldown.calc_target(result_set)) do
+                  result_set.group([key], group_result)
+                end
+              end
+              result_set = group_result.table
+              result_set = apply_drilldown_filter(drilldown, result_set)
+              if drilldown.sort_keys.empty?
+                drilldown.result_sets << result_set
+              else
+                drilldown.result_sets << result_set.sort(drilldown.sort_keys)
+                drilldown.unsorted_result_sets << result_set
+              end
+              group_result.table = nil
+            end
+          ensure
+            group_result.close
+          end
+        end
+
+        # Executes the labeled drilldowns in tsort_each order. A
+        # drilldown with "table" groups the result set of the drilldown it
+        # names; the others group every search result set.
+        def execute_labeled_drilldowns
+          drilldowns = @context.labeled_drilldowns
+
+          drilldowns.tsort_each do |drilldown|
+            group_result = TableGroupResult.new
+            keys = drilldown.keys
+            begin
+              group_result.key_begin = 0
+              group_result.key_end = keys.size - 1
+              if keys.size > 1
+                group_result.max_n_sub_records = 1
+              end
+              group_result.limit = 1
+              group_result.flags = drilldown.calc_types
+              if drilldown.table
+                target_table = drilldowns[drilldown.table].result_set
+                with_calc_target(group_result,
+                                 drilldown.calc_target(target_table)) do
+                  target_table.group(keys, group_result)
+                end
+              else
+                @context.result_sets.each do |result_set|
+                  with_calc_target(group_result,
+                                   drilldown.calc_target(result_set)) do
+                    result_set.group(keys, group_result)
+                  end
+                end
+              end
+              result_set = group_result.table
+              drilldown.dynamic_columns.each_initial do |dynamic_column|
+                dynamic_column.apply(result_set)
+              end
+              result_set = apply_drilldown_filter(drilldown, result_set)
+              if drilldown.sort_keys.empty?
+                drilldown.result_set = result_set
+              else
+                drilldown.result_set = result_set.sort(drilldown.sort_keys)
+                drilldown.unsorted_result_set = result_set
+              end
+              # Detach the table before group_result is closed below; the
+              # drilldown context now holds it.
+              group_result.table = nil
+            ensure
+              group_result.close
+            end
+          end
+        end
+
+        # Sets calc_target on group_result for the duration of the block,
+        # then closes the target (when not nil) and resets the attribute.
+        def with_calc_target(group_result, calc_target)
+          group_result.calc_target = calc_target
+          begin
+            yield
+          ensure
+            calc_target.close if calc_target
+            group_result.calc_target = nil
+          end
+        end
+
+        # Applies the drilldown's filter to result_set; returns result_set
+        # unchanged when there is no filter.
+        def apply_drilldown_filter(drilldown, result_set)
+          filter = drilldown.filter
+          return result_set if filter.nil?
+
+          # The expression and the unfiltered table are handed to the
+          # drilldown context, which closes them in its close method.
+          expression = Expression.create(result_set)
+          drilldown.expressions << expression
+          expression.parse(filter)
+          filtered_result_set = result_set.select(expression)
+          drilldown.temporary_tables << result_set
+          filtered_result_set
+        end
+      end
+
+      # Executes the search on a single shard.
+      class ShardExecutor
+        def initialize(context, shard, shard_range)
+          @context = context
+          @shard = shard
+          @shard_range = shard_range
+
+          @target_table = @shard.table
+
+          @match_columns = @context.match_columns
+          @query = @context.query
+          @filter = @context.filter
+          @sort_keys = @context.sort_keys
+          @result_sets = @context.result_sets
+          @unsorted_result_sets = @context.unsorted_result_sets
+
+          @target_range = @context.enumerator.target_range
+
+          @cover_type = @target_range.cover_type(@shard_range)
+        end
+
+        # Does nothing when the cover type is :none or the shard table is
+        # empty. Raises InvalidArgument when the shard has no shard key.
+        def execute
+          return if @cover_type == :none
+          return if @target_table.empty?
+
+          shard_key = @shard.key
+          if shard_key.nil?
+            message = "[logical_select] shard_key doesn't exist: " +
+                      "<#{@shard.key_name}>"
+            raise InvalidArgument, message
+          end
+
+          # Initial-stage dynamic columns are created on an all-match
+          # copy, made once, rather than on the shard table itself.
+          @context.dynamic_columns.each_initial do |dynamic_column|
+            if @target_table == @shard.table
+              @target_table = create_all_match_table(@target_table)
+              @context.temporary_tables << @target_table
+            end
+            dynamic_column.apply(@target_table)
+          end
+
+          create_expression_builder(shard_key) do |expression_builder|
+            case @cover_type
+            when :all
+              filter_shard_all(expression_builder)
+            when :partial_min
+              filter_table do |expression|
+                expression_builder.build_partial_min(expression)
+              end
+            when :partial_max
+              filter_table do |expression|
+                expression_builder.build_partial_max(expression)
+              end
+            when :partial_min_and_max
+              filter_table do |expression|
+                expression_builder.build_partial_min_and_max(expression)
+              end
+            end
+          end
+        end
+
+        private
+        # A fully covered shard needs no select when neither query nor
+        # filter is given.
+        def filter_shard_all(expression_builder)
+          if @query.nil? and @filter.nil?
+            add_result_set(@target_table, nil)
+            # The table is now registered as a result set, so it is taken
+            # out of the temporary tables list.
+            @context.temporary_tables.delete(@target_table)
+          else
+            filter_table do |expression|
+              expression_builder.build_all(expression)
+            end
+          end
+        end
+
+        # Creates an expression for table and registers it on the
+        # context, which closes it in ExecuteContext#close.
+        def create_expression(table)
+          expression = Expression.create(table)
+          @context.expressions << expression
+          expression
+        end
+
+        # Yields a RangeExpressionBuilder configured with the request's
+        # match_columns, query and filter. Afterwards the builder's
+        # match_columns expression, if any, is registered on the context.
+        def create_expression_builder(shard_key)
+          expression_builder = RangeExpressionBuilder.new(shard_key,
+                                                          @target_range)
+          expression_builder.match_columns = @match_columns
+          expression_builder.query = @query
+          expression_builder.filter = @filter
+          begin
+            yield(expression_builder)
+          ensure
+            expression = expression_builder.match_columns_expression
+            @context.expressions << expression if expression
+          end
+        end
+
+        # Yields an expression for the target table so the caller can
+        # build the condition, then selects with it and adds the result.
+        def filter_table
+          table = @target_table
+          expression = create_expression(table)
+          yield(expression)
+          add_result_set(table.select(expression), expression)
+        end
+
+        # Registers result_set on the context after applying the
+        # filtered-stage dynamic columns. An empty result set is closed
+        # and dropped.
+        def add_result_set(result_set, condition)
+          if result_set.empty?
+            result_set.close
+            return
+          end
+
+          @context.dynamic_columns.each_filtered do |dynamic_column|
+            # Columns are created on an all-match copy when result_set is
+            # the shard table itself.
+            if result_set == @shard.table
+              @context.temporary_tables << result_set
+              result_set = create_all_match_table(result_set)
+            end
+            dynamic_column.apply(result_set, condition)
+          end
+
+          if @sort_keys.empty?
+            @result_sets << result_set
+          else
+            @unsorted_result_sets << result_set
+            sorted_result_set = result_set.sort(@sort_keys)
+            @result_sets << sorted_result_set
+          end
+        end
+
+        # Selects table with a constant-true expression and returns the
+        # selected table; the expression is closed afterwards.
+        def create_all_match_table(table)
+          expression = Expression.create(table)
+          begin
+            expression.append_constant(true, Operator::PUSH, 1)
+            table.select(expression)
+          ensure
+            expression.close
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/storage/mroonga/vendor/groonga/plugins/sharding/logical_shard_list.rb b/storage/mroonga/vendor/groonga/plugins/sharding/logical_shard_list.rb
new file mode 100644
index 00000000..b8ef3f76
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/plugins/sharding/logical_shard_list.rb
@@ -0,0 +1,28 @@
+module Groonga
+  module Sharding
+    # "logical_shard_list" command: lists the shards of a logical table.
+    class LogicalShardListCommand < Command
+      register("logical_shard_list",
+               [
+                 "logical_table",
+               ])
+
+      # Writes a "shards" array with one {"name" => table name} map per
+      # shard yielded by the enumerator. The enumerator is created with
+      # :require_shard_key => false.
+      def run_body(input)
+        enumerator = LogicalEnumerator.new("logical_shard_list",
+                                           input,
+                                           :require_shard_key => false)
+        shard_names = enumerator.collect do |current_shard, shard_range|
+          current_shard.table_name
+        end
+
+        writer.array("shards", shard_names.size) do
+          shard_names.each do |shard_name|
+            writer.map("shard", 1) do
+              writer.write("name")
+              writer.write(shard_name)
+            end
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/storage/mroonga/vendor/groonga/plugins/sharding/logical_table_remove.rb b/storage/mroonga/vendor/groonga/plugins/sharding/logical_table_remove.rb
new file mode 100644
index 00000000..3353d6c3
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/plugins/sharding/logical_table_remove.rb
@@ -0,0 +1,345 @@
+module Groonga
+  module Sharding
+    class LogicalTableRemoveCommand < Command
+      register("logical_table_remove",
+               [
+                 "logical_table",
+                 "shard_key",
+                 "min",
+                 "min_border",
+                 "max",
+                 "max_border",
+                 "dependent",
+                 "force",
+               ])
+
+      # "dependent" and "force" are enabled only by the exact value "yes".
+      def run_body(input)
+        @dependent = (input[:dependent] == "yes")
+        @force = (input[:force] == "yes")
+
+        enumerator = LogicalEnumerator.new("logical_table_remove", input)
+
+        # success is never reassigned: failures surface as exceptions
+        # raised from remove_shard instead.
+        success = true
+        enumerator.each do |shard, shard_range|
+          remove_shard(shard, shard_range, enumerator.target_range)
+        end
+        writer.write(success)
+      end
+
+      private
+      # Removes the part of shard that is covered by target_range: the
+      # whole table for :all, otherwise only the matching records (and
+      # the table as well when that leaves it empty). With force, a shard
+      # whose table or shard key can't be opened is removed as a whole.
+      def remove_shard(shard, shard_range, target_range)
+        cover_type = target_range.cover_type(shard_range)
+        return if cover_type == :none
+
+        shard_key = shard.key
+        if shard_key.nil?
+          if @force
+            context.clear_error
+          else
+            message =
+              "[logical_table_remove] shard_key doesn't exist: " +
+              "<#{shard.key_name}>"
+            raise InvalidArgument, message
+          end
+        end
+        table = shard.table
+
+        if cover_type == :all or ((table.nil? or shard_key.nil?) and @force)
+          remove_table(shard, table)
+          return
+        end
+
+        expression_builder = RangeExpressionBuilder.new(shard_key,
+                                                        target_range)
+        case cover_type
+        when :partial_min
+          remove_records(table) do |expression|
+            expression_builder.build_partial_min(expression)
+          end
+          remove_table(shard, table) if table.empty?
+        when :partial_max
+          remove_records(table) do |expression|
+            expression_builder.build_partial_max(expression)
+          end
+          remove_table(shard, table) if table.empty?
+        when :partial_min_and_max
+          remove_records(table) do |expression|
+            expression_builder.build_partial_min_and_max(expression)
+          end
+          remove_table(shard, table) if table.empty?
+        end
+      end
+
+      # Appends to referenced_table_ids the table ID behind each index:
+      # index.domain_id when the index can be opened, otherwise the ID
+      # looked up from the part of the index name before the first ".".
+      def collect_referenced_table_ids_from_index_ids(index_ids,
+                                                      referenced_table_ids)
+        database = context.database
+        index_ids.each do |index_id|
+          index = context[index_id]
+          if index.nil?
# Fallback used when the column named +column_name+ cannot be opened:
# scans every non-builtin object in the database and, for each index
# column whose sources include the column's ID, appends the referenced
# table IDs of that index to +referenced_table_ids+.
def collect_referenced_table_ids_from_column_name(column_name,
                                                  referenced_table_ids)
  database = context.database
  target_column_id = database[column_name]
  database.each_raw do |raw_id, _cursor|
    next if ID.builtin?(raw_id) or raw_id == target_column_id

    context.open_temporary(raw_id) do |object|
      if object.nil?
        # The object itself could not be opened; drop the error and move on.
        context.clear_error
        next
      end
      next unless IndexColumn === object
      next unless object.source_ids.include?(target_column_id)

      collect_referenced_table_ids_from_index_ids([raw_id],
                                                  referenced_table_ids)
    end
  end
end
# Removes the table of +shard+ and then the tables it referenced.
#
# +table+ is nil when the shard's table could not be opened. Without
# @force that raises (InvalidArgument when the context holds no error,
# otherwise the error class mapped from the context's return code).
# With @force the error is cleared and the table is removed by name.
def remove_table(shard, table)
  table_is_missing = table.nil?
  if table_is_missing
    unless @force
      error_class =
        if context.rc == Context::RC::SUCCESS.to_i
          InvalidArgument
        else
          Context::RC.find(context.rc).error_class
        end
      message = "[logical_table_remove] table is broken: " +
                "<#{shard.table_name}>: #{context.error_message}"
      raise error_class, message
    end
    context.clear_error
  end

  # Collected before the removal because the lookup needs the shard's
  # table and columns.
  referenced_table_ids = collect_referenced_table_ids(shard, table)

  if table_is_missing
    remove_table_force(shard.table_name)
  elsif @force
    begin
      table.remove(:dependent => @dependent)
    rescue
      # Normal removal failed: clear the error, release the handle and
      # fall back to the forced, name-based removal.
      context.clear_error
      table.close
      remove_table_force(shard.table_name)
    end
  else
    table.remove(:dependent => @dependent)
  end

  remove_referenced_tables(shard, referenced_table_ids)
end
# Forcibly removes the column named +column_name+.
#
# When the column can still be opened, its index columns are removed first
# (normally if they open, forcibly by name if they do not) and the column
# is closed. Object.remove_force is then called for +column_name+ in every
# case.
def remove_column_force(column_name)
  database = context.database
  column = context[database[column_name]]

  if column.nil?
    context.clear_error
  else
    column.index_ids.each do |index_id|
      index_column = context[index_id]
      unless index_column.nil?
        remove_column(index_column)
        next
      end

      # The index column is broken too: recurse using its name.
      context.clear_error
      remove_column_force(database[index_id])
    end
    column.close
  end

  Object.remove_force(column_name)
end
module Groonga
  module Sharding
    # Appends conditions to an expression: an optional shard-key range
    # condition (min only, max only, or both via "between"), followed by the
    # optional query and filter conditions set through the writers.
    class RangeExpressionBuilder
      # Expression parsed from match_columns. Stays nil until a condition is
      # built with both a query and match_columns set.
      attr_reader :match_columns_expression

      attr_writer :match_columns, :query, :filter

      # +key+ is the shard key object pushed into range conditions and
      # +target_range+ supplies min/max and their borders.
      def initialize(key, target_range)
        @key = key
        @target_range = target_range
        @match_columns_expression = nil
        @match_columns = @query = @filter = nil
      end

      # No range condition: only the query/filter conditions are appended.
      def build_all(expression)
        build_condition(expression)
      end

      # key >= min (border :include) or key > min (any other border).
      def build_partial_min(expression)
        append_key_value(expression)
        expression.append_constant(@target_range.min, Operator::PUSH, 1)
        operator =
          if @target_range.min_border == :include
            Operator::GREATER_EQUAL
          else
            Operator::GREATER
          end
        expression.append_operator(operator, 2)
        build_condition(expression)
      end

      # key <= max (border :include) or key < max (any other border).
      def build_partial_max(expression)
        append_key_value(expression)
        expression.append_constant(@target_range.max, Operator::PUSH, 1)
        operator =
          if @target_range.max_border == :include
            Operator::LESS_EQUAL
          else
            Operator::LESS
          end
        expression.append_operator(operator, 2)
        build_condition(expression)
      end

      # between(key, min, min_border, max, max_border)
      def build_partial_min_and_max(expression)
        expression.append_object(Groonga::Context.instance["between"],
                                 Operator::PUSH, 1)
        append_key_value(expression)
        expression.append_constant(@target_range.min, Operator::PUSH, 1)
        expression.append_constant(@target_range.min_border,
                                   Operator::PUSH, 1)
        expression.append_constant(@target_range.max, Operator::PUSH, 1)
        expression.append_constant(@target_range.max_border,
                                   Operator::PUSH, 1)
        # Five arguments: the key value plus the four range constants.
        expression.append_operator(Operator::CALL, 5)
        build_condition(expression)
      end

      private
      # Pushes the shard key and resolves it to its value.
      def append_key_value(expression)
        expression.append_object(@key, Operator::PUSH, 1)
        expression.append_operator(Operator::GET_VALUE, 1)
      end

      # Appends the query condition and then the filter condition, each only
      # when it has been set.
      def build_condition(expression)
        append_query_condition(expression) if @query
        append_filter_condition(expression) if @filter
      end

      # Parses @query into the expression and ANDs it with whatever the
      # expression already contained.
      def append_query_condition(expression)
        was_empty = expression.empty?
        if @match_columns
          table = Context.instance[expression[0].domain]
          @match_columns_expression = Expression.create(table)
          @match_columns_expression.parse(@match_columns)
        end
        flags = Expression::SYNTAX_QUERY |
                Expression::ALLOW_PRAGMA |
                Expression::ALLOW_COLUMN
        expression.parse(@query,
                         default_column: @match_columns_expression,
                         flags: flags)
        expression.append_operator(Operator::AND, 2) unless was_empty
      end

      # Parses @filter into the expression and ANDs it with whatever the
      # expression already contained.
      def append_filter_condition(expression)
        was_empty = expression.empty?
        expression.parse(@filter)
        expression.append_operator(Operator::AND, 2) unless was_empty
      end
    end
  end
end
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/../../lib + ${MRUBY_INCLUDE_DIRS} + ${MESSAGE_PACK_INCLUDE_DIRS}) + +read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/sources.am SUGGEST_SOURCES) +set_source_files_properties(${SUGGEST_SOURCES} + PROPERTIES + COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}") +if(GRN_EMBED) + add_library(suggest STATIC ${SUGGEST_SOURCES}) + set_target_properties( + suggest + PROPERTIES + POSITION_INDEPENDENT_CODE ON) +else() + add_library(suggest MODULE ${SUGGEST_SOURCES}) + set_target_properties(suggest PROPERTIES PREFIX "") + install(TARGETS suggest DESTINATION "${GRN_RELATIVE_PLUGINS_DIR}/suggest") +endif() +target_link_libraries(suggest libgroonga) diff --git a/storage/mroonga/vendor/groonga/plugins/suggest/Makefile.am b/storage/mroonga/vendor/groonga/plugins/suggest/Makefile.am new file mode 100644 index 00000000..7f321b6c --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/suggest/Makefile.am @@ -0,0 +1,24 @@ +EXTRA_DIST = \ + CMakeLists.txt + +AM_CFLAGS = \ + $(MESSAGE_PACK_CFLAGS) \ + $(MRUBY_CFLAGS) + +AM_CPPFLAGS = \ + -I$(top_builddir) \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/lib + +AM_LDFLAGS = \ + -avoid-version \ + -module \ + -no-undefined + +LIBS = \ + $(top_builddir)/lib/libgroonga.la \ + $(MESSAGE_PACK_LIBS) + +suggest_plugins_LTLIBRARIES = suggest.la + +include sources.am diff --git a/storage/mroonga/vendor/groonga/plugins/suggest/sources.am b/storage/mroonga/vendor/groonga/plugins/suggest/sources.am new file mode 100644 index 00000000..798a431a --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/suggest/sources.am @@ -0,0 +1,2 @@ +suggest_la_SOURCES = \ + suggest.c diff --git a/storage/mroonga/vendor/groonga/plugins/suggest/suggest.c 
b/storage/mroonga/vendor/groonga/plugins/suggest/suggest.c new file mode 100644 index 00000000..7f64f3c1 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/suggest/suggest.c @@ -0,0 +1,1035 @@ +/* -*- c-basic-offset: 2; indent-tabs-mode: nil -*- */ +/* Copyright(C) 2010-2014 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA +*/ + +#ifdef GRN_EMBEDDED +# define GRN_PLUGIN_FUNCTION_TAG suggest_suggest +#endif + +#include <string.h> + +#include "grn_ctx.h" +#include "grn_db.h" +#include "grn_ii.h" +#include "grn_token_cursor.h" +#include "grn_output.h" +#include <groonga/plugin.h> + +#define VAR GRN_PROC_GET_VAR_BY_OFFSET +#define CONST_STR_LEN(x) x, x ? 
sizeof(x) - 1 : 0 +#define TEXT_VALUE_LEN(x) GRN_TEXT_VALUE(x), GRN_TEXT_LEN(x) + +#define MIN_LEARN_DISTANCE (60 * GRN_TIME_USEC_PER_SEC) + +#define COMPLETE 1 +#define CORRECT 2 +#define SUGGEST 4 + +typedef enum { + GRN_SUGGEST_SEARCH_YES, + GRN_SUGGEST_SEARCH_NO, + GRN_SUGGEST_SEARCH_AUTO +} grn_suggest_search_mode; + +typedef struct { + grn_obj *post_event; + grn_obj *post_type; + grn_obj *post_item; + grn_obj *seq; + grn_obj *post_time; + grn_obj *pairs; + + int learn_distance_in_seconds; + + grn_id post_event_id; + grn_id post_type_id; + grn_id post_item_id; + grn_id seq_id; + int64_t post_time_value; + + grn_obj *seqs; + grn_obj *seqs_events; + grn_obj *events; + grn_obj *events_item; + grn_obj *events_type; + grn_obj *events_time; + grn_obj *event_types; + grn_obj *items; + grn_obj *items_freq; + grn_obj *items_freq2; + grn_obj *items_last; + grn_obj *pairs_pre; + grn_obj *pairs_post; + grn_obj *pairs_freq0; + grn_obj *pairs_freq1; + grn_obj *pairs_freq2; + + grn_obj dataset_name; + + grn_obj *configuration; + + grn_obj weight; + grn_obj pre_events; + + uint64_t key_prefix; + grn_obj pre_item; +} grn_suggest_learner; + +static int +grn_parse_suggest_types(grn_obj *text) +{ + const char *nptr = GRN_TEXT_VALUE(text); + const char *end = GRN_BULK_CURR(text); + int types = 0; + while (nptr < end) { + if (*nptr == '|') { + nptr += 1; + continue; + } + { + const char string[] = "complete"; + size_t length = sizeof(string) - 1; + if (nptr + length <= end && memcmp(nptr, string, length) == 0) { + types |= COMPLETE; + nptr += length; + continue; + } + } + { + const char string[] = "correct"; + size_t length = sizeof(string) - 1; + if (nptr + length <= end && memcmp(nptr, string, length) == 0) { + types |= CORRECT; + nptr += length; + continue; + } + } + { + const char string[] = "suggest"; + size_t length = sizeof(string) - 1; + if (nptr + length <= end && memcmp(nptr, string, length) == 0) { + types |= SUGGEST; + nptr += length; + continue; + } + } + break; + } + 
return types; +} + +static double +cooccurrence_search(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_id id, + grn_obj *res, int query_type, int frequency_threshold, + double conditional_probability_threshold) +{ + double max_score = 0.0; + if (id) { + grn_ii_cursor *c; + grn_obj *co = grn_obj_column(ctx, items, CONST_STR_LEN("co")); + grn_obj *pairs = grn_ctx_at(ctx, grn_obj_get_range(ctx, co)); + grn_obj *items_freq = grn_obj_column(ctx, items, CONST_STR_LEN("freq")); + grn_obj *items_freq2 = grn_obj_column(ctx, items, CONST_STR_LEN("freq2")); + grn_obj *pairs_freq, *pairs_post = grn_obj_column(ctx, pairs, CONST_STR_LEN("post")); + switch (query_type) { + case COMPLETE : + pairs_freq = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq0")); + break; + case CORRECT : + pairs_freq = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq1")); + break; + case SUGGEST : + pairs_freq = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq2")); + break; + default : + return max_score; + } + if ((c = grn_ii_cursor_open(ctx, (grn_ii *)co, id, GRN_ID_NIL, GRN_ID_MAX, + ((grn_ii *)co)->n_elements - 1, 0))) { + grn_posting *p; + grn_obj post, pair_freq, item_freq, item_freq2, item_boost; + GRN_RECORD_INIT(&post, 0, grn_obj_id(ctx, items)); + GRN_INT32_INIT(&pair_freq, 0); + GRN_INT32_INIT(&item_freq, 0); + GRN_INT32_INIT(&item_freq2, 0); + GRN_INT32_INIT(&item_boost, 0); + while ((p = grn_ii_cursor_next(ctx, c))) { + grn_id post_id; + int pfreq, ifreq, ifreq2, boost; + double conditional_probability; + GRN_BULK_REWIND(&post); + GRN_BULK_REWIND(&pair_freq); + GRN_BULK_REWIND(&item_freq); + GRN_BULK_REWIND(&item_freq2); + GRN_BULK_REWIND(&item_boost); + grn_obj_get_value(ctx, pairs_post, p->rid, &post); + grn_obj_get_value(ctx, pairs_freq, p->rid, &pair_freq); + post_id = GRN_RECORD_VALUE(&post); + grn_obj_get_value(ctx, items_freq, post_id, &item_freq); + grn_obj_get_value(ctx, items_freq2, post_id, &item_freq2); + grn_obj_get_value(ctx, items_boost, post_id, &item_boost); + pfreq = 
GRN_INT32_VALUE(&pair_freq); + ifreq = GRN_INT32_VALUE(&item_freq); + ifreq2 = GRN_INT32_VALUE(&item_freq2); + if (ifreq2 > 0) { + conditional_probability = (double)pfreq / (double)ifreq2; + } else { + conditional_probability = 0.0; + } + boost = GRN_INT32_VALUE(&item_boost); + if (pfreq >= frequency_threshold && ifreq >= frequency_threshold && + conditional_probability >= conditional_probability_threshold && + boost >= 0) { + grn_rset_recinfo *ri; + void *value; + double score = pfreq; + int added; + if (max_score < score + boost) { max_score = score + boost; } + /* put any formula if desired */ + if (grn_hash_add(ctx, (grn_hash *)res, + &post_id, sizeof(grn_id), &value, &added)) { + ri = value; + ri->score += score; + if (added) { + ri->score += boost; + } + } + } + } + GRN_OBJ_FIN(ctx, &post); + GRN_OBJ_FIN(ctx, &pair_freq); + GRN_OBJ_FIN(ctx, &item_freq); + GRN_OBJ_FIN(ctx, &item_freq2); + GRN_OBJ_FIN(ctx, &item_boost); + grn_ii_cursor_close(ctx, c); + } + } + return max_score; +} + +#define DEFAULT_LIMIT 10 +#define DEFAULT_SORTBY "-_score" +#define DEFAULT_OUTPUT_COLUMNS "_key,_score" +#define DEFAULT_FREQUENCY_THRESHOLD 100 +#define DEFAULT_CONDITIONAL_PROBABILITY_THRESHOLD 0.2 + +static void +output(grn_ctx *ctx, grn_obj *table, grn_obj *res, grn_id tid, + grn_obj *sortby, grn_obj *output_columns, int offset, int limit) +{ + grn_obj *sorted; + if ((sorted = grn_table_create(ctx, NULL, 0, NULL, GRN_OBJ_TABLE_NO_KEY, NULL, res))) { + uint32_t nkeys; + grn_obj_format format; + grn_table_sort_key *keys; + const char *sortby_val = GRN_TEXT_VALUE(sortby); + unsigned int sortby_len = GRN_TEXT_LEN(sortby); + const char *oc_val = GRN_TEXT_VALUE(output_columns); + unsigned int oc_len = GRN_TEXT_LEN(output_columns); + if (!sortby_val || !sortby_len) { + sortby_val = DEFAULT_SORTBY; + sortby_len = sizeof(DEFAULT_SORTBY) - 1; + } + if (!oc_val || !oc_len) { + oc_val = DEFAULT_OUTPUT_COLUMNS; + oc_len = sizeof(DEFAULT_OUTPUT_COLUMNS) - 1; + } + if ((keys = 
grn_table_sort_key_from_str(ctx, sortby_val, sortby_len, res, &nkeys))) { + grn_table_sort(ctx, res, offset, limit, sorted, keys, nkeys); + GRN_QUERY_LOG(ctx, GRN_QUERY_LOG_SIZE, + ":", "sort(%d)", limit); + GRN_OBJ_FORMAT_INIT(&format, grn_table_size(ctx, res), 0, limit, offset); + format.flags = + GRN_OBJ_FORMAT_WITH_COLUMN_NAMES| + GRN_OBJ_FORMAT_XML_ELEMENT_RESULTSET; + grn_obj_columns(ctx, sorted, oc_val, oc_len, &format.columns); + GRN_OUTPUT_OBJ(sorted, &format); + GRN_OBJ_FORMAT_FIN(ctx, &format); + grn_table_sort_key_close(ctx, keys, nkeys); + } + grn_obj_unlink(ctx, sorted); + } else { + ERR(GRN_UNKNOWN_ERROR, "cannot create temporary sort table."); + } +} + +static inline void +complete_add_item(grn_ctx *ctx, grn_id id, grn_obj *res, int frequency_threshold, + grn_obj *items_freq, grn_obj *items_boost, + grn_obj *item_freq, grn_obj *item_boost) +{ + GRN_BULK_REWIND(item_freq); + GRN_BULK_REWIND(item_boost); + grn_obj_get_value(ctx, items_freq, id, item_freq); + grn_obj_get_value(ctx, items_boost, id, item_boost); + if (GRN_INT32_VALUE(item_boost) >= 0) { + double score; + score = 1 + + GRN_INT32_VALUE(item_freq) + + GRN_INT32_VALUE(item_boost); + if (score >= frequency_threshold) { + void *value; + if (grn_hash_add(ctx, (grn_hash *)res, &id, sizeof(grn_id), + &value, NULL)) { + grn_rset_recinfo *ri; + ri = value; + ri->score += score; + } + } + } +} + +static void +complete(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_obj *col, + grn_obj *query, grn_obj *sortby, + grn_obj *output_columns, int offset, int limit, + int frequency_threshold, double conditional_probability_threshold, + grn_suggest_search_mode prefix_search_mode) +{ + grn_obj *res; + grn_obj *items_freq = grn_obj_column(ctx, items, CONST_STR_LEN("freq")); + grn_obj item_freq, item_boost; + GRN_INT32_INIT(&item_freq, 0); + GRN_INT32_INIT(&item_boost, 0); + if ((res = grn_table_create(ctx, NULL, 0, NULL, + GRN_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, items, NULL))) { + grn_id tid = 
/*
 * Handles the "correct" suggestion type.
 *
 * Steps, in order:
 *   1. Create a temporary hash table `res` keyed by `items` records.
 *   2. Fill it with cooccurrence_search(..., CORRECT, ...) and remember the
 *      maximum score that search reports.
 *   3. When `query` is non-empty and similar search is requested (mode YES,
 *      or mode AUTO with max_score below frequency_threshold), OR in the
 *      results of a GRN_OP_SIMILAR search over the index of the items' key,
 *      re-score and filter them, then subtract edit_distance(_key, query)
 *      from every score and drop records that fall below the threshold.
 *   4. Sort and write `res` through output() and close it.
 *
 * Errors are reported through ERR(); nothing is returned.
 */
static void
correct(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost,
        grn_obj *query, grn_obj *sortby,
        grn_obj *output_columns, int offset, int limit,
        int frequency_threshold, double conditional_probability_threshold,
        grn_suggest_search_mode similar_search_mode)
{
  grn_obj *res;
  grn_obj *items_freq2 = grn_obj_column(ctx, items, CONST_STR_LEN("freq2"));
  /* Reusable value buffers for the freq2 and boost columns. */
  grn_obj item_freq2, item_boost;
  GRN_INT32_INIT(&item_freq2, 0);
  GRN_INT32_INIT(&item_boost, 0);
  if ((res = grn_table_create(ctx, NULL, 0, NULL,
                              GRN_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, items, NULL))) {
    /* ID of the item whose key equals the query text. */
    grn_id tid = grn_table_get(ctx, items, TEXT_VALUE_LEN(query));
    double max_score;
    max_score = cooccurrence_search(ctx, items, items_boost, tid, res, CORRECT,
                                    frequency_threshold,
                                    conditional_probability_threshold);
    GRN_QUERY_LOG(ctx, GRN_QUERY_LOG_SCORE,
                  ":", "cooccur(%f)", max_score);
    if (GRN_TEXT_LEN(query) &&
        ((similar_search_mode == GRN_SUGGEST_SEARCH_YES) ||
         (similar_search_mode == GRN_SUGGEST_SEARCH_AUTO &&
          max_score < frequency_threshold))) {
      grn_obj *key, *index;
      if ((key = grn_obj_column(ctx, items,
                                GRN_COLUMN_NAME_KEY,
                                GRN_COLUMN_NAME_KEY_LEN))) {
        if (grn_column_index(ctx, key, GRN_OP_MATCH, &index, 1, NULL)) {
          grn_select_optarg optarg;
          memset(&optarg, 0, sizeof(grn_select_optarg));
          optarg.mode = GRN_OP_SIMILAR;
          optarg.similarity_threshold = 0;
          optarg.max_size = 2;
          /* Merge (OR) the similar-search hits into the existing results. */
          grn_ii_select(ctx, (grn_ii *)index, TEXT_VALUE_LEN(query),
                        (grn_hash *)res, GRN_OP_OR, &optarg);
          grn_obj_unlink(ctx, index);
          GRN_QUERY_LOG(ctx, GRN_QUERY_LOG_SIZE,
                        ":", "similar(%d)", grn_table_size(ctx, res));
          {
            /*
             * Walk every record now in `res`: add
             * 1 + (freq2 >> 4) + boost to its score when boost >= 0, and
             * delete the record when that increment is below
             * frequency_threshold or boost is negative.
             */
            grn_hash_cursor *hc = grn_hash_cursor_open(ctx, (grn_hash *)res, NULL,
                                                       0, NULL, 0, 0, -1, 0);
            if (hc) {
              while (grn_hash_cursor_next(ctx, hc)) {
                /* NOTE(review): this `key` shadows the outer grn_obj *key. */
                void *key, *value;
                if (grn_hash_cursor_get_key_value(ctx, hc, &key, NULL, &value)) {
                  grn_id *rp;
                  /* The hash key is read as the item record ID. */
                  rp = key;
                  GRN_BULK_REWIND(&item_freq2);
                  GRN_BULK_REWIND(&item_boost);
                  grn_obj_get_value(ctx, items_freq2, *rp, &item_freq2);
                  grn_obj_get_value(ctx, items_boost, *rp, &item_boost);
                  if (GRN_INT32_VALUE(&item_boost) >= 0) {
                    double score;
                    grn_rset_recinfo *ri;
                    score = 1 +
                            (GRN_INT32_VALUE(&item_freq2) >> 4) +
                            GRN_INT32_VALUE(&item_boost);
                    ri = value;
                    /* The score is accumulated even if the record is deleted below. */
                    ri->score += score;
                    if (score >= frequency_threshold) { continue; }
                  }
                  /* score < frequency_threshold || item_boost < 0 */
                  grn_hash_cursor_delete(ctx, hc, NULL);
                }
              }
              grn_hash_cursor_close(ctx, hc);
            }
          }
          GRN_QUERY_LOG(ctx, GRN_QUERY_LOG_SIZE,
                        ":", "filter(%d)", grn_table_size(ctx, res));
          {
            /* exec _score -= edit_distance(_key, "query string") for all records */
            grn_obj *var;
            grn_obj *expr;

            GRN_EXPR_CREATE_FOR_QUERY(ctx, res, expr, var);
            if (expr) {
              grn_table_cursor *tc;
              grn_obj *score = grn_obj_column(ctx, res,
                                              GRN_COLUMN_NAME_SCORE,
                                              GRN_COLUMN_NAME_SCORE_LEN);
              /* NOTE(review): shadows the outer `key`; this one is the key of `res`. */
              grn_obj *key = grn_obj_column(ctx, res,
                                            GRN_COLUMN_NAME_KEY,
                                            GRN_COLUMN_NAME_KEY_LEN);
              /* Build: _score -= edit_distance(_key, query) */
              grn_expr_append_obj(ctx, expr,
                                  score,
                                  GRN_OP_GET_VALUE, 1);
              grn_expr_append_obj(ctx, expr,
                                  grn_ctx_get(ctx, CONST_STR_LEN("edit_distance")),
                                  GRN_OP_PUSH, 1);
              grn_expr_append_obj(ctx, expr,
                                  key,
                                  GRN_OP_GET_VALUE, 1);
              grn_expr_append_const(ctx, expr, query, GRN_OP_PUSH, 1);
              grn_expr_append_op(ctx, expr, GRN_OP_CALL, 2);
              grn_expr_append_op(ctx, expr, GRN_OP_MINUS_ASSIGN, 2);

              if ((tc = grn_table_cursor_open(ctx, res, NULL, 0, NULL, 0, 0, -1, 0))) {
                grn_id id;
                grn_obj score_value;
                GRN_FLOAT_INIT(&score_value, 0);
                while ((id = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) {
                  /* Run the expression for this record, then re-read its score. */
                  GRN_RECORD_SET(ctx, var, id);
                  grn_expr_exec(ctx, expr, 0);
                  GRN_BULK_REWIND(&score_value);
                  grn_obj_get_value(ctx, score, id, &score_value);
                  if (GRN_FLOAT_VALUE(&score_value) < frequency_threshold) {
                    grn_table_cursor_delete(ctx, tc);
                  }
                }
                grn_obj_unlink(ctx, &score_value);
                grn_table_cursor_close(ctx, tc);
              }
              grn_obj_unlink(ctx, score);
              grn_obj_unlink(ctx, key);
              grn_obj_unlink(ctx, expr);
            } else {
              /*
               * NOTE(review): "calicurating" is a misspelling of
               * "calculating"; it is runtime text, so it is left as is here.
               */
              ERR(GRN_UNKNOWN_ERROR,
                  "error on building expr. for calicurating edit distance");
            }
          }
        }
        /* Releases the outer `key` (the key accessor of `items`). */
        grn_obj_unlink(ctx, key);
      }
    }
    output(ctx, items, res, tid, sortby, output_columns, offset, limit);
    grn_obj_close(ctx, res);
  } else {
    ERR(GRN_UNKNOWN_ERROR, "cannot create temporary table.");
  }
  GRN_OBJ_FIN(ctx, &item_boost);
  GRN_OBJ_FIN(ctx, &item_freq2);
}
types = grn_parse_suggest_types(VAR(0)); + if (GRN_TEXT_LEN(VAR(6)) > 0) { + offset = grn_atoi(GRN_TEXT_VALUE(VAR(6)), GRN_BULK_CURR(VAR(6)), NULL); + } + if (GRN_TEXT_LEN(VAR(7)) > 0) { + limit = grn_atoi(GRN_TEXT_VALUE(VAR(7)), GRN_BULK_CURR(VAR(7)), NULL); + } + if (GRN_TEXT_LEN(VAR(8)) > 0) { + frequency_threshold = grn_atoi(GRN_TEXT_VALUE(VAR(8)), GRN_BULK_CURR(VAR(8)), NULL); + } + if (GRN_TEXT_LEN(VAR(9)) > 0) { + GRN_TEXT_PUTC(ctx, VAR(9), '\0'); + conditional_probability_threshold = strtod(GRN_TEXT_VALUE(VAR(9)), NULL); + } + + prefix_search_mode = parse_search_mode(ctx, VAR(10)); + similar_search_mode = parse_search_mode(ctx, VAR(11)); + + if ((items = grn_ctx_get(ctx, TEXT_VALUE_LEN(VAR(1))))) { + if ((items_boost = grn_obj_column(ctx, items, CONST_STR_LEN("boost")))) { + int n_outputs = 0; + if (types & COMPLETE) { + n_outputs++; + } + if (types & CORRECT) { + n_outputs++; + } + if (types & SUGGEST) { + n_outputs++; + } + GRN_OUTPUT_MAP_OPEN("RESULT_SET", n_outputs); + + if (types & COMPLETE) { + if ((col = grn_obj_column(ctx, items, TEXT_VALUE_LEN(VAR(2))))) { + GRN_OUTPUT_CSTR("complete"); + complete(ctx, items, items_boost, col, VAR(3), VAR(4), + VAR(5), offset, limit, + frequency_threshold, conditional_probability_threshold, + prefix_search_mode); + } else { + ERR(GRN_INVALID_ARGUMENT, "invalid column."); + } + } + if (types & CORRECT) { + GRN_OUTPUT_CSTR("correct"); + correct(ctx, items, items_boost, VAR(3), VAR(4), + VAR(5), offset, limit, + frequency_threshold, conditional_probability_threshold, + similar_search_mode); + } + if (types & SUGGEST) { + GRN_OUTPUT_CSTR("suggest"); + suggest(ctx, items, items_boost, VAR(3), VAR(4), + VAR(5), offset, limit, + frequency_threshold, conditional_probability_threshold); + } + GRN_OUTPUT_MAP_CLOSE(); + } else { + ERR(GRN_INVALID_ARGUMENT, "nonexistent column: <%.*s.boost>", + (int)GRN_TEXT_LEN(VAR(1)), GRN_TEXT_VALUE(VAR(1))); + } + grn_obj_unlink(ctx, items); + } else { + ERR(GRN_INVALID_ARGUMENT, 
"nonexistent table: <%.*s>", + (int)GRN_TEXT_LEN(VAR(1)), GRN_TEXT_VALUE(VAR(1))); + } + return NULL; +} + +static void +learner_init_values(grn_ctx *ctx, grn_suggest_learner *learner) +{ + learner->post_event_id = GRN_RECORD_VALUE(learner->post_event); + learner->post_type_id = GRN_RECORD_VALUE(learner->post_type); + learner->post_item_id = GRN_RECORD_VALUE(learner->post_item); + learner->seq_id = GRN_RECORD_VALUE(learner->seq); + learner->post_time_value = GRN_TIME_VALUE(learner->post_time); +} + +static void +learner_init(grn_ctx *ctx, grn_suggest_learner *learner, + grn_obj *post_event, grn_obj *post_type, grn_obj *post_item, + grn_obj *seq, grn_obj *post_time, grn_obj *pairs) +{ + learner->post_event = post_event; + learner->post_type = post_type; + learner->post_item = post_item; + learner->seq = seq; + learner->post_time = post_time; + learner->pairs = pairs; + + learner->learn_distance_in_seconds = 0; + + learner_init_values(ctx, learner); +} + +static void +learner_init_columns(grn_ctx *ctx, grn_suggest_learner *learner) +{ + grn_id events_id, event_types_id; + grn_obj *seqs, *events, *post_item, *items, *pairs; + + learner->seqs = seqs = grn_ctx_at(ctx, GRN_OBJ_GET_DOMAIN(learner->seq)); + learner->seqs_events = grn_obj_column(ctx, seqs, CONST_STR_LEN("events")); + + events_id = grn_obj_get_range(ctx, learner->seqs_events); + learner->events = events = grn_ctx_at(ctx, events_id); + learner->events_item = grn_obj_column(ctx, events, CONST_STR_LEN("item")); + learner->events_type = grn_obj_column(ctx, events, CONST_STR_LEN("type")); + learner->events_time = grn_obj_column(ctx, events, CONST_STR_LEN("time")); + + event_types_id = grn_obj_get_range(ctx, learner->events_type); + learner->event_types = grn_obj_column(ctx, events, CONST_STR_LEN("time")); + + post_item = learner->post_item; + learner->items = items = grn_ctx_at(ctx, GRN_OBJ_GET_DOMAIN(post_item)); + learner->items_freq = grn_obj_column(ctx, items, CONST_STR_LEN("freq")); + learner->items_freq2 = 
grn_obj_column(ctx, items, CONST_STR_LEN("freq2"));
+  learner->items_last = grn_obj_column(ctx, items, CONST_STR_LEN("last"));
+
+  pairs = learner->pairs;
+  learner->pairs_pre = grn_obj_column(ctx, pairs, CONST_STR_LEN("pre"));
+  learner->pairs_post = grn_obj_column(ctx, pairs, CONST_STR_LEN("post"));
+  learner->pairs_freq0 = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq0"));
+  learner->pairs_freq1 = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq1"));
+  learner->pairs_freq2 = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq2"));
+}
+
+/* Unlink every table and column opened by learner_init_columns().  The
+   `pairs` table itself is an argument object and is not unlinked here. */
+static void
+learner_fin_columns(grn_ctx *ctx, grn_suggest_learner *learner)
+{
+  grn_obj_unlink(ctx, learner->seqs);
+  grn_obj_unlink(ctx, learner->seqs_events);
+
+  grn_obj_unlink(ctx, learner->events);
+  grn_obj_unlink(ctx, learner->events_item);
+  grn_obj_unlink(ctx, learner->events_type);
+  grn_obj_unlink(ctx, learner->events_time);
+
+  grn_obj_unlink(ctx, learner->event_types);
+
+  grn_obj_unlink(ctx, learner->items);
+  grn_obj_unlink(ctx, learner->items_freq);
+  grn_obj_unlink(ctx, learner->items_freq2);
+  grn_obj_unlink(ctx, learner->items_last);
+
+  grn_obj_unlink(ctx, learner->pairs_pre);
+  grn_obj_unlink(ctx, learner->pairs_post);
+  grn_obj_unlink(ctx, learner->pairs_freq0);
+  grn_obj_unlink(ctx, learner->pairs_freq1);
+  grn_obj_unlink(ctx, learner->pairs_freq2);
+}
+
+/* Initialize learner->weight, the amount added by every frequency increment.
+   The default is 1; when the configuration table has a "weight" column and a
+   record keyed by the dataset name, that record's value is used instead. */
+static void
+learner_init_weight(grn_ctx *ctx, grn_suggest_learner *learner)
+{
+  grn_obj *weight_column = NULL;
+  unsigned int weight = 1;
+
+  if (learner->configuration) {
+    weight_column = grn_obj_column(ctx,
+                                   learner->configuration,
+                                   CONST_STR_LEN("weight"));
+  }
+  if (weight_column) {
+    grn_id id;
+    id = grn_table_get(ctx, learner->configuration,
+                       GRN_TEXT_VALUE(&(learner->dataset_name)),
+                       GRN_TEXT_LEN(&(learner->dataset_name)));
+    if (id != GRN_ID_NIL) {
+      grn_obj weight_value;
+      GRN_UINT32_INIT(&weight_value, 0);
+      grn_obj_get_value(ctx, weight_column, id, &weight_value);
+      weight = GRN_UINT32_VALUE(&weight_value);
+      GRN_OBJ_FIN(ctx, &weight_value); 
+    }
+    grn_obj_unlink(ctx, weight_column);
+  }
+
+  GRN_UINT32_INIT(&(learner->weight), 0);
+  GRN_UINT32_SET(ctx, &(learner->weight), weight);
+}
+
+/* Derive the dataset name from the name of the events table by dropping its
+   first strlen("event_") bytes.  The name is left empty when the table name
+   is not longer than that prefix.
+   NOTE(review): the bytes are skipped without checking that they really are
+   "event_" - presumably the table naming guarantees it; confirm. */
+static void
+learner_init_dataset_name(grn_ctx *ctx, grn_suggest_learner *learner)
+{
+  char events_name[GRN_TABLE_MAX_KEY_SIZE];
+  unsigned int events_name_size;
+  unsigned int events_name_prefix_size;
+
+  events_name_size = grn_obj_name(ctx, learner->events,
+                                  events_name, GRN_TABLE_MAX_KEY_SIZE);
+  GRN_TEXT_INIT(&(learner->dataset_name), 0);
+  events_name_prefix_size = strlen("event_");
+  if (events_name_size > events_name_prefix_size) {
+    GRN_TEXT_PUT(ctx,
+                 &(learner->dataset_name),
+                 events_name + events_name_prefix_size,
+                 events_name_size - events_name_prefix_size);
+  }
+}
+
+/* Release the dataset name buffer. */
+static void
+learner_fin_dataset_name(grn_ctx *ctx, grn_suggest_learner *learner)
+{
+  GRN_OBJ_FIN(ctx, &(learner->dataset_name));
+}
+
+/* Look up the object named "configuration"; the result may be NULL and is
+   checked for NULL by its users. */
+static void
+learner_init_configuration(grn_ctx *ctx, grn_suggest_learner *learner)
+{
+  learner->configuration = grn_ctx_get(ctx, "configuration", -1);
+}
+
+/* Unlink the configuration object if one was found. */
+static void
+learner_fin_configuration(grn_ctx *ctx, grn_suggest_learner *learner)
+{
+  if (learner->configuration) {
+    grn_obj_unlink(ctx, learner->configuration);
+  }
+}
+
+/* Set up the weight value and the pre_events record buffer, whose domain is
+   the events table. */
+static void
+learner_init_buffers(grn_ctx *ctx, grn_suggest_learner *learner)
+{
+  learner_init_weight(ctx, learner);
+  GRN_RECORD_INIT(&(learner->pre_events), 0, grn_obj_id(ctx, learner->events));
+}
+
+/* Release the buffers set up by learner_init_buffers(). */
+static void
+learner_fin_buffers(grn_ctx *ctx, grn_suggest_learner *learner)
+{
+  grn_obj_unlink(ctx, &(learner->weight));
+  grn_obj_unlink(ctx, &(learner->pre_events));
+}
+
+/* Prepare the state used while learning from a submit event: the pair key
+   prefix (post item id in the upper 32 bits), the pre_item record buffer and
+   the events already stored for this sequence. */
+static void
+learner_init_submit_learn(grn_ctx *ctx, grn_suggest_learner *learner)
+{
+  grn_id items_id;
+
+  learner->key_prefix = ((uint64_t)learner->post_item_id) << 32;
+
+  items_id = grn_obj_get_range(ctx, learner->events_item);
+  GRN_RECORD_INIT(&(learner->pre_item), 0, items_id);
+
+  grn_obj_get_value(ctx, learner->seqs_events, learner->seq_id,
+                    &(learner->pre_events));
+}
+
+static void 
+learner_fin_submit_learn(grn_ctx *ctx, grn_suggest_learner *learner)
+{
+  /* Counterpart of learner_init_submit_learn(): drop pre_item and empty
+     pre_events so that it can be reused by learner_append_post_event(). */
+  grn_obj_unlink(ctx, &(learner->pre_item));
+  GRN_BULK_REWIND(&(learner->pre_events));
+}
+
+/* The input is usable only when the event, item and sequence ids are all
+   non-zero. */
+static grn_bool
+learner_is_valid_input(grn_ctx *ctx, grn_suggest_learner *learner)
+{
+  return learner->post_event_id && learner->post_item_id && learner->seq_id;
+}
+
+/* Add learner->weight to `column` of record `record_id`. */
+static void
+learner_increment(grn_ctx *ctx, grn_suggest_learner *learner,
+                  grn_obj *column, grn_id record_id)
+{
+  grn_obj_set_value(ctx, column, record_id, &(learner->weight), GRN_OBJ_INCR);
+}
+
+/* Add learner->weight to `column` of the posted item's record. */
+static void
+learner_increment_item_freq(grn_ctx *ctx, grn_suggest_learner *learner,
+                            grn_obj *column)
+{
+  learner_increment(ctx, learner, column, learner->post_item_id);
+}
+
+/* Store the post time in the "last" column of the posted item. */
+static void
+learner_set_last_post_time(grn_ctx *ctx, grn_suggest_learner *learner)
+{
+  grn_obj_set_value(ctx, learner->items_last, learner->post_item_id,
+                    learner->post_time, GRN_OBJ_SET);
+}
+
+/* Walk the events already recorded for this sequence from the newest to the
+   oldest and count (previous item, posted item) pairs.  The walk stops at
+   the first event that is at least MIN_LEARN_DISTANCE older than the post
+   time (its distance in seconds is reported through
+   learner->learn_distance_in_seconds) or right after the first event whose
+   type value is non-zero. */
+static void
+learner_learn_for_complete_and_correcnt(grn_ctx *ctx,
+                                        grn_suggest_learner *learner)
+{
+  grn_obj *pre_item, *post_item, *pre_events;
+  grn_obj pre_type, pre_time;
+  grn_id *ep, *es;
+  uint64_t key;
+  int64_t post_time_value;
+
+  pre_item = &(learner->pre_item);
+  post_item = learner->post_item;
+  pre_events = &(learner->pre_events);
+  post_time_value = learner->post_time_value;
+  GRN_RECORD_INIT(&pre_type, 0, grn_obj_get_range(ctx, learner->events_type));
+  GRN_TIME_INIT(&pre_time, 0);
+  /* ep starts one past the last stored id, es is the first stored id. */
+  ep = (grn_id *)GRN_BULK_CURR(pre_events);
+  es = (grn_id *)GRN_BULK_HEAD(pre_events);
+  while (es < ep--) {
+    grn_id pair_id;
+    int added;
+    int64_t learn_distance;
+
+    GRN_BULK_REWIND(&pre_type);
+    GRN_BULK_REWIND(&pre_time);
+    GRN_BULK_REWIND(pre_item);
+    grn_obj_get_value(ctx, learner->events_type, *ep, &pre_type);
+    grn_obj_get_value(ctx, learner->events_time, *ep, &pre_time);
+    grn_obj_get_value(ctx, learner->events_item, *ep, pre_item);
+    learn_distance = post_time_value - GRN_TIME_VALUE(&pre_time);
+    if (learn_distance >= MIN_LEARN_DISTANCE) {
+ 
learner->learn_distance_in_seconds =
+        (int)(learn_distance / GRN_TIME_USEC_PER_SEC);
+      break;
+    }
+    /* Pair key: post item id in the upper 32 bits (key_prefix), previous
+       item id in the lower bits. */
+    key = learner->key_prefix + GRN_RECORD_VALUE(pre_item);
+    pair_id = grn_table_add(ctx, learner->pairs, &key, sizeof(uint64_t),
+                            &added);
+    if (added) {
+      grn_obj_set_value(ctx, learner->pairs_pre, pair_id, pre_item,
+                        GRN_OBJ_SET);
+      grn_obj_set_value(ctx, learner->pairs_post, pair_id, post_item,
+                        GRN_OBJ_SET);
+    }
+    /* A non-zero type counts into freq1 and ends the walk; a zero type
+       counts into freq0 and the walk continues with the older event. */
+    if (GRN_RECORD_VALUE(&pre_type)) {
+      learner_increment(ctx, learner, learner->pairs_freq1, pair_id);
+      break;
+    } else {
+      learner_increment(ctx, learner, learner->pairs_freq0, pair_id);
+    }
+  }
+  GRN_OBJ_FIN(ctx, &pre_type);
+  GRN_OBJ_FIN(ctx, &pre_time);
+}
+
+/* Tokenize the key of the posted item with the items table and count one
+   (token, posted item) pair in pairs.freq2 per distinct token.  The token
+   loop ends at the first zero token id or at a token equal to the posted
+   item itself. */
+static void
+learner_learn_for_suggest(grn_ctx *ctx, grn_suggest_learner *learner)
+{
+  char keybuf[GRN_TABLE_MAX_KEY_SIZE];
+  int keylen = grn_table_get_key(ctx, learner->items, learner->post_item_id,
+                                 keybuf, GRN_TABLE_MAX_KEY_SIZE);
+  unsigned int token_flags = 0;
+  grn_token_cursor *token_cursor =
+    grn_token_cursor_open(ctx, learner->items, keybuf, keylen,
+                          GRN_TOKEN_ADD, token_flags);
+  if (token_cursor) {
+    grn_id tid;
+    grn_obj *pre_item = &(learner->pre_item);
+    grn_obj *post_item = learner->post_item;
+    /* Created lazily on the first token; remembers the token ids already
+       counted during this call. */
+    grn_hash *token_ids = NULL;
+    while ((tid = grn_token_cursor_next(ctx, token_cursor)) && tid != learner->post_item_id) {
+      uint64_t key;
+      int added;
+      grn_id pair_id;
+      key = learner->key_prefix + tid;
+      pair_id = grn_table_add(ctx, learner->pairs, &key, sizeof(uint64_t),
+                              &added);
+      if (added) {
+        GRN_RECORD_SET(ctx, pre_item, tid);
+        grn_obj_set_value(ctx, learner->pairs_pre, pair_id,
+                          pre_item, GRN_OBJ_SET);
+        grn_obj_set_value(ctx, learner->pairs_post, pair_id,
+                          post_item, GRN_OBJ_SET);
+      }
+      if (!token_ids) {
+        token_ids = grn_hash_create(ctx, NULL, sizeof(grn_id), 0,
+                                    GRN_OBJ_TABLE_HASH_KEY|GRN_HASH_TINY);
+      }
+      if (token_ids) {
+        int token_added;
+        grn_hash_add(ctx, token_ids, &tid, sizeof(grn_id), NULL, &token_added);
+        if (token_added) {
+          learner_increment(ctx, 
learner, learner->pairs_freq2, pair_id);
+        }
+      }
+    }
+    if (token_ids) {
+      grn_hash_close(ctx, token_ids);
+    }
+    grn_token_cursor_close(ctx, token_cursor);
+  }
+}
+
+/* Append the posted event id to the events vector of the sequence.  The
+   pre_events buffer is reused as a one-element value for the append. */
+static void
+learner_append_post_event(grn_ctx *ctx, grn_suggest_learner *learner)
+{
+  GRN_RECORD_SET(ctx, &(learner->pre_events), learner->post_event_id);
+  grn_obj_set_value(ctx, learner->seqs_events, learner->seq_id,
+                    &(learner->pre_events), GRN_OBJ_APPEND);
+}
+
+/* Run one learning step.  Nothing happens for invalid input.  Otherwise the
+   item frequency and last post time are always updated; the pair learning
+   runs only when the post type id is non-zero; finally the event is appended
+   to its sequence.  Resources are released in the reverse order of their
+   initialization. */
+static void
+learner_learn(grn_ctx *ctx, grn_suggest_learner *learner)
+{
+  if (learner_is_valid_input(ctx, learner)) {
+    learner_init_columns(ctx, learner);
+    learner_init_dataset_name(ctx, learner);
+    learner_init_configuration(ctx, learner);
+    learner_init_buffers(ctx, learner);
+    learner_increment_item_freq(ctx, learner, learner->items_freq);
+    learner_set_last_post_time(ctx, learner);
+    if (learner->post_type_id) {
+      learner_init_submit_learn(ctx, learner);
+      learner_increment_item_freq(ctx, learner, learner->items_freq2);
+      learner_learn_for_complete_and_correcnt(ctx, learner);
+      learner_learn_for_suggest(ctx, learner);
+      learner_fin_submit_learn(ctx, learner);
+    }
+    learner_append_post_event(ctx, learner);
+    learner_fin_buffers(ctx, learner);
+    learner_fin_configuration(ctx, learner);
+    learner_fin_dataset_name(ctx, learner);
+    learner_fin_columns(ctx, learner);
+  }
+}
+
+/* Proc body of "suggest_preparer".  With exactly six arguments
+   (post_event, post_type, post_item, seq, post_time, pairs) it runs one
+   learning step; with any other count it learns nothing.  It returns a
+   UInt32 holding the learn distance in seconds (0 when none was recorded),
+   or NULL if the result object cannot be allocated. */
+static grn_obj *
+func_suggest_preparer(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  int learn_distance_in_seconds = 0;
+  grn_obj *obj;
+  if (nargs == 6) {
+    grn_obj *post_event = args[0];
+    grn_obj *post_type = args[1];
+    grn_obj *post_item = args[2];
+    grn_obj *seq = args[3];
+    grn_obj *post_time = args[4];
+    grn_obj *pairs = args[5];
+    grn_suggest_learner learner;
+    learner_init(ctx, &learner,
+                 post_event, post_type, post_item, seq, post_time, pairs);
+    learner_learn(ctx, &learner);
+    learn_distance_in_seconds = learner.learn_distance_in_seconds;
+  }
+  if ((obj = GRN_PROC_ALLOC(GRN_DB_UINT32, 0))) {
+    GRN_UINT32_SET(ctx, obj, 
learn_distance_in_seconds);
+  }
+  return obj;
+}
+
+/* Plugin entry point: nothing to set up. */
+grn_rc
+GRN_PLUGIN_INIT(grn_ctx *ctx)
+{
+  return GRN_SUCCESS;
+}
+
+/* Plugin entry point: register the "suggest" command with its twelve named
+   arguments and the "suggest_preparer" function.  Returns ctx->rc, so an
+   error raised during registration is reported to the caller. */
+grn_rc
+GRN_PLUGIN_REGISTER(grn_ctx *ctx)
+{
+  grn_expr_var vars[12];
+
+  grn_plugin_expr_var_init(ctx, &vars[0], "types", -1);
+  grn_plugin_expr_var_init(ctx, &vars[1], "table", -1);
+  grn_plugin_expr_var_init(ctx, &vars[2], "column", -1);
+  grn_plugin_expr_var_init(ctx, &vars[3], "query", -1);
+  grn_plugin_expr_var_init(ctx, &vars[4], "sortby", -1);
+  grn_plugin_expr_var_init(ctx, &vars[5], "output_columns", -1);
+  grn_plugin_expr_var_init(ctx, &vars[6], "offset", -1);
+  grn_plugin_expr_var_init(ctx, &vars[7], "limit", -1);
+  grn_plugin_expr_var_init(ctx, &vars[8], "frequency_threshold", -1);
+  grn_plugin_expr_var_init(ctx, &vars[9], "conditional_probability_threshold", -1);
+  grn_plugin_expr_var_init(ctx, &vars[10], "prefix_search", -1);
+  grn_plugin_expr_var_init(ctx, &vars[11], "similar_search", -1);
+  grn_plugin_command_create(ctx, "suggest", -1, command_suggest, 12, vars);
+
+  grn_proc_create(ctx, CONST_STR_LEN("suggest_preparer"), GRN_PROC_FUNCTION,
+                  func_suggest_preparer, NULL, NULL, 0, NULL);
+  return ctx->rc;
+}
+
+/* Plugin entry point: nothing to tear down. */
+grn_rc
+GRN_PLUGIN_FIN(grn_ctx *ctx)
+{
+  return GRN_SUCCESS;
+}
diff --git a/storage/mroonga/vendor/groonga/plugins/token_filters/CMakeLists.txt b/storage/mroonga/vendor/groonga/plugins/token_filters/CMakeLists.txt new file mode 100644 index 00000000..4aa7d09b --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/token_filters/CMakeLists.txt @@ -0,0 +1,63 @@ +# Copyright(C) 2014 Brazil +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License version 2.1 as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/../../lib + ) + +set(TOKEN_FILTERS_DIR "${GRN_RELATIVE_PLUGINS_DIR}/token_filters") + +read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/stop_word_sources.am + STOP_WORD_SOURCES) +set_source_files_properties(${STOP_WORD_SOURCES} + PROPERTIES + COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}") +if(GRN_EMBED) + add_library(stop_word_token_filter STATIC ${STOP_WORD_SOURCES}) + set_target_properties( + stop_word_token_filter + PROPERTIES + POSITION_INDEPENDENT_CODE ON) +else() + add_library(stop_word_token_filter MODULE ${STOP_WORD_SOURCES}) + set_target_properties(stop_word_token_filter PROPERTIES + PREFIX "" + OUTPUT_NAME "stop_word") + install(TARGETS stop_word_token_filter DESTINATION "${TOKEN_FILTERS_DIR}") +endif() +target_link_libraries(stop_word_token_filter libgroonga) + +if(GRN_WITH_LIBSTEMMER) + read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/stem_sources.am STEM_SOURCES) + include_directories(${LIBSTEMMER_INCLUDE_DIRS}) + link_directories(${LIBSTEMMER_LIBRARY_DIRS}) + set_source_files_properties(${STEM_SOURCES} + PROPERTIES + COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}") + if(GRN_EMBED) + add_library(stem_token_filter STATIC ${STEM_SOURCES}) + set_target_properties( + stem_token_filter + PROPERTIES + POSITION_INDEPENDENT_CODE ON) + else() + add_library(stem_token_filter MODULE ${STEM_SOURCES}) + set_target_properties(stem_token_filter PROPERTIES + PREFIX "" + OUTPUT_NAME "stem") + install(TARGETS stem_token_filter DESTINATION "${TOKEN_FILTERS_DIR}") + endif() + target_link_libraries(stem_token_filter libgroonga ${LIBSTEMMER_LIBRARIES}) +endif() diff --git a/storage/mroonga/vendor/groonga/plugins/token_filters/Makefile.am 
b/storage/mroonga/vendor/groonga/plugins/token_filters/Makefile.am new file mode 100644 index 00000000..c63bef7a --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/token_filters/Makefile.am @@ -0,0 +1,28 @@ +EXTRA_DIST = \ + CMakeLists.txt + +AM_CPPFLAGS = \ + -I$(top_builddir) \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/lib + +AM_LDFLAGS = \ + -avoid-version \ + -module \ + -no-undefined + +LIBS = \ + $(top_builddir)/lib/libgroonga.la + +token_filter_plugins_LTLIBRARIES = +token_filter_plugins_LTLIBRARIES += stop_word.la +if WITH_LIBSTEMMER +token_filter_plugins_LTLIBRARIES += stem.la +endif + +include stop_word_sources.am + +include stem_sources.am +stem_la_CPPFLAGS = $(AM_CPPFLAGS) $(LIBSTEMMER_CFLAGS) +stem_la_LIBADD = $(LIBS) $(LIBSTEMMER_LIBS) +stem_la_LDFLAGS = $(AM_LDFLAGS) $(LIBSTEMMER_LDFLAGS) diff --git a/storage/mroonga/vendor/groonga/plugins/token_filters/stem.c b/storage/mroonga/vendor/groonga/plugins/token_filters/stem.c new file mode 100644 index 00000000..2144eb09 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/token_filters/stem.c @@ -0,0 +1,279 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2014 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+*/
+
+#ifdef GRN_EMBEDDED
+#  define GRN_PLUGIN_FUNCTION_TAG token_filters_stem
+#endif
+
+#include <grn_str.h>
+
+#include <groonga.h>
+#include <groonga/token_filter.h>
+
+#include <ctype.h>
+#include <string.h>
+
+#include <libstemmer.h>
+
+/* Per-cursor state of the stem token filter. */
+typedef struct {
+  struct sb_stemmer *stemmer;  /* created in stem_init, deleted in stem_fin */
+  grn_tokenizer_token token;
+  grn_obj buffer;              /* scratch text used for all-uppercase tokens */
+} grn_stem_token_filter;
+
+/* Allocate the filter state and create an English / UTF_8 stemmer.
+   Returns NULL after raising a plugin error when the allocation or the
+   stemmer creation fails. */
+static void *
+stem_init(grn_ctx *ctx, grn_obj *table, grn_token_mode mode)
+{
+  grn_stem_token_filter *token_filter;
+
+  token_filter = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_stem_token_filter));
+  if (!token_filter) {
+    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                     "[token-filter][stem] "
+                     "failed to allocate grn_stem_token_filter");
+    return NULL;
+  }
+
+  {
+    /* TODO: Support other languages. */
+    const char *algorithm = "english";
+    const char *encoding = "UTF_8";
+    token_filter->stemmer = sb_stemmer_new(algorithm, encoding);
+    if (!token_filter->stemmer) {
+      GRN_PLUGIN_FREE(ctx, token_filter);
+      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                       "[token-filter][stem] "
+                       "failed to create stemmer: "
+                       "algorithm=<%s>, encoding=<%s>",
+                       algorithm, encoding);
+      return NULL;
+    }
+  }
+  grn_tokenizer_token_init(ctx, &(token_filter->token));
+  GRN_TEXT_INIT(&(token_filter->buffer), 0);
+
+  return token_filter;
+}
+
+/* Decide whether `data` should be stemmed: it must be a text value made only
+   of ASCII letters, digits, '-' and '\''.  *is_all_upper is set to GRN_TRUE
+   when the value has uppercase letters and no lowercase letter. */
+static grn_bool
+is_stemmable(grn_obj *data, grn_bool *is_all_upper)
+{
+  const char *current, *end;
+  grn_bool have_lower = GRN_FALSE;
+  grn_bool have_upper = GRN_FALSE;
+
+  *is_all_upper = GRN_FALSE;
+
+  switch (data->header.domain) {
+  case GRN_DB_SHORT_TEXT :
+  case GRN_DB_TEXT :
+  case GRN_DB_LONG_TEXT :
+    break;
+  default :
+    return GRN_FALSE;
+  }
+
+  current = GRN_TEXT_VALUE(data);
+  end = current + GRN_TEXT_LEN(data);
+
+  for (; current < end; current++) {
+    if 
(islower((unsigned char)*current)) {
+      have_lower = GRN_TRUE;
+      continue;
+    }
+    if (isupper((unsigned char)*current)) {
+      have_upper = GRN_TRUE;
+      continue;
+    }
+    if (isdigit((unsigned char)*current)) {
+      continue;
+    }
+    switch (*current) {
+    case '-' :
+    case '\'' :
+      break;
+    default :
+      return GRN_FALSE;
+    }
+  }
+
+  if (!have_lower && have_upper) {
+    *is_all_upper = GRN_TRUE;
+  }
+
+  return GRN_TRUE;
+}
+
+/* Append `string` to `normalized` with every uppercase ASCII letter turned
+   into lowercase.  Runs of unchanged bytes are copied in one piece. */
+static void
+normalize(grn_ctx *ctx,
+          const char *string, unsigned int length,
+          grn_obj *normalized)
+{
+  const char *current, *end;
+  const char *unwritten;
+
+  current = unwritten = string;
+  end = current + length;
+
+  for (; current < end; current++) {
+    if (isupper((unsigned char)*current)) {
+      if (current > unwritten) {
+        GRN_TEXT_PUT(ctx, normalized, unwritten, current - unwritten);
+      }
+      GRN_TEXT_PUTC(ctx, normalized, tolower((unsigned char)*current));
+      unwritten = current + 1;
+    }
+  }
+
+  if (current != unwritten) {
+    GRN_TEXT_PUT(ctx, normalized, unwritten, current - unwritten);
+  }
+}
+
+/* Inverse of normalize(): append `string` to `normalized` with every
+   lowercase ASCII letter turned into uppercase. */
+static void
+unnormalize(grn_ctx *ctx,
+            const char *string, unsigned int length,
+            grn_obj *normalized)
+{
+  const char *current, *end;
+  const char *unwritten;
+
+  current = unwritten = string;
+  end = current + length;
+
+  for (; current < end; current++) {
+    if (islower((unsigned char)*current)) {
+      if (current > unwritten) {
+        GRN_TEXT_PUT(ctx, normalized, unwritten, current - unwritten);
+      }
+      GRN_TEXT_PUTC(ctx, normalized, toupper((unsigned char)*current));
+      unwritten = current + 1;
+    }
+  }
+
+  if (current != unwritten) {
+    GRN_TEXT_PUT(ctx, normalized, unwritten, current - unwritten);
+  }
+}
+
+/* Replace the data of `next_token` with the stem of `current_token`.
+   Tokens are passed through untouched when the filter state is missing,
+   when the context encoding is not UTF-8, or when is_stemmable() rejects
+   the token. */
+static void
+stem_filter(grn_ctx *ctx,
+            grn_token *current_token,
+            grn_token *next_token,
+            void *user_data)
+{
+  grn_stem_token_filter *token_filter = user_data;
+  grn_obj *data;
+  grn_bool is_all_upper = GRN_FALSE;
+
+  /* stem_init() returns NULL on failure; do not dereference it below.
+     stem_fin() guards against the same case. */
+  if (!token_filter) {
+    return;
+  }
+
+  if (GRN_CTX_GET_ENCODING(ctx) != GRN_ENC_UTF8) {
+    return;
+  }
+
+  data = grn_token_get_data(ctx, current_token);
+  if (!is_stemmable(data, 
&is_all_upper)) {
+    return;
+  }
+
+  {
+    const sb_symbol *stemmed;
+
+    if (is_all_upper) {
+      /* All-uppercase token: lowercase it into the scratch buffer, stem
+         that, then uppercase the stem back into the same buffer. */
+      grn_obj *buffer;
+      buffer = &(token_filter->buffer);
+      GRN_BULK_REWIND(buffer);
+      normalize(ctx,
+                GRN_TEXT_VALUE(data),
+                GRN_TEXT_LEN(data),
+                buffer);
+      stemmed = sb_stemmer_stem(token_filter->stemmer,
+                                GRN_TEXT_VALUE(buffer), GRN_TEXT_LEN(buffer));
+      if (stemmed) {
+        GRN_BULK_REWIND(buffer);
+        unnormalize(ctx,
+                    stemmed,
+                    sb_stemmer_length(token_filter->stemmer),
+                    buffer);
+        grn_token_set_data(ctx, next_token,
+                           GRN_TEXT_VALUE(buffer), GRN_TEXT_LEN(buffer));
+      } else {
+        GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                         "[token-filter][stem] "
+                         "failed to allocate memory for stemmed word: <%.*s> "
+                         "(normalized: <%.*s>)",
+                         (int)GRN_TEXT_LEN(data), GRN_TEXT_VALUE(data),
+                         (int)GRN_TEXT_LEN(buffer), GRN_TEXT_VALUE(buffer));
+      }
+    } else {
+      /* Otherwise the token bytes are stemmed as they are. */
+      stemmed = sb_stemmer_stem(token_filter->stemmer,
+                                GRN_TEXT_VALUE(data), GRN_TEXT_LEN(data));
+      if (stemmed) {
+        grn_token_set_data(ctx, next_token,
+                           stemmed,
+                           sb_stemmer_length(token_filter->stemmer));
+      } else {
+        GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                         "[token-filter][stem] "
+                         "failed to allocate memory for stemmed word: <%.*s>",
+                         (int)GRN_TEXT_LEN(data), GRN_TEXT_VALUE(data));
+      }
+    }
+  }
+}
+
+/* Release everything stem_init() created.  A NULL state (failed init) is
+   accepted and ignored. */
+static void
+stem_fin(grn_ctx *ctx, void *user_data)
+{
+  grn_stem_token_filter *token_filter = user_data;
+  if (!token_filter) {
+    return;
+  }
+
+  grn_tokenizer_token_fin(ctx, &(token_filter->token));
+  if (token_filter->stemmer) {
+    sb_stemmer_delete(token_filter->stemmer);
+  }
+  GRN_OBJ_FIN(ctx, &(token_filter->buffer));
+  GRN_PLUGIN_FREE(ctx, token_filter);
+}
+
+/* Plugin entry point: nothing to set up; reports the current ctx->rc. */
+grn_rc
+GRN_PLUGIN_INIT(grn_ctx *ctx)
+{
+  return ctx->rc;
+}
+
+/* Plugin entry point: register the filter under the name "TokenFilterStem". */
+grn_rc
+GRN_PLUGIN_REGISTER(grn_ctx *ctx)
+{
+  grn_rc rc;
+
+  rc = grn_token_filter_register(ctx,
+                                 "TokenFilterStem", -1,
+                                 stem_init,
+                                 stem_filter,
+                                 stem_fin);
+
+  return rc;
+}
+
+/* Plugin entry point: nothing to tear down. */
+grn_rc
+GRN_PLUGIN_FIN(grn_ctx *ctx)
+{
+  return GRN_SUCCESS;
+}
diff --git 
a/storage/mroonga/vendor/groonga/plugins/token_filters/stem_sources.am b/storage/mroonga/vendor/groonga/plugins/token_filters/stem_sources.am new file mode 100644 index 00000000..d02a3952 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/token_filters/stem_sources.am @@ -0,0 +1,2 @@ +stem_la_SOURCES = \ + stem.c diff --git a/storage/mroonga/vendor/groonga/plugins/token_filters/stop_word.c b/storage/mroonga/vendor/groonga/plugins/token_filters/stop_word.c new file mode 100644 index 00000000..a06d772f --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/token_filters/stop_word.c @@ -0,0 +1,159 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2014 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+*/
+
+#ifdef GRN_EMBEDDED
+#  define GRN_PLUGIN_FUNCTION_TAG token_filters_stop_word
+#endif
+
+#include <grn_str.h>
+
+#include <groonga.h>
+#include <groonga/token_filter.h>
+
+#include <string.h>
+
+/* Name of the boolean column, on the lexicon table, that marks stop words. */
+#define COLUMN_NAME "is_stop_word"
+
+/* Per-cursor state of the stop word token filter. */
+typedef struct {
+  grn_obj *table;             /* lexicon the filter is attached to */
+  grn_token_mode mode;
+  grn_obj *column;            /* table.is_stop_word, opened in init */
+  grn_obj value;              /* reusable bool buffer for column reads */
+  grn_tokenizer_token token;
+} grn_stop_word_token_filter;
+
+/* Create the filter state.  The filter is active only in GRN_TOKEN_GET
+   mode: for every other mode NULL is returned without raising an error, and
+   stop_word_filter() treats a NULL state as "do nothing".  NULL is also
+   returned, with a plugin error, when the allocation fails or the table has
+   no COLUMN_NAME column. */
+static void *
+stop_word_init(grn_ctx *ctx, grn_obj *table, grn_token_mode mode)
+{
+  grn_stop_word_token_filter *token_filter;
+
+  if (mode != GRN_TOKEN_GET) {
+    return NULL;
+  }
+
+  token_filter = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_stop_word_token_filter));
+  if (!token_filter) {
+    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                     "[token-filter][stop-word] "
+                     "failed to allocate grn_stop_word_token_filter");
+    return NULL;
+  }
+
+  token_filter->table = table;
+  token_filter->mode = mode;
+  token_filter->column = grn_obj_column(ctx,
+                                        token_filter->table,
+                                        COLUMN_NAME,
+                                        strlen(COLUMN_NAME));
+  if (!token_filter->column) {
+    char table_name[GRN_TABLE_MAX_KEY_SIZE];
+    unsigned int table_name_size;
+
+    table_name_size = grn_obj_name(ctx,
+                                   token_filter->table,
+                                   table_name,
+                                   GRN_TABLE_MAX_KEY_SIZE);
+    /* The "%.*s" precision argument must be an int, hence the cast. */
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKEN_FILTER_ERROR,
+                     "[token-filter][stop-word] "
+                     "column for judging stop word doesn't exist: <%.*s.%s>",
+                     (int)table_name_size,
+                     table_name,
+                     COLUMN_NAME);
+    GRN_PLUGIN_FREE(ctx, token_filter);
+    return NULL;
+  }
+
+  GRN_BOOL_INIT(&(token_filter->value), 0);
+  grn_tokenizer_token_init(ctx, &(token_filter->token));
+
+  return token_filter;
+}
+
+static void
+stop_word_filter(grn_ctx *ctx,
+                 grn_token *current_token,
+                 grn_token *next_token,
+                 void *user_data)
+{
+  grn_stop_word_token_filter *token_filter = user_data;
+  grn_id id;
+ 
grn_obj *data;
+
+  /* No state means the filter is inactive (see stop_word_init). */
+  if (!token_filter) {
+    return;
+  }
+
+  /* Look the token up in the lexicon; when its is_stop_word value is true,
+     add GRN_TOKEN_SKIP to the status passed on to the next token. */
+  data = grn_token_get_data(ctx, current_token);
+  id = grn_table_get(ctx,
+                     token_filter->table,
+                     GRN_TEXT_VALUE(data),
+                     GRN_TEXT_LEN(data));
+  if (id != GRN_ID_NIL) {
+    GRN_BULK_REWIND(&(token_filter->value));
+    grn_obj_get_value(ctx,
+                      token_filter->column,
+                      id,
+                      &(token_filter->value));
+    if (GRN_BOOL_VALUE(&(token_filter->value))) {
+      grn_tokenizer_status status;
+      status = grn_token_get_status(ctx, current_token);
+      status |= GRN_TOKEN_SKIP;
+      grn_token_set_status(ctx, next_token, status);
+    }
+  }
+}
+
+/* Release everything stop_word_init() created.  A NULL state is accepted
+   and ignored. */
+static void
+stop_word_fin(grn_ctx *ctx, void *user_data)
+{
+  grn_stop_word_token_filter *token_filter = user_data;
+  if (!token_filter) {
+    return;
+  }
+
+  grn_tokenizer_token_fin(ctx, &(token_filter->token));
+  grn_obj_unlink(ctx, token_filter->column);
+  grn_obj_unlink(ctx, &(token_filter->value));
+  GRN_PLUGIN_FREE(ctx, token_filter);
+}
+
+/* Plugin entry point: nothing to set up; reports the current ctx->rc. */
+grn_rc
+GRN_PLUGIN_INIT(grn_ctx *ctx)
+{
+  return ctx->rc;
+}
+
+/* Plugin entry point: register the filter under the name
+   "TokenFilterStopWord". */
+grn_rc
+GRN_PLUGIN_REGISTER(grn_ctx *ctx)
+{
+  grn_rc rc;
+
+  rc = grn_token_filter_register(ctx,
+                                 "TokenFilterStopWord", -1,
+                                 stop_word_init,
+                                 stop_word_filter,
+                                 stop_word_fin);
+
+  return rc;
+}
+
+/* Plugin entry point: nothing to tear down. */
+grn_rc
+GRN_PLUGIN_FIN(grn_ctx *ctx)
+{
+  return GRN_SUCCESS;
+}
diff --git a/storage/mroonga/vendor/groonga/plugins/token_filters/stop_word_sources.am b/storage/mroonga/vendor/groonga/plugins/token_filters/stop_word_sources.am new file mode 100644 index 00000000..bab89551 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/token_filters/stop_word_sources.am @@ -0,0 +1,2 @@ +stop_word_la_SOURCES = \ + stop_word.c diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/CMakeLists.txt b/storage/mroonga/vendor/groonga/plugins/tokenizers/CMakeLists.txt new file mode 100644 index 00000000..26aadc4e --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/CMakeLists.txt @@ -0,0 +1,76 @@ +# Copyright(C) 2012-2013 Brazil +# +# This library is free software; you can redistribute 
it and/or +# modify it under the terms of the GNU Lesser General Public +# License version 2.1 as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/../../lib + ) + +set(TOKENIZERS_DIR "${GRN_RELATIVE_PLUGINS_DIR}/tokenizers") +if(GRN_WITH_MECAB) + read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/mecab_sources.am MECAB_SOURCES) + include_directories(${MECAB_INCLUDE_DIRS}) + link_directories(${MECAB_LIBRARY_DIRS}) + if(GRN_WITH_BUNDLED_MECAB) + set(GRN_BUNDLED_MECAB_RELATIVE_RC_PATH "${CONFIG_DIR}/mecabrc") + set(MECAB_COMPILE_DEFINITIONS + "GRN_WITH_BUNDLED_MECAB" + "GRN_BUNDLED_MECAB_RELATIVE_RC_PATH=\"${GRN_BUNDLED_MECAB_RELATIVE_RC_PATH}\"" + "GRN_BUNDLED_MECAB_RC_PATH=\"${CMAKE_INSTALL_PREFIX}/${GRN_BUNDLED_MECAB_RELATIVE_RC_PATH}\"") + set_source_files_properties(${MECAB_SOURCES} + PROPERTIES + COMPILE_DEFINITIONS + "${MECAB_COMPILE_DEFINITIONS}") + endif() + set_source_files_properties(${MECAB_SOURCES} + PROPERTIES + COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}") + if(GRN_EMBED) + add_library(mecab_tokenizer STATIC ${MECAB_SOURCES}) + set_target_properties( + mecab_tokenizer + PROPERTIES + POSITION_INDEPENDENT_CODE ON) + else() + add_library(mecab_tokenizer MODULE ${MECAB_SOURCES}) + set_target_properties(mecab_tokenizer PROPERTIES + PREFIX "" + OUTPUT_NAME "mecab") + install(TARGETS mecab_tokenizer DESTINATION "${TOKENIZERS_DIR}") + endif() + target_link_libraries(mecab_tokenizer libgroonga ${MECAB_LIBRARIES}) +endif() + +if(GRN_WITH_KYTEA) + 
read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/kytea_sources.am KYTEA_SOURCES) + include_directories(${KYTEA_INCLUDE_DIRS}) + link_directories(${KYTEA_LIBRARY_DIRS}) + set_source_files_properties(${KYTEA_SOURCES} + PROPERTIES + COMPILE_FLAGS "${GRN_CXX_COMPILE_FLAGS}") + if(GRN_EMBED) + add_library(kytea_tokenizer STATIC ${KYTEA_SOURCES}) + set_target_properties( + kytea_tokenizer + PROPERTIES + POSITION_INDEPENDENT_CODE ON) + else() + add_library(kytea_tokenizer MODULE ${KYTEA_SOURCES}) + set_target_properties(kytea_tokenizer PROPERTIES + PREFIX "" + OUTPUT_NAME "kytea") + install(TARGETS kytea_tokenizer DESTINATION "${TOKENIZERS_DIR}") + endif() + target_link_libraries(kytea_tokenizer libgroonga ${KYTEA_LIBRARIES}) +endif() diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/Makefile.am b/storage/mroonga/vendor/groonga/plugins/tokenizers/Makefile.am new file mode 100644 index 00000000..9e10612b --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/Makefile.am @@ -0,0 +1,33 @@ +EXTRA_DIST = \ + CMakeLists.txt + +AM_CPPFLAGS = \ + -I$(top_builddir) \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/lib + +AM_LDFLAGS = \ + -avoid-version \ + -module \ + -no-undefined + +LIBS = \ + $(top_builddir)/lib/libgroonga.la + +tokenizer_plugins_LTLIBRARIES = +if WITH_MECAB +tokenizer_plugins_LTLIBRARIES += mecab.la +endif +if WITH_KYTEA +tokenizer_plugins_LTLIBRARIES += kytea.la +endif + +include mecab_sources.am +mecab_la_CPPFLAGS = $(AM_CPPFLAGS) $(MECAB_CPPFLAGS) +mecab_la_LIBADD = $(LIBS) $(MECAB_LIBS) +mecab_la_LDFLAGS = $(AM_LDFLAGS) $(MECAB_LDFLAGS) + +include kytea_sources.am +kytea_la_CPPFLAGS = $(AM_CPPFLAGS) $(KYTEA_CFLAGS) +kytea_la_LIBADD = $(LIBS) $(KYTEA_LIBS) +kytea_la_LDFLAGS = $(AM_LDFLAGS) $(KYTEA_LDFLAGS) diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea.cpp b/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea.cpp new file mode 100644 index 00000000..76d827c0 --- /dev/null +++ 
b/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea.cpp @@ -0,0 +1,358 @@
+/* -*- c-basic-offset: 2 -*- */
+/* Copyright(C) 2012 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+*/
+
+#ifdef GRN_EMBEDDED
+#  define GRN_PLUGIN_FUNCTION_TAG tokenizers_kytea
+#endif
+
+#include <groonga/tokenizer.h>
+
+#include <kytea/kytea.h>
+#include <kytea/string-util.h>
+
+#include <string.h>
+
+#include <string>
+#include <vector>
+
+namespace {
+
+/* File-local state of the tokenizer plugin.  All four pointers are NULL
+   until kytea_init() succeeds and are reset to NULL by kytea_fin(). */
+grn_plugin_mutex *kytea_mutex = NULL;
+kytea::KyteaConfig *kytea_config = NULL;
+kytea::Kytea *kytea_tagger = NULL;
+kytea::StringUtil *kytea_util = NULL;
+
+void kytea_init(grn_ctx *ctx);
+void kytea_fin(grn_ctx *ctx);
+
+/* Create the mutex, the KyTea configuration and the tagger, load the model
+   and fetch the string utility.  The KyTea objects are built with placement
+   new in memory obtained from GRN_PLUGIN_MALLOC.  On any failure
+   kytea_fin() undoes what was already set up, a plugin error is raised and
+   the function returns early. */
+void kytea_init(grn_ctx *ctx) {
+  if (kytea_mutex || kytea_config || kytea_tagger || kytea_util) {
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer][kytea] "
+                     "TokenKytea is already initialized");
+    return;
+  }
+
+  kytea_mutex = grn_plugin_mutex_open(ctx);
+  if (!kytea_mutex) {
+    kytea_fin(ctx);
+    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                     "[tokenizer][kytea] "
+                     "grn_plugin_mutex_open() failed");
+    return;
+  }
+
+  kytea::KyteaConfig * const config = static_cast<kytea::KyteaConfig *>(
+    GRN_PLUGIN_MALLOC(ctx, sizeof(kytea::KyteaConfig)));
+  if (!config) {
+    kytea_fin(ctx);
+    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                     "[tokenizer][kytea] "
+                     "memory allocation to kytea::KyteaConfig failed");
+    return;
+  }
+
+  try {
+    new (config) kytea::KyteaConfig;
+    /* From here on kytea_fin() owns the object (destructor + free). */
+    kytea_config = config;
+    try {
+      kytea_config->setDebug(0);
+      kytea_config->setOnTraining(false);
+      kytea_config->parseRunCommandLine(0, NULL);
+    } catch (...) {
+      kytea_fin(ctx);
+      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                       "[tokenizer][kytea] "
+                       "kytea::KyteaConfig settings failed");
+      return;
+    }
+  } catch (...) {
+    /* Only the constructor can reach this handler, so kytea_config was not
+       assigned yet and the raw memory has to be freed here. */
+    GRN_PLUGIN_FREE(ctx, config);
+    kytea_fin(ctx);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer][kytea] "
+                     "kytea::KyteaConfig initialization failed");
+    return;
+  }
+
+  kytea::Kytea * const tagger = static_cast<kytea::Kytea *>(
+    GRN_PLUGIN_MALLOC(ctx, sizeof(kytea::Kytea)));
+  if (!tagger) {
+    kytea_fin(ctx);
+    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                     "[tokenizer][kytea] "
+                     "memory allocation to kytea::Kytea failed");
+    return;
+  }
+
+  try {
+    new (tagger) kytea::Kytea;
+    /* Same ownership hand-over as for the configuration above. */
+    kytea_tagger = tagger;
+    try {
+      kytea_tagger->readModel(kytea_config->getModelFile().c_str());
+    } catch (...) {
+      kytea_fin(ctx);
+      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                       "[tokenizer][kytea] "
+                       "kytea::Kytea::readModel() failed");
+      return;
+    }
+  } catch (...) {
+    GRN_PLUGIN_FREE(ctx, tagger);
+    kytea_fin(ctx);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer][kytea] "
+                     "kytea::Kytea initialization failed");
+    return;
+  }
+
+  try {
+    kytea_util = kytea_tagger->getStringUtil();
+  } catch (...) 
{ + kytea_fin(ctx); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "kytea::Kytea::getStringUtil() failed"); + return; + } +} + +void kytea_fin(grn_ctx *ctx) { + kytea_util = NULL; + + if (kytea_tagger) { + kytea_tagger->~Kytea(); + GRN_PLUGIN_FREE(ctx, kytea_tagger); + kytea_tagger = NULL; + } + + if (kytea_config) { + kytea_config->~KyteaConfig(); + GRN_PLUGIN_FREE(ctx, kytea_config); + kytea_config = NULL; + } + + if (kytea_mutex) { + grn_plugin_mutex_close(ctx, kytea_mutex); + kytea_mutex = NULL; + } +} + +struct grn_tokenizer_kytea { + grn_tokenizer_query *query; + kytea::KyteaSentence sentence; + std::vector<std::string> tokens; + std::size_t id; + grn_tokenizer_token token; + const char *rest_query_string; + unsigned int rest_query_string_length; + + grn_tokenizer_kytea() : + query(NULL), + sentence(), + tokens(), + id(0), + token(), + rest_query_string(NULL) + { + } + ~grn_tokenizer_kytea() {} +}; + +void grn_tokenizer_kytea_init(grn_ctx *ctx, grn_tokenizer_kytea *tokenizer) { + new (tokenizer) grn_tokenizer_kytea; + grn_tokenizer_token_init(ctx, &tokenizer->token); +} + +void grn_tokenizer_kytea_fin(grn_ctx *ctx, grn_tokenizer_kytea *tokenizer) { + grn_tokenizer_token_fin(ctx, &tokenizer->token); + if (tokenizer->query) { + grn_tokenizer_query_close(ctx, tokenizer->query); + } + tokenizer->~grn_tokenizer_kytea(); +} + +grn_obj *grn_kytea_init(grn_ctx *ctx, int num_args, grn_obj **args, + grn_user_data *user_data) { + unsigned int normalizer_flags = 0; + grn_tokenizer_query * const query = + grn_tokenizer_query_open(ctx, num_args, args, normalizer_flags); + if (!query) { + return NULL; + } + + grn_tokenizer_kytea * const tokenizer = static_cast<grn_tokenizer_kytea *>( + GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_kytea))); + if (!tokenizer) { + grn_tokenizer_query_close(ctx, query); + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[tokenizer][kytea] " + "memory allocation to grn_tokenizer_kytea failed"); + return NULL; + } + + try 
{ + grn_tokenizer_kytea_init(ctx, tokenizer); + } catch (...) { + grn_tokenizer_query_close(ctx, query); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "tokenizer initialization failed"); + return NULL; + } + + tokenizer->query = query; + + grn_obj *normalized_query = query->normalized_query; + const char *normalized_string; + unsigned int normalized_string_length; + grn_string_get_normalized(ctx, + normalized_query, + &normalized_string, + &normalized_string_length, + NULL); + if (tokenizer->query->have_tokenized_delimiter) { + tokenizer->rest_query_string = normalized_string; + tokenizer->rest_query_string_length = normalized_string_length; + } else { + grn_plugin_mutex_lock(ctx, kytea_mutex); + try { + const std::string str(normalized_string, normalized_string_length); + const kytea::KyteaString &surface_str = kytea_util->mapString(str); + const kytea::KyteaString &normalized_str = kytea_util->normalize(surface_str); + tokenizer->sentence = kytea::KyteaSentence(surface_str, normalized_str); + kytea_tagger->calculateWS(tokenizer->sentence); + } catch (...) { + grn_plugin_mutex_unlock(ctx, kytea_mutex); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "tokenization failed"); + return NULL; + } + grn_plugin_mutex_unlock(ctx, kytea_mutex); + + try { + for (std::size_t i = 0; i < tokenizer->sentence.words.size(); ++i) { + const std::string &token = + kytea_util->showString(tokenizer->sentence.words[i].surface); + const char *ptr = token.c_str(); + unsigned int left = static_cast<unsigned int>(token.length()); + while (left > 0) { + const int char_length = + grn_tokenizer_charlen(ctx, ptr, left, query->encoding); + if ((char_length == 0) || + (grn_tokenizer_isspace(ctx, ptr, left, query->encoding) != 0)) { + break; + } + ptr += char_length; + left -= char_length; + } + if (left == 0) { + tokenizer->tokens.push_back(token); + } + } + } catch (...) 
{ + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][kytea] " + "adjustment failed"); + return NULL; + } + } + + user_data->ptr = tokenizer; + return NULL; +} + +grn_obj *grn_kytea_next(grn_ctx *ctx, int num_args, grn_obj **args, + grn_user_data *user_data) { + grn_tokenizer_kytea * const tokenizer = + static_cast<grn_tokenizer_kytea *>(user_data->ptr); + + if (tokenizer->query->have_tokenized_delimiter) { + unsigned int rest_query_string_length = + tokenizer->rest_query_string_length; + const char *rest_query_string = + grn_tokenizer_tokenized_delimiter_next(ctx, + &(tokenizer->token), + tokenizer->rest_query_string, + rest_query_string_length, + tokenizer->query->encoding); + if (rest_query_string) { + tokenizer->rest_query_string_length -= + rest_query_string - tokenizer->rest_query_string; + } + tokenizer->rest_query_string = rest_query_string; + } else { + const grn_tokenizer_status status = + ((tokenizer->id + 1) < tokenizer->tokens.size()) ? + GRN_TOKENIZER_CONTINUE : GRN_TOKENIZER_LAST; + if (tokenizer->id < tokenizer->tokens.size()) { + const std::string &token = tokenizer->tokens[tokenizer->id++]; + grn_tokenizer_token_push(ctx, &tokenizer->token, + token.c_str(), token.length(), status); + } else { + grn_tokenizer_token_push(ctx, &tokenizer->token, "", 0, status); + } + } + + return NULL; +} + +grn_obj *grn_kytea_fin(grn_ctx *ctx, int num_args, grn_obj **args, + grn_user_data *user_data) { + grn_tokenizer_kytea * const tokenizer = + static_cast<grn_tokenizer_kytea *>(user_data->ptr); + if (tokenizer) { + grn_tokenizer_kytea_fin(ctx, tokenizer); + GRN_PLUGIN_FREE(ctx, tokenizer); + } + return NULL; +} + +} // namespace + +extern "C" { + +/* + GRN_PLUGIN_INIT() is called to initialize this plugin. Note that an error + code must be set in `ctx->rc' on failure. + */ +grn_rc GRN_PLUGIN_INIT(grn_ctx *ctx) { + kytea_init(ctx); + return ctx->rc; +} + +/* + GRN_PLUGIN_REGISTER() registers this plugin to the database associated with + `ctx'. 
The registration requires the plugin name and the functions to be + called for tokenization. + */ +grn_rc GRN_PLUGIN_REGISTER(grn_ctx *ctx) { + return grn_tokenizer_register(ctx, "TokenKytea", 10, grn_kytea_init, + grn_kytea_next, grn_kytea_fin); +} + +/* + GRN_PLUGIN_FIN() is called to finalize the plugin that was initialized by + GRN_PLUGIN_INIT(). + */ +grn_rc GRN_PLUGIN_FIN(grn_ctx *ctx) { + kytea_fin(ctx); + return GRN_SUCCESS; +} + +} // extern "C" diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea_sources.am b/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea_sources.am new file mode 100644 index 00000000..182f3857 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/kytea_sources.am @@ -0,0 +1,2 @@ +kytea_la_SOURCES = \ + kytea.cpp diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c new file mode 100644 index 00000000..cabf2c94 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab.c @@ -0,0 +1,660 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2009-2016 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA +*/ + +#ifdef GRN_EMBEDDED +# define GRN_PLUGIN_FUNCTION_TAG tokenizers_mecab +#endif + +#include <grn_str.h> + +#include <groonga.h> +#include <groonga/tokenizer.h> + +#include <mecab.h> + +#include <stdlib.h> +#include <string.h> +#include <ctype.h> + +static unsigned int sole_mecab_init_counter = 0; +static mecab_t *sole_mecab = NULL; +static grn_plugin_mutex *sole_mecab_mutex = NULL; +static grn_encoding sole_mecab_encoding = GRN_ENC_NONE; + +static grn_bool grn_mecab_chunked_tokenize_enabled = GRN_FALSE; +static int grn_mecab_chunk_size_threshold = 8192; + +typedef struct { + mecab_t *mecab; + grn_obj buf; + const char *next; + const char *end; + grn_tokenizer_query *query; + grn_tokenizer_token token; +} grn_mecab_tokenizer; + +static const char * +mecab_global_error_message(void) +{ + double version; + + version = atof(mecab_version()); + /* MeCab <= 0.993 doesn't support mecab_strerror(NULL). 
*/ + if (version <= 0.993) { + return "Unknown"; + } + + return mecab_strerror(NULL); +} + + +static grn_encoding +translate_mecab_charset_to_grn_encoding(const char *charset) +{ + if (grn_strcasecmp(charset, "euc-jp") == 0) { + return GRN_ENC_EUC_JP; + } else if (grn_strcasecmp(charset, "utf-8") == 0 || + grn_strcasecmp(charset, "utf8") == 0) { + return GRN_ENC_UTF8; + } else if (grn_strcasecmp(charset, "shift_jis") == 0 || + grn_strcasecmp(charset, "shift-jis") == 0 || + grn_strcasecmp(charset, "sjis") == 0) { + return GRN_ENC_SJIS; + } + return GRN_ENC_NONE; +} + +static grn_encoding +get_mecab_encoding(mecab_t *mecab) +{ + grn_encoding encoding = GRN_ENC_NONE; + const mecab_dictionary_info_t *dictionary_info; + dictionary_info = mecab_dictionary_info(mecab); + if (dictionary_info) { + const char *charset = dictionary_info->charset; + encoding = translate_mecab_charset_to_grn_encoding(charset); + } + return encoding; +} + +static inline grn_bool +is_delimiter_character(grn_ctx *ctx, const char *character, int character_bytes) +{ + switch (character_bytes) { + case 1 : + switch (character[0]) { + case ',' : + case '.' : + case '!' : + case '?' 
: + return GRN_TRUE; + default : + return GRN_FALSE; + } + case 3 : + switch ((unsigned char)(character[0])) { + case 0xE3 : + switch ((unsigned char)(character[1])) { + case 0x80 : + switch ((unsigned char)(character[2])) { + case 0x81 : /* U+3001 (0xE3 0x80 0x81 in UTF-8) IDEOGRAPHIC COMMA */ + case 0x82 : /* U+3002 (0xE3 0x80 0x82 in UTF-8) IDEOGRAPHIC FULL STOP */ + return GRN_TRUE; + default : + return GRN_FALSE; + } + default : + return GRN_FALSE; + } + return GRN_FALSE; + case 0xEF : + switch ((unsigned char)(character[1])) { + case 0xBC : + switch ((unsigned char)(character[2])) { + case 0x81 : + /* U+FF01 (0xEF 0xBC 0x81 in UTF-8) FULLWIDTH EXCLAMATION MARK */ + case 0x9F : + /* U+FF1F (0xEF 0xBC 0x9F in UTF-8) FULLWIDTH QUESTION MARK */ + return GRN_TRUE; + default : + return GRN_FALSE; + } + default : + return GRN_FALSE; + } + return GRN_FALSE; + default : + return GRN_FALSE; + } + default : + return GRN_FALSE; + } +} + +static grn_bool +chunked_tokenize_utf8_chunk(grn_ctx *ctx, + grn_mecab_tokenizer *tokenizer, + const char *chunk, + unsigned int chunk_bytes) +{ + const char *tokenized_chunk; + size_t tokenized_chunk_length; + + tokenized_chunk = mecab_sparse_tostr2(tokenizer->mecab, chunk, chunk_bytes); + if (!tokenized_chunk) { + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][mecab][chunk] " + "mecab_sparse_tostr2() failed len=%d err=%s", + chunk_bytes, + mecab_strerror(tokenizer->mecab)); + return GRN_FALSE; + } + + if (GRN_TEXT_LEN(&(tokenizer->buf)) > 0) { + GRN_TEXT_PUTS(ctx, &(tokenizer->buf), " "); + } + + tokenized_chunk_length = strlen(tokenized_chunk); + if (tokenized_chunk_length >= 1 && + isspace((unsigned char)tokenized_chunk[tokenized_chunk_length - 1])) { + GRN_TEXT_PUT(ctx, &(tokenizer->buf), + tokenized_chunk, tokenized_chunk_length - 1); + } else { + GRN_TEXT_PUT(ctx, &(tokenizer->buf), + tokenized_chunk, tokenized_chunk_length); + } + + return GRN_TRUE; +} + +static grn_bool +chunked_tokenize_utf8(grn_ctx *ctx, + 
grn_mecab_tokenizer *tokenizer, + const char *string, + unsigned int string_bytes) +{ + const char *chunk_start; + const char *current; + const char *last_delimiter; + const char *string_end = string + string_bytes; + grn_encoding encoding = tokenizer->query->encoding; + + if (string_bytes < grn_mecab_chunk_size_threshold) { + return chunked_tokenize_utf8_chunk(ctx, + tokenizer, + string, + string_bytes); + } + + chunk_start = current = string; + last_delimiter = NULL; + while (current < string_end) { + int space_bytes; + int character_bytes; + const char *current_character; + + space_bytes = grn_isspace(current, encoding); + if (space_bytes > 0) { + if (chunk_start != current) { + grn_bool succeeded; + succeeded = chunked_tokenize_utf8_chunk(ctx, + tokenizer, + chunk_start, + current - chunk_start); + if (!succeeded) { + return succeeded; + } + } + current += space_bytes; + chunk_start = current; + last_delimiter = NULL; + continue; + } + + character_bytes = grn_charlen_(ctx, current, string_end, encoding); + if (character_bytes == 0) { + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][mecab][chunk] " + "invalid byte sequence: position=%d", + (int)(current - string)); + return GRN_FALSE; + } + + current_character = current; + current += character_bytes; + if (is_delimiter_character(ctx, current_character, character_bytes)) { + last_delimiter = current; + } + + if ((current - chunk_start) >= grn_mecab_chunk_size_threshold) { + grn_bool succeeded; + if (last_delimiter) { + succeeded = chunked_tokenize_utf8_chunk(ctx, + tokenizer, + chunk_start, + last_delimiter - chunk_start); + chunk_start = last_delimiter; + } else { + succeeded = chunked_tokenize_utf8_chunk(ctx, + tokenizer, + chunk_start, + current - chunk_start); + chunk_start = current; + } + if (!succeeded) { + return succeeded; + } + last_delimiter = NULL; + } + } + + if (current == chunk_start) { + return GRN_TRUE; + } else { + return chunked_tokenize_utf8_chunk(ctx, + tokenizer, + chunk_start, + 
current - chunk_start); + } +} + +static mecab_t * +mecab_create(grn_ctx *ctx) +{ + mecab_t *mecab; + int argc = 0; + const char *argv[4]; + + argv[argc++] = "Groonga"; + argv[argc++] = "-Owakati"; +#ifdef GRN_WITH_BUNDLED_MECAB + argv[argc++] = "--rcfile"; +# ifdef WIN32 + { + static char windows_mecab_rc_file[PATH_MAX]; + + grn_strcpy(windows_mecab_rc_file, + PATH_MAX, + grn_plugin_windows_base_dir()); + grn_strcat(windows_mecab_rc_file, + PATH_MAX, + "/"); + grn_strcat(windows_mecab_rc_file, + PATH_MAX, + GRN_BUNDLED_MECAB_RELATIVE_RC_PATH); + { + char *c; + for (c = windows_mecab_rc_file; *c != '\0'; c++) { + if (*c == '/') { + *c = '\\'; + } + } + } + argv[argc++] = windows_mecab_rc_file; + } +# else /* WIN32 */ + argv[argc++] = GRN_BUNDLED_MECAB_RC_PATH; +# endif /* WIN32 */ +#endif /* GRN_WITH_BUNDLED_MECAB */ + mecab = mecab_new(argc, (char **)argv); + + if (!mecab) { +#ifdef GRN_WITH_BUNDLED_MECAB + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][mecab] failed to create mecab_t: %s: " + "mecab_new(\"%s\", \"%s\", \"%s\", \"%s\")", + mecab_global_error_message(), + argv[0], argv[1], argv[2], argv[3]); +#else /* GRN_WITH_BUNDLED_MECAB */ + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][mecab] failed to create mecab_t: %s: " + "mecab_new(\"%s\", \"%s\")", + mecab_global_error_message(), + argv[0], argv[1]); +#endif /* GRN_WITH_BUNDLED_MECAB */ + } + + return mecab; +} + +/* + This function is called for a full text search query or a document to be + indexed. This means that both short/long strings are given. + The return value of this function is ignored. When an error occurs in this + function, `ctx->rc' is overwritten with an error code (not GRN_SUCCESS). 
+ */ +static grn_obj * +mecab_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +{ + grn_mecab_tokenizer *tokenizer; + unsigned int normalizer_flags = 0; + grn_tokenizer_query *query; + grn_obj *normalized_query; + const char *normalized_string; + unsigned int normalized_string_length; + + query = grn_tokenizer_query_open(ctx, nargs, args, normalizer_flags); + if (!query) { + return NULL; + } + if (!sole_mecab) { + grn_plugin_mutex_lock(ctx, sole_mecab_mutex); + if (!sole_mecab) { + sole_mecab = mecab_create(ctx); + if (sole_mecab) { + sole_mecab_encoding = get_mecab_encoding(sole_mecab); + } + } + grn_plugin_mutex_unlock(ctx, sole_mecab_mutex); + } + if (!sole_mecab) { + grn_tokenizer_query_close(ctx, query); + return NULL; + } + + if (query->encoding != sole_mecab_encoding) { + grn_tokenizer_query_close(ctx, query); + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][mecab] " + "MeCab dictionary charset (%s) does not match " + "the table encoding: <%s>", + grn_encoding_to_string(sole_mecab_encoding), + grn_encoding_to_string(query->encoding)); + return NULL; + } + + if (!(tokenizer = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_mecab_tokenizer)))) { + grn_tokenizer_query_close(ctx, query); + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[tokenizer][mecab] " + "memory allocation to grn_mecab_tokenizer failed"); + return NULL; + } + tokenizer->mecab = sole_mecab; + tokenizer->query = query; + + normalized_query = query->normalized_query; + grn_string_get_normalized(ctx, + normalized_query, + &normalized_string, + &normalized_string_length, + NULL); + GRN_TEXT_INIT(&(tokenizer->buf), 0); + if (query->have_tokenized_delimiter) { + tokenizer->next = normalized_string; + tokenizer->end = tokenizer->next + normalized_string_length; + } else if (normalized_string_length == 0) { + tokenizer->next = ""; + tokenizer->end = tokenizer->next; + } else { + grn_bool succeeded; + grn_plugin_mutex_lock(ctx, sole_mecab_mutex); + if 
(grn_mecab_chunked_tokenize_enabled && + ctx->encoding == GRN_ENC_UTF8) { + succeeded = chunked_tokenize_utf8(ctx, + tokenizer, + normalized_string, + normalized_string_length); + } else { + const char *s; + s = mecab_sparse_tostr2(tokenizer->mecab, + normalized_string, + normalized_string_length); + if (!s) { + succeeded = GRN_FALSE; + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][mecab] " + "mecab_sparse_tostr() failed len=%d err=%s", + normalized_string_length, + mecab_strerror(tokenizer->mecab)); + } else { + succeeded = GRN_TRUE; + GRN_TEXT_PUTS(ctx, &(tokenizer->buf), s); + } + } + grn_plugin_mutex_unlock(ctx, sole_mecab_mutex); + if (!succeeded) { + grn_tokenizer_query_close(ctx, tokenizer->query); + GRN_PLUGIN_FREE(ctx, tokenizer); + return NULL; + } + { + char *buf, *p; + unsigned int bufsize; + + buf = GRN_TEXT_VALUE(&(tokenizer->buf)); + bufsize = GRN_TEXT_LEN(&(tokenizer->buf)); + /* A certain version of mecab returns trailing lf or spaces. */ + for (p = buf + bufsize - 2; + buf <= p && isspace(*(unsigned char *)p); + p--) { *p = '\0'; } + tokenizer->next = buf; + tokenizer->end = p + 1; + } + } + user_data->ptr = tokenizer; + + grn_tokenizer_token_init(ctx, &(tokenizer->token)); + + return NULL; +} + +/* + This function returns tokens one by one. 
+ */ +static grn_obj * +mecab_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +{ + /* grn_obj *table = args[0]; */ + grn_mecab_tokenizer *tokenizer = user_data->ptr; + grn_encoding encoding = tokenizer->query->encoding; + + if (tokenizer->query->have_tokenized_delimiter) { + tokenizer->next = + grn_tokenizer_tokenized_delimiter_next(ctx, + &(tokenizer->token), + tokenizer->next, + tokenizer->end - tokenizer->next, + encoding); + } else { + size_t cl; + const char *p = tokenizer->next, *r; + const char *e = tokenizer->end; + grn_tokenizer_status status; + + for (r = p; r < e; r += cl) { + int space_len; + + space_len = grn_isspace(r, encoding); + if (space_len > 0 && r == p) { + cl = space_len; + p = r + cl; + continue; + } + + if (!(cl = grn_charlen_(ctx, r, e, encoding))) { + tokenizer->next = e; + break; + } + + if (space_len > 0) { + const char *q = r + space_len; + while (q < e && (space_len = grn_isspace(q, encoding))) { + q += space_len; + } + tokenizer->next = q; + break; + } + } + + if (r == e || tokenizer->next == e) { + status = GRN_TOKENIZER_LAST; + } else { + status = GRN_TOKENIZER_CONTINUE; + } + grn_tokenizer_token_push(ctx, &(tokenizer->token), p, r - p, status); + } + + return NULL; +} + +/* + This function finalizes a tokenization. 
+ */ +static grn_obj * +mecab_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +{ + grn_mecab_tokenizer *tokenizer = user_data->ptr; + if (!tokenizer) { + return NULL; + } + grn_tokenizer_token_fin(ctx, &(tokenizer->token)); + grn_tokenizer_query_close(ctx, tokenizer->query); + grn_obj_unlink(ctx, &(tokenizer->buf)); + GRN_PLUGIN_FREE(ctx, tokenizer); + return NULL; +} + +static void +check_mecab_dictionary_encoding(grn_ctx *ctx) +{ +#ifdef HAVE_MECAB_DICTIONARY_INFO_T + mecab_t *mecab; + grn_encoding encoding; + grn_bool have_same_encoding_dictionary; + + mecab = mecab_create(ctx); + if (!mecab) { + return; + } + + encoding = GRN_CTX_GET_ENCODING(ctx); + have_same_encoding_dictionary = (encoding == get_mecab_encoding(mecab)); + mecab_destroy(mecab); + + if (!have_same_encoding_dictionary) { + GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, + "[tokenizer][mecab] " + "MeCab has no dictionary that uses the context encoding" + ": <%s>", + grn_encoding_to_string(encoding)); + } +#endif +} + +/* + This function initializes a plugin. This function fails if there is no + dictionary that uses the context encoding of groonga. 
+ */ +grn_rc +GRN_PLUGIN_INIT(grn_ctx *ctx) +{ + ++sole_mecab_init_counter; + if (sole_mecab_init_counter > 1) + { + return GRN_SUCCESS; + } + { + char env[GRN_ENV_BUFFER_SIZE]; + + grn_getenv("GRN_MECAB_CHUNKED_TOKENIZE_ENABLED", + env, + GRN_ENV_BUFFER_SIZE); + grn_mecab_chunked_tokenize_enabled = (env[0] && strcmp(env, "yes") == 0); + } + + { + char env[GRN_ENV_BUFFER_SIZE]; + + grn_getenv("GRN_MECAB_CHUNK_SIZE_THRESHOLD", + env, + GRN_ENV_BUFFER_SIZE); + if (env[0]) { + int threshold = -1; + const char *end; + const char *rest; + + end = env + strlen(env); + threshold = grn_atoi(env, end, &rest); + if (end > env && end == rest) { + grn_mecab_chunk_size_threshold = threshold; + } + } + } + + sole_mecab = NULL; + sole_mecab_mutex = grn_plugin_mutex_open(ctx); + if (!sole_mecab_mutex) { + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[tokenizer][mecab] grn_plugin_mutex_open() failed"); + return ctx->rc; + } + + check_mecab_dictionary_encoding(ctx); + if (ctx->rc != GRN_SUCCESS) { + grn_plugin_mutex_close(ctx, sole_mecab_mutex); + sole_mecab_mutex = NULL; + } + + return ctx->rc; +} + +/* + This function registers a plugin to a database. + */ +grn_rc +GRN_PLUGIN_REGISTER(grn_ctx *ctx) +{ + grn_rc rc; + + rc = grn_tokenizer_register(ctx, "TokenMecab", 10, + mecab_init, mecab_next, mecab_fin); + if (rc == GRN_SUCCESS) { + grn_obj *token_mecab; + token_mecab = grn_ctx_get(ctx, "TokenMecab", 10); + /* Just for backward compatibility. TokenMecab was built-in not plugin. */ + if (token_mecab && grn_obj_id(ctx, token_mecab) != GRN_DB_MECAB) { + rc = GRN_FILE_CORRUPT; + } + } + + return rc; +} + +/* + This function finalizes a plugin. 
+ */ +grn_rc +GRN_PLUGIN_FIN(grn_ctx *ctx) +{ + --sole_mecab_init_counter; + if (sole_mecab_init_counter > 0) + { + return GRN_SUCCESS; + } + if (sole_mecab) { + mecab_destroy(sole_mecab); + sole_mecab = NULL; + } + if (sole_mecab_mutex) { + grn_plugin_mutex_close(ctx, sole_mecab_mutex); + sole_mecab_mutex = NULL; + } + + return GRN_SUCCESS; +} diff --git a/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab_sources.am b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab_sources.am new file mode 100644 index 00000000..56912727 --- /dev/null +++ b/storage/mroonga/vendor/groonga/plugins/tokenizers/mecab_sources.am @@ -0,0 +1,2 @@ +mecab_la_SOURCES = \ + mecab.c |