summary refs log tree commit diff stats
path: root/storage/mroonga/vendor/groonga/lib/proc/proc_tokenize.c
diff options
context:
space:
mode:
Diffstat (limited to 'storage/mroonga/vendor/groonga/lib/proc/proc_tokenize.c')
-rw-r--r-- storage/mroonga/vendor/groonga/lib/proc/proc_tokenize.c 433
1 files changed, 433 insertions, 0 deletions
diff --git a/storage/mroonga/vendor/groonga/lib/proc/proc_tokenize.c b/storage/mroonga/vendor/groonga/lib/proc/proc_tokenize.c
new file mode 100644
index 00000000..206ebf58
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/lib/proc/proc_tokenize.c
@@ -0,0 +1,433 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+ Copyright(C) 2009-2016 Brazil
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License version 2.1 as published by the Free Software Foundation.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+*/
+
+#include "../grn_proc.h"
+#include "../grn_ctx.h"
+#include "../grn_token_cursor.h"
+
+#include <groonga/plugin.h>
+
+/*
+ * Parses a flag list such as "ENABLE_TOKENIZED_DELIMITER|NONE" into a
+ * bit set of GRN_TOKEN_CURSOR_* values.
+ *
+ * Names may be separated by any run of '|' and ' ' characters. When an
+ * unrecognized name is found, GRN_INVALID_ARGUMENT is reported through
+ * GRN_PLUGIN_ERROR() and 0 is returned. 0 is also the result for an
+ * empty or NONE-only list, so callers inspect ctx->rc to detect failure.
+ */
+static unsigned int
+parse_tokenize_flags(grn_ctx *ctx, grn_obj *flag_names)
+{
+  /* Tried in this order; "NONE" is accepted but contributes no bit. */
+  static const struct {
+    const char *name;
+    size_t name_length;
+    unsigned int bit;
+  } known_flags[] = {
+    {
+      "ENABLE_TOKENIZED_DELIMITER",
+      sizeof("ENABLE_TOKENIZED_DELIMITER") - 1,
+      GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER
+    },
+    {"NONE", sizeof("NONE") - 1, 0}
+  };
+  const size_t n_known_flags = sizeof(known_flags) / sizeof(known_flags[0]);
+  unsigned int parsed_flags = 0;
+  int total_length = GRN_TEXT_LEN(flag_names);
+  const char *cursor = GRN_TEXT_VALUE(flag_names);
+  const char *end = cursor + total_length;
+
+  while (cursor < end) {
+    size_t rest;
+    size_t i;
+
+    /* Separators carry no meaning; just step over them. */
+    if (*cursor == ' ' || *cursor == '|') {
+      cursor++;
+      continue;
+    }
+
+    /* Prefix match against each known name, first hit wins. */
+    rest = (size_t)(end - cursor);
+    for (i = 0; i < n_known_flags; i++) {
+      if (rest >= known_flags[i].name_length &&
+          memcmp(cursor, known_flags[i].name, known_flags[i].name_length) == 0) {
+        break;
+      }
+    }
+
+    if (i == n_known_flags) {
+      /* The message shows everything from the offending position on. */
+      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                       "[tokenize] invalid flag: <%.*s>",
+                       (int)(end - cursor), cursor);
+      return 0;
+    }
+
+    parsed_flags |= known_flags[i].bit;
+    cursor += known_flags[i].name_length;
+  }
+
+  return parsed_flags;
+}
+
+typedef struct { /* One token as recorded by tokenize() and printed by output_tokens(). */
+ grn_id id; /* ID returned by grn_token_cursor_next(); output_tokens() uses it to look up the key in the lexicon */
+ int32_t position; /* copied from the token cursor's `pos` field */
+ grn_bool force_prefix; /* copied from the token cursor's `force_prefix` field */
+} tokenize_token;
+
+/*
+ * Writes the collected tokens to the command output as an array named
+ * "TOKENS" that holds one "TOKEN" map per token.
+ *
+ * tokens:       bulk filled with tokenize_token records.
+ * lexicon:      table used to turn each token ID back into its key.
+ * index_column: optional; when not NULL every map additionally gets an
+ *               "estimated_size" entry read from this column for the
+ *               token ID.
+ */
+static void
+output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_obj *lexicon, grn_obj *index_column)
+{
+  grn_obj size_buffer;
+  int n_records = GRN_BULK_VSIZE(tokens) / sizeof(tokenize_token);
+  /* "value", "position" and "force_prefix", plus optional "estimated_size". */
+  int n_entries = index_column ? 4 : 3;
+  tokenize_token *record;
+  tokenize_token *records_end;
+
+  /* The scratch value is only needed (and only initialized) with a column. */
+  if (index_column) {
+    GRN_UINT32_INIT(&size_buffer, 0);
+  }
+
+  grn_ctx_output_array_open(ctx, "TOKENS", n_records);
+
+  record = (tokenize_token *)(GRN_BULK_HEAD(tokens));
+  records_end = record + n_records;
+  for (; record < records_end; record++) {
+    char key[GRN_TABLE_MAX_KEY_SIZE];
+    unsigned int key_size;
+
+    grn_ctx_output_map_open(ctx, "TOKEN", n_entries);
+
+    grn_ctx_output_cstr(ctx, "value");
+    key_size = grn_table_get_key(ctx, lexicon, record->id,
+                                 key, GRN_TABLE_MAX_KEY_SIZE);
+    grn_ctx_output_str(ctx, key, key_size);
+
+    grn_ctx_output_cstr(ctx, "position");
+    grn_ctx_output_int32(ctx, record->position);
+
+    grn_ctx_output_cstr(ctx, "force_prefix");
+    grn_ctx_output_bool(ctx, record->force_prefix);
+
+    if (index_column) {
+      /* Reuse one buffer: reset it before every lookup. */
+      GRN_BULK_REWIND(&size_buffer);
+      grn_obj_get_value(ctx, index_column, record->id, &size_buffer);
+      grn_ctx_output_cstr(ctx, "estimated_size");
+      grn_ctx_output_int64(ctx, GRN_UINT32_VALUE(&size_buffer));
+    }
+
+    grn_ctx_output_map_close(ctx);
+  }
+
+  if (index_column) {
+    GRN_OBJ_FIN(ctx, &size_buffer);
+  }
+
+  grn_ctx_output_array_close(ctx);
+}
+
+/*
+ * Builds an anonymous hash table keyed by ShortText and configures it
+ * with the requested tokenizer, optional normalizer and token filters,
+ * so it can serve as a lexicon for the `tokenize` command.
+ *
+ * tokenizer_name:     required; must name an existing tokenizer proc.
+ * normalizer_name:    optional; ignored when empty, otherwise must name
+ *                     an existing normalizer proc.
+ * token_filter_names: handed to grn_proc_table_set_token_filters() as is.
+ *
+ * Returns the new table, which the caller releases with
+ * grn_obj_unlink(), or NULL on failure. Name validation failures are
+ * reported with GRN_PLUGIN_ERROR(GRN_INVALID_ARGUMENT).
+ */
+static grn_obj *
+create_lexicon_for_tokenize(grn_ctx *ctx,
+                            grn_obj *tokenizer_name,
+                            grn_obj *normalizer_name,
+                            grn_obj *token_filter_names)
+{
+  grn_obj *lexicon = NULL;
+  grn_obj *tokenizer;
+  grn_obj *normalizer = NULL;
+
+  tokenizer = grn_ctx_get(ctx,
+                          GRN_TEXT_VALUE(tokenizer_name),
+                          GRN_TEXT_LEN(tokenizer_name));
+  if (!tokenizer) {
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                     "[tokenize] nonexistent tokenizer: <%.*s>",
+                     (int)GRN_TEXT_LEN(tokenizer_name),
+                     GRN_TEXT_VALUE(tokenizer_name));
+    return NULL;
+  }
+
+  if (!grn_obj_is_tokenizer_proc(ctx, tokenizer)) {
+    grn_obj inspected;
+    GRN_TEXT_INIT(&inspected, 0);
+    grn_inspect(ctx, &inspected, tokenizer);
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                     "[tokenize] not tokenizer: %.*s",
+                     (int)GRN_TEXT_LEN(&inspected),
+                     GRN_TEXT_VALUE(&inspected));
+    GRN_OBJ_FIN(ctx, &inspected);
+    goto exit;
+  }
+
+  if (GRN_TEXT_LEN(normalizer_name) > 0) {
+    normalizer = grn_ctx_get(ctx,
+                             GRN_TEXT_VALUE(normalizer_name),
+                             GRN_TEXT_LEN(normalizer_name));
+    if (!normalizer) {
+      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                       "[tokenize] nonexistent normalizer: <%.*s>",
+                       (int)GRN_TEXT_LEN(normalizer_name),
+                       GRN_TEXT_VALUE(normalizer_name));
+      goto exit;
+    }
+
+    if (!grn_obj_is_normalizer_proc(ctx, normalizer)) {
+      grn_obj inspected;
+      GRN_TEXT_INIT(&inspected, 0);
+      grn_inspect(ctx, &inspected, normalizer);
+      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                       "[tokenize] not normalizer: %.*s",
+                       (int)GRN_TEXT_LEN(&inspected),
+                       GRN_TEXT_VALUE(&inspected));
+      GRN_OBJ_FIN(ctx, &inspected);
+      goto exit;
+    }
+  }
+
+  lexicon = grn_table_create(ctx, NULL, 0,
+                             NULL,
+                             GRN_OBJ_TABLE_HASH_KEY,
+                             grn_ctx_at(ctx, GRN_DB_SHORT_TEXT),
+                             NULL);
+  if (!lexicon) {
+    /*
+     * Do not configure a table that was never created; just release
+     * the procs and let the caller see NULL.
+     * NOTE(review): grn_table_create() is expected to have recorded the
+     * reason in ctx already -- confirm against its documentation.
+     */
+    goto exit;
+  }
+
+  grn_obj_set_info(ctx, lexicon,
+                   GRN_INFO_DEFAULT_TOKENIZER, tokenizer);
+  if (normalizer) {
+    grn_obj_set_info(ctx, lexicon,
+                     GRN_INFO_NORMALIZER, normalizer);
+  }
+  grn_proc_table_set_token_filters(ctx, lexicon, token_filter_names);
+
+exit:
+  /* Single release point for the looked-up procs on every path. */
+  grn_obj_unlink(ctx, tokenizer);
+  if (normalizer) {
+    grn_obj_unlink(ctx, normalizer);
+  }
+
+  return lexicon;
+}
+
+/*
+ * Tokenizes `string` against `lexicon` and appends one tokenize_token
+ * record per produced token to `tokens`.
+ *
+ * mode and flags are passed straight to grn_token_cursor_open().
+ * Nothing is appended when the cursor cannot be opened. Tokens for
+ * which the cursor yields GRN_ID_NIL are skipped.
+ */
+static void
+tokenize(grn_ctx *ctx, grn_obj *lexicon, grn_obj *string, grn_tokenize_mode mode,
+         unsigned int flags, grn_obj *tokens)
+{
+  grn_token_cursor *token_cursor;
+
+  token_cursor =
+    grn_token_cursor_open(ctx, lexicon,
+                          GRN_TEXT_VALUE(string), GRN_TEXT_LEN(string),
+                          mode, flags);
+  if (!token_cursor) {
+    return;
+  }
+
+  while (token_cursor->status == GRN_TOKEN_CURSOR_DOING) {
+    grn_id token_id = grn_token_cursor_next(ctx, token_cursor);
+    tokenize_token *current_token;
+    if (token_id == GRN_ID_NIL) {
+      continue;
+    }
+    /*
+     * Reserve room for one more record. If the bulk could not grow,
+     * there is no slot behind GRN_BULK_CURR() to write into, so stop
+     * instead of storing through an invalid pointer.
+     */
+    if (grn_bulk_space(ctx, tokens, sizeof(tokenize_token)) != GRN_SUCCESS) {
+      break;
+    }
+    /* The slot just reserved is the last record before the write position. */
+    current_token = ((tokenize_token *)(GRN_BULK_CURR(tokens))) - 1;
+    current_token->id = token_id;
+    current_token->position = token_cursor->pos;
+    current_token->force_prefix = token_cursor->force_prefix;
+  }
+  grn_token_cursor_close(ctx, token_cursor);
+}
+
+/*
+ * Implements the `table_tokenize` command: tokenizes `string` with an
+ * existing lexicon table and outputs the tokens.
+ *
+ * Command variables read here:
+ *   table        - required, name of the lexicon table.
+ *   string       - required, text to tokenize.
+ *   flags        - optional, see parse_tokenize_flags().
+ *   mode         - optional, "GET" (also used when empty) or "ADD".
+ *   index_column - optional, name of a COLUMN_INDEX column of the table;
+ *                  when given, output_tokens() adds "estimated_size".
+ *
+ * Always returns NULL; failures are reported with GRN_PLUGIN_ERROR().
+ */
+static grn_obj *
+command_table_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  grn_obj *table_name = grn_plugin_proc_get_var(ctx, user_data, "table", -1);
+  grn_obj *string = grn_plugin_proc_get_var(ctx, user_data, "string", -1);
+  grn_obj *flag_names = grn_plugin_proc_get_var(ctx, user_data, "flags", -1);
+  grn_obj *mode_name = grn_plugin_proc_get_var(ctx, user_data, "mode", -1);
+  grn_obj *index_column_name =
+    grn_plugin_proc_get_var(ctx, user_data, "index_column", -1);
+  grn_obj *lexicon;
+  grn_obj *index_column = NULL;
+  grn_obj tokens;
+  grn_tokenize_mode tokenize_mode = GRN_TOKEN_GET;
+  unsigned int flags;
+  int mode_is_known = 1;
+
+  if (GRN_TEXT_LEN(table_name) == 0) {
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[table_tokenize] table name is missing");
+    return NULL;
+  }
+
+  if (GRN_TEXT_LEN(string) == 0) {
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[table_tokenize] string is missing");
+    return NULL;
+  }
+
+  /* A 0 result is ambiguous, so failure is detected through ctx->rc. */
+  flags = parse_tokenize_flags(ctx, flag_names);
+  if (ctx->rc != GRN_SUCCESS) {
+    return NULL;
+  }
+
+  lexicon = grn_ctx_get(ctx, GRN_TEXT_VALUE(table_name), GRN_TEXT_LEN(table_name));
+  if (!lexicon) {
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                     "[table_tokenize] nonexistent lexicon: <%.*s>",
+                     (int)GRN_TEXT_LEN(table_name),
+                     GRN_TEXT_VALUE(table_name));
+    return NULL;
+  }
+
+  if (GRN_TEXT_LEN(index_column_name) > 0) {
+    index_column = grn_obj_column(ctx, lexicon,
+                                  GRN_TEXT_VALUE(index_column_name),
+                                  GRN_TEXT_LEN(index_column_name));
+    if (!index_column) {
+      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                       "[table_tokenize] nonexistent index column: <%.*s>",
+                       (int)GRN_TEXT_LEN(index_column_name),
+                       GRN_TEXT_VALUE(index_column_name));
+      goto exit;
+    }
+    if (index_column->header.type != GRN_COLUMN_INDEX) {
+      GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                       "[table_tokenize] index column must be COLUMN_INDEX: <%.*s>",
+                       (int)GRN_TEXT_LEN(index_column_name),
+                       GRN_TEXT_VALUE(index_column_name));
+      goto exit;
+    }
+  }
+
+  /* Resolve the mode once; an empty mode means GET for this command. */
+  {
+    const char *mode = GRN_TEXT_VALUE(mode_name);
+    size_t mode_length = GRN_TEXT_LEN(mode_name);
+
+    if (mode_length == 0 ||
+        (mode_length == sizeof("GET") - 1 &&
+         memcmp(mode, "GET", sizeof("GET") - 1) == 0)) {
+      tokenize_mode = GRN_TOKEN_GET;
+    } else if (mode_length == sizeof("ADD") - 1 &&
+               memcmp(mode, "ADD", sizeof("ADD") - 1) == 0) {
+      tokenize_mode = GRN_TOKEN_ADD;
+    } else {
+      mode_is_known = 0;
+    }
+  }
+
+  GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
+  if (mode_is_known) {
+    tokenize(ctx, lexicon, string, tokenize_mode, flags, &tokens);
+    output_tokens(ctx, &tokens, lexicon, index_column);
+  } else {
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                     "[table_tokenize] invalid mode: <%.*s>",
+                     (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name));
+  }
+  GRN_OBJ_FIN(ctx, &tokens);
+
+exit:
+  grn_obj_unlink(ctx, lexicon);
+  if (index_column) {
+    grn_obj_unlink(ctx, index_column);
+  }
+
+  return NULL;
+}
+
+/*
+ * Registers the `table_tokenize` command with its five variables:
+ * table, string, flags, mode and index_column (in that order).
+ */
+void
+grn_proc_init_table_tokenize(grn_ctx *ctx)
+{
+  static const char * const var_names[] = {
+    "table",
+    "string",
+    "flags",
+    "mode",
+    "index_column"
+  };
+  grn_expr_var vars[sizeof(var_names) / sizeof(var_names[0])];
+  const unsigned int n_vars = sizeof(var_names) / sizeof(var_names[0]);
+  unsigned int i;
+
+  for (i = 0; i < n_vars; i++) {
+    grn_plugin_expr_var_init(ctx, &(vars[i]), var_names[i], -1);
+  }
+  grn_plugin_command_create(ctx,
+                            "table_tokenize", -1,
+                            command_table_tokenize,
+                            n_vars,
+                            vars);
+}
+
+/*
+ * Implements the `tokenize` command: tokenizes `string` with a lexicon
+ * created on the fly from the given tokenizer, normalizer and token
+ * filters, and outputs the tokens.
+ *
+ * Command variables read here:
+ *   tokenizer     - required, tokenizer name.
+ *   string        - required, text to tokenize.
+ *   normalizer    - optional, normalizer name.
+ *   flags         - optional, see parse_tokenize_flags().
+ *   mode          - optional, "ADD" (also used when empty) or "GET".
+ *   token_filters - optional, forwarded to create_lexicon_for_tokenize().
+ *
+ * Always returns NULL; failures are reported with GRN_PLUGIN_ERROR().
+ */
+static grn_obj *
+command_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  grn_obj *tokenizer_name = grn_plugin_proc_get_var(ctx, user_data, "tokenizer", -1);
+  grn_obj *string = grn_plugin_proc_get_var(ctx, user_data, "string", -1);
+  grn_obj *normalizer_name = grn_plugin_proc_get_var(ctx, user_data, "normalizer", -1);
+  grn_obj *flag_names = grn_plugin_proc_get_var(ctx, user_data, "flags", -1);
+  grn_obj *mode_name = grn_plugin_proc_get_var(ctx, user_data, "mode", -1);
+  grn_obj *token_filter_names =
+    grn_plugin_proc_get_var(ctx, user_data, "token_filters", -1);
+  grn_obj *lexicon;
+  grn_obj tokens;
+  unsigned int flags;
+  int mode_is_known = 1;
+  int wants_get_pass = 0;
+
+  if (GRN_TEXT_LEN(tokenizer_name) == 0) {
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[tokenize] tokenizer name is missing");
+    return NULL;
+  }
+
+  if (GRN_TEXT_LEN(string) == 0) {
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "[tokenize] string is missing");
+    return NULL;
+  }
+
+  /* A 0 result is ambiguous, so failure is detected through ctx->rc. */
+  flags = parse_tokenize_flags(ctx, flag_names);
+  if (ctx->rc != GRN_SUCCESS) {
+    return NULL;
+  }
+
+  lexicon = create_lexicon_for_tokenize(ctx,
+                                        tokenizer_name,
+                                        normalizer_name,
+                                        token_filter_names);
+  if (!lexicon) {
+    return NULL;
+  }
+
+  /* Resolve the mode once; an empty mode means ADD for this command. */
+  {
+    const char *mode = GRN_TEXT_VALUE(mode_name);
+    size_t mode_length = GRN_TEXT_LEN(mode_name);
+
+    if (mode_length == 0 ||
+        (mode_length == sizeof("ADD") - 1 &&
+         memcmp(mode, "ADD", sizeof("ADD") - 1) == 0)) {
+      wants_get_pass = 0;
+    } else if (mode_length == sizeof("GET") - 1 &&
+               memcmp(mode, "GET", sizeof("GET") - 1) == 0) {
+      wants_get_pass = 1;
+    } else {
+      mode_is_known = 0;
+    }
+  }
+
+  GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
+  if (mode_is_known) {
+    /* Both modes start with an ADD pass over the freshly created lexicon. */
+    tokenize(ctx, lexicon, string, GRN_TOKEN_ADD, flags, &tokens);
+    if (wants_get_pass) {
+      /*
+       * GET mode throws the ADD result away and reports what a GET pass
+       * yields instead; presumably the ADD pass exists only to populate
+       * the new, empty lexicon first.
+       */
+      GRN_BULK_REWIND(&tokens);
+      tokenize(ctx, lexicon, string, GRN_TOKEN_GET, flags, &tokens);
+    }
+    output_tokens(ctx, &tokens, lexicon, NULL);
+  } else {
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
+                     "[tokenize] invalid mode: <%.*s>",
+                     (int)GRN_TEXT_LEN(mode_name), GRN_TEXT_VALUE(mode_name));
+  }
+  GRN_OBJ_FIN(ctx, &tokens);
+
+  grn_obj_unlink(ctx, lexicon);
+
+  return NULL;
+}
+
+/*
+ * Registers the `tokenize` command with its six variables: tokenizer,
+ * string, normalizer, flags, mode and token_filters (in that order).
+ */
+void
+grn_proc_init_tokenize(grn_ctx *ctx)
+{
+  static const char * const var_names[] = {
+    "tokenizer",
+    "string",
+    "normalizer",
+    "flags",
+    "mode",
+    "token_filters"
+  };
+  grn_expr_var vars[sizeof(var_names) / sizeof(var_names[0])];
+  const unsigned int n_vars = sizeof(var_names) / sizeof(var_names[0]);
+  unsigned int i;
+
+  for (i = 0; i < n_vars; i++) {
+    grn_plugin_expr_var_init(ctx, &(vars[i]), var_names[i], -1);
+  }
+  grn_plugin_command_create(ctx,
+                            "tokenize", -1,
+                            command_tokenize,
+                            n_vars,
+                            vars);
+}