diff options
Diffstat (limited to 'storage/mroonga/vendor/groonga/lib/token_cursor.c')
-rw-r--r-- | storage/mroonga/vendor/groonga/lib/token_cursor.c | 386 |
1 files changed, 386 insertions, 0 deletions
diff --git a/storage/mroonga/vendor/groonga/lib/token_cursor.c b/storage/mroonga/vendor/groonga/lib/token_cursor.c new file mode 100644 index 00000000..179d0f31 --- /dev/null +++ b/storage/mroonga/vendor/groonga/lib/token_cursor.c @@ -0,0 +1,386 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2009-2017 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA +*/ +#include "grn_token_cursor.h" +#include "grn_string.h" +#include "grn_pat.h" +#include "grn_dat.h" + +static void +grn_token_cursor_open_initialize_token_filters(grn_ctx *ctx, + grn_token_cursor *token_cursor) +{ + grn_obj *token_filters = token_cursor->token_filter.objects; + unsigned int i, n_token_filters; + + token_cursor->token_filter.data = NULL; + + if (token_filters) { + n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *); + } else { + n_token_filters = 0; + } + + if (n_token_filters == 0) { + return; + } + + token_cursor->token_filter.data = GRN_CALLOC(sizeof(void *) * n_token_filters); + if (!token_cursor->token_filter.data) { + return; + } + + for (i = 0; i < n_token_filters; i++) { + grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i); + grn_proc *token_filter = (grn_proc *)token_filter_object; + + token_cursor->token_filter.data[i] = + token_filter->callbacks.token_filter.init(ctx, + token_cursor->table, + token_cursor->mode); + } +} + +grn_token_cursor * +grn_token_cursor_open(grn_ctx *ctx, grn_obj *table, + const char *str, size_t str_len, + grn_tokenize_mode mode, unsigned int flags) +{ + grn_token_cursor *token_cursor; + grn_encoding encoding; + grn_obj *tokenizer; + grn_obj *normalizer; + grn_obj *token_filters; + grn_table_flags table_flags; + if (grn_table_get_info(ctx, table, &table_flags, &encoding, &tokenizer, + &normalizer, &token_filters)) { + return NULL; + } + if (!(token_cursor = GRN_MALLOC(sizeof(grn_token_cursor)))) { return NULL; } + token_cursor->table = table; + token_cursor->mode = mode; + token_cursor->encoding = encoding; + token_cursor->tokenizer = tokenizer; + token_cursor->token_filter.objects = token_filters; + token_cursor->token_filter.data = NULL; + token_cursor->orig = (const unsigned char *)str; + token_cursor->orig_blen = str_len; + token_cursor->curr = NULL; + token_cursor->nstr = NULL; + token_cursor->curr_size = 0; + token_cursor->pos = -1; + token_cursor->status = GRN_TOKEN_CURSOR_DOING; + token_cursor->force_prefix = GRN_FALSE; + if (tokenizer) { + grn_obj str_, flags_, mode_; + GRN_TEXT_INIT(&str_, GRN_OBJ_DO_SHALLOW_COPY); + GRN_TEXT_SET_REF(&str_, str, str_len); + GRN_UINT32_INIT(&flags_, 0); + GRN_UINT32_SET(ctx, &flags_, flags); + GRN_UINT32_INIT(&mode_, 0); + GRN_UINT32_SET(ctx, &mode_, mode); + token_cursor->pctx.caller = NULL; + token_cursor->pctx.user_data.ptr = NULL; + token_cursor->pctx.proc = (grn_proc *)tokenizer; + token_cursor->pctx.hooks = NULL; + token_cursor->pctx.currh = NULL; + token_cursor->pctx.phase = PROC_INIT; + grn_ctx_push(ctx, &mode_); + grn_ctx_push(ctx, &str_); + grn_ctx_push(ctx, &flags_); + ((grn_proc *)tokenizer)->funcs[PROC_INIT](ctx, 1, &table, &token_cursor->pctx.user_data); + grn_obj_close(ctx, &flags_); + grn_obj_close(ctx, &str_); + grn_obj_close(ctx, &mode_); + } else { + int nflags = 0; + token_cursor->nstr = grn_string_open_(ctx, str, str_len, + normalizer, + nflags, + token_cursor->encoding); + if (token_cursor->nstr) { + const char *normalized; + grn_string_get_normalized(ctx, token_cursor->nstr, + &normalized, &(token_cursor->curr_size), NULL); + token_cursor->curr = (const unsigned char *)normalized; + } else { + ERR(GRN_TOKENIZER_ERROR, + "[token-cursor][open] failed to grn_string_open()"); + } + } + + if (ctx->rc == GRN_SUCCESS) { + grn_token_cursor_open_initialize_token_filters(ctx, token_cursor); + } + + if (ctx->rc) { + grn_token_cursor_close(ctx, token_cursor); + token_cursor = NULL; + } + return token_cursor; +} + +static int +grn_token_cursor_next_apply_token_filters(grn_ctx *ctx, + grn_token_cursor *token_cursor, + grn_obj *current_token_data, + grn_obj *status) +{ + grn_obj *token_filters = token_cursor->token_filter.objects; + unsigned int i, n_token_filters; + grn_token current_token; + grn_token next_token; + + if (token_filters) { + n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *); + } else { + n_token_filters = 0; + } + + GRN_TEXT_INIT(&(current_token.data), GRN_OBJ_DO_SHALLOW_COPY); + GRN_TEXT_SET(ctx, &(current_token.data), + GRN_TEXT_VALUE(current_token_data), + GRN_TEXT_LEN(current_token_data)); + current_token.status = GRN_INT32_VALUE(status); + GRN_TEXT_INIT(&(next_token.data), GRN_OBJ_DO_SHALLOW_COPY); + GRN_TEXT_SET(ctx, &(next_token.data), + GRN_TEXT_VALUE(&(current_token.data)), + GRN_TEXT_LEN(&(current_token.data))); + next_token.status = current_token.status; + + for (i = 0; i < n_token_filters; i++) { + grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i); + grn_proc *token_filter = (grn_proc *)token_filter_object; + void *data = token_cursor->token_filter.data[i]; + +#define SKIP_FLAGS\ + (GRN_TOKEN_SKIP |\ + GRN_TOKEN_SKIP_WITH_POSITION) + if (current_token.status & SKIP_FLAGS) { + break; + } +#undef SKIP_FLAGS + + token_filter->callbacks.token_filter.filter(ctx, + ¤t_token, + &next_token, + data); + GRN_TEXT_SET(ctx, &(current_token.data), + GRN_TEXT_VALUE(&(next_token.data)), + GRN_TEXT_LEN(&(next_token.data))); + current_token.status = next_token.status; + } + + token_cursor->curr = + (const unsigned char *)GRN_TEXT_VALUE(&(current_token.data)); + token_cursor->curr_size = GRN_TEXT_LEN(&(current_token.data)); + + return current_token.status; +} + +grn_id +grn_token_cursor_next(grn_ctx *ctx, grn_token_cursor *token_cursor) +{ + int status; + grn_id tid = GRN_ID_NIL; + grn_obj *table = token_cursor->table; + grn_obj *tokenizer = token_cursor->tokenizer; + while (token_cursor->status != GRN_TOKEN_CURSOR_DONE) { + if (tokenizer) { + grn_obj *curr_, *stat_; + ((grn_proc *)tokenizer)->funcs[PROC_NEXT](ctx, 1, &table, &token_cursor->pctx.user_data); + stat_ = grn_ctx_pop(ctx); + curr_ = grn_ctx_pop(ctx); + status = grn_token_cursor_next_apply_token_filters(ctx, token_cursor, + curr_, stat_); + token_cursor->status = + ((status & GRN_TOKEN_LAST) || + (token_cursor->mode == GRN_TOKENIZE_GET && + (status & GRN_TOKEN_REACH_END))) + ? GRN_TOKEN_CURSOR_DONE : GRN_TOKEN_CURSOR_DOING; + token_cursor->force_prefix = GRN_FALSE; +#define SKIP_FLAGS \ + (GRN_TOKEN_SKIP | GRN_TOKEN_SKIP_WITH_POSITION) + if (status & SKIP_FLAGS) { + if (status & GRN_TOKEN_SKIP) { + token_cursor->pos++; + } + if (token_cursor->status == GRN_TOKEN_CURSOR_DONE && tid == GRN_ID_NIL) { + token_cursor->status = GRN_TOKEN_CURSOR_DONE_SKIP; + break; + } else { + continue; + } + } +#undef SKIP_FLAGS + if (status & GRN_TOKEN_FORCE_PREFIX) { + token_cursor->force_prefix = GRN_TRUE; + } + if (token_cursor->curr_size == 0) { + if (token_cursor->status != GRN_TOKEN_CURSOR_DONE) { + char tokenizer_name[GRN_TABLE_MAX_KEY_SIZE]; + int tokenizer_name_length; + tokenizer_name_length = + grn_obj_name(ctx, token_cursor->tokenizer, + tokenizer_name, GRN_TABLE_MAX_KEY_SIZE); + GRN_LOG(ctx, GRN_WARN, + "[token_next] ignore an empty token: <%.*s>: <%.*s>", + tokenizer_name_length, tokenizer_name, + token_cursor->orig_blen, token_cursor->orig); + } + continue; + } + if (token_cursor->curr_size > GRN_TABLE_MAX_KEY_SIZE) { + GRN_LOG(ctx, GRN_WARN, + "[token_next] ignore too long token. " + "Token must be less than or equal to %d: <%d>(<%.*s>)", + GRN_TABLE_MAX_KEY_SIZE, + token_cursor->curr_size, + token_cursor->curr_size, token_cursor->curr); + continue; + } + if (status & GRN_TOKEN_UNMATURED) { + if (status & GRN_TOKEN_OVERLAP) { + if (token_cursor->mode == GRN_TOKENIZE_GET) { + token_cursor->pos++; + continue; + } + } else { + if (status & GRN_TOKEN_REACH_END) { + token_cursor->force_prefix = GRN_TRUE; + } + } + } + } else { + token_cursor->status = GRN_TOKEN_CURSOR_DONE; + } + if (token_cursor->mode == GRN_TOKENIZE_ADD) { + switch (table->header.type) { + case GRN_TABLE_PAT_KEY : + if (grn_io_lock(ctx, ((grn_pat *)table)->io, grn_lock_timeout)) { + tid = GRN_ID_NIL; + } else { + tid = grn_pat_add(ctx, (grn_pat *)table, token_cursor->curr, token_cursor->curr_size, + NULL, NULL); + grn_io_unlock(((grn_pat *)table)->io); + } + break; + case GRN_TABLE_DAT_KEY : + if (grn_io_lock(ctx, ((grn_dat *)table)->io, grn_lock_timeout)) { + tid = GRN_ID_NIL; + } else { + tid = grn_dat_add(ctx, (grn_dat *)table, token_cursor->curr, token_cursor->curr_size, + NULL, NULL); + grn_io_unlock(((grn_dat *)table)->io); + } + break; + case GRN_TABLE_HASH_KEY : + if (grn_io_lock(ctx, ((grn_hash *)table)->io, grn_lock_timeout)) { + tid = GRN_ID_NIL; + } else { + tid = grn_hash_add(ctx, (grn_hash *)table, token_cursor->curr, token_cursor->curr_size, + NULL, NULL); + grn_io_unlock(((grn_hash *)table)->io); + } + break; + case GRN_TABLE_NO_KEY : + if (token_cursor->curr_size == sizeof(grn_id)) { + tid = *((grn_id *)token_cursor->curr); + } else { + tid = GRN_ID_NIL; + } + break; + } + } else if (token_cursor->mode != GRN_TOKENIZE_ONLY) { + switch (table->header.type) { + case GRN_TABLE_PAT_KEY : + tid = grn_pat_get(ctx, (grn_pat *)table, token_cursor->curr, token_cursor->curr_size, NULL); + break; + case GRN_TABLE_DAT_KEY : + tid = grn_dat_get(ctx, (grn_dat *)table, token_cursor->curr, token_cursor->curr_size, NULL); + break; + case GRN_TABLE_HASH_KEY : + tid = grn_hash_get(ctx, (grn_hash *)table, token_cursor->curr, token_cursor->curr_size, NULL); + break; + case GRN_TABLE_NO_KEY : + if (token_cursor->curr_size == sizeof(grn_id)) { + tid = *((grn_id *)token_cursor->curr); + } else { + tid = GRN_ID_NIL; + } + break; + } + } + if (token_cursor->mode != GRN_TOKENIZE_ONLY && + tid == GRN_ID_NIL && token_cursor->status != GRN_TOKEN_CURSOR_DONE) { + token_cursor->status = GRN_TOKEN_CURSOR_NOT_FOUND; + } + token_cursor->pos++; + break; + } + return tid; +} + +static void +grn_token_cursor_close_token_filters(grn_ctx *ctx, + grn_token_cursor *token_cursor) +{ + grn_obj *token_filters = token_cursor->token_filter.objects; + unsigned int i, n_token_filters; + + if (!token_cursor->token_filter.data) { + return; + } + + if (token_filters) { + n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *); + } else { + n_token_filters = 0; + } + + if (n_token_filters == 0) { + return; + } + + for (i = 0; i < n_token_filters; i++) { + grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i); + grn_proc *token_filter = (grn_proc *)token_filter_object; + void *data = token_cursor->token_filter.data[i]; + + token_filter->callbacks.token_filter.fin(ctx, data); + } + GRN_FREE(token_cursor->token_filter.data); +} + +grn_rc +grn_token_cursor_close(grn_ctx *ctx, grn_token_cursor *token_cursor) +{ + if (token_cursor) { + if (token_cursor->tokenizer) { + ((grn_proc *)token_cursor->tokenizer)->funcs[PROC_FIN](ctx, 1, &token_cursor->table, + &token_cursor->pctx.user_data); + } + grn_token_cursor_close_token_filters(ctx, token_cursor); + if (token_cursor->nstr) { + grn_obj_close(ctx, token_cursor->nstr); + } + GRN_FREE(token_cursor); + return GRN_SUCCESS; + } else { + return GRN_INVALID_ARGUMENT; + } +} |