/* -*- c-basic-offset: 2 -*- */
/*
  Copyright(C) 2012-2014 Brazil

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License version 2.1 as published by the Free Software Foundation.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1335  USA
*/
#include "grn.h"
#include <groonga/tokenizer.h>

#include <string.h>

#include "grn_ctx.h"
#include "grn_db.h"
#include "grn_str.h"
#include "grn_string.h"
#include "grn_token_cursor.h"

/*
  Just for backward compatibility. See grn_plugin_charlen() instead.
  Forwards all arguments to grn_plugin_charlen() and returns its result.
 */
int
grn_tokenizer_charlen(grn_ctx *ctx, const char *str_ptr,
                      unsigned int str_length, grn_encoding encoding)
{
  return grn_plugin_charlen(ctx, str_ptr, str_length, encoding);
}

/*
  Just for backward compatibility. See grn_plugin_isspace() instead.
  Forwards all arguments to grn_plugin_isspace() and returns its result.
 */
int
grn_tokenizer_isspace(grn_ctx *ctx, const char *str_ptr,
                      unsigned int str_length, grn_encoding encoding)
{
  return grn_plugin_isspace(ctx, str_ptr, str_length, encoding);
}

/*
  Tests whether the `str_length' bytes at `str_ptr' are exactly the
  tokenized delimiter.

  Returns GRN_FALSE unless `encoding' is GRN_ENC_UTF8 and `str_length'
  equals GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN; otherwise returns
  the result of a byte-wise comparison against
  GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8.
 */
grn_bool
grn_tokenizer_is_tokenized_delimiter(grn_ctx *ctx,
                                     const char *str_ptr,
                                     unsigned int str_length,
                                     grn_encoding encoding)
{
  if (encoding != GRN_ENC_UTF8) {
    return GRN_FALSE;
  }

  if (str_length != GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN) {
    return GRN_FALSE;
  }

  return memcmp(str_ptr,
                GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8,
                GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN) == 0;
}

/*
  Scans the string character by character and reports whether any
  character is the tokenized delimiter.

  Returns GRN_FALSE immediately when `encoding' is not GRN_ENC_UTF8 or
  when `str_length' is 0. The scan stops as soon as grn_charlen_()
  returns a value that is not positive.
 */
grn_bool
grn_tokenizer_have_tokenized_delimiter(grn_ctx *ctx,
                                       const char *str_ptr,
                                       unsigned int str_length,
                                       grn_encoding encoding)
{
  int char_length;
  const char *current = str_ptr;
  const char *end = str_ptr + str_length;

  if (encoding != GRN_ENC_UTF8) {
    return GRN_FALSE;
  }

  if (str_length == 0) {
    return GRN_FALSE;
  }

  while ((char_length = grn_charlen_(ctx, current, end, encoding)) > 0) {
    if (grn_tokenizer_is_tokenized_delimiter(ctx,
                                             current, char_length,
                                             encoding)) {
      return GRN_TRUE;
    }
    current += char_length;
  }
  return GRN_FALSE;
}

/*
  Builds a grn_tokenizer_query from values popped off `ctx'.

  Three objects are popped from `ctx' in this order: flags, the query
  string, and the tokenize mode. They are popped before any argument
  validation takes place. `args[0]' is used as the table whose flags,
  encoding and normalizer are looked up with grn_table_get_info().

  The returned query holds:
    - normalized_query: a string object opened with grn_string_open_()
      using the table's normalizer (GRN_NORMALIZER_AUTO when the table
      has GRN_OBJ_KEY_NORMALIZE set), `normalize_flags' and the table
      encoding.
    - query_buf / ptr / length: a NUL-terminated private copy of the
      query string bytes.
    - flags: the popped flags value, or 0 when no flags were popped.
    - tokenize_mode / token_mode: the popped mode, or GRN_TOKENIZE_ADD
      when no mode was popped.
    - have_tokenized_delimiter: computed from the normalized string
      only when GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER is set in
      flags; GRN_FALSE otherwise.

  Returns NULL on failure. An error is reported with
  GRN_PLUGIN_ERROR() when the query string is missing, when the
  arguments are invalid, when the query copy cannot be allocated, or
  when the normalized string cannot be opened. Release the result with
  grn_tokenizer_query_close().
 */
grn_tokenizer_query *
grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
                         unsigned int normalize_flags)
{
  grn_obj *flags = grn_ctx_pop(ctx);
  grn_obj *query_str = grn_ctx_pop(ctx);
  grn_obj *tokenize_mode = grn_ctx_pop(ctx);

  if (query_str == NULL) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
    return NULL;
  }

  if ((num_args < 1) || (args == NULL) || (args[0] == NULL)) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
    return NULL;
  }

  {
    grn_tokenizer_query * const query =
        GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
    if (query == NULL) {
      return NULL;
    }
    /* Clear owned pointers first so every later failure path frees
       only what has actually been allocated. */
    query->normalized_query = NULL;
    query->query_buf = NULL;
    if (flags) {
      query->flags = GRN_UINT32_VALUE(flags);
    } else {
      query->flags = 0;
    }
    if (tokenize_mode) {
      query->tokenize_mode = GRN_UINT32_VALUE(tokenize_mode);
    } else {
      query->tokenize_mode = GRN_TOKENIZE_ADD;
    }
    query->token_mode = query->tokenize_mode;

    {
      grn_obj * const table = args[0];
      grn_table_flags table_flags;
      grn_encoding table_encoding;
      unsigned int query_length = GRN_TEXT_LEN(query_str);
      /* One extra byte for the terminating NUL. */
      char *query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, query_length + 1);
      grn_obj *normalizer = NULL;

      if (query_buf == NULL) {
        GRN_PLUGIN_FREE(ctx, query);
        GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                         "[tokenizer] failed to duplicate query");
        return NULL;
      }
      grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL,
                         &normalizer, NULL);
      {
        grn_obj *normalized_query;
        if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
          normalizer = GRN_NORMALIZER_AUTO;
        }
        normalized_query = grn_string_open_(ctx,
                                            GRN_TEXT_VALUE(query_str),
                                            GRN_TEXT_LEN(query_str),
                                            normalizer,
                                            normalize_flags,
                                            table_encoding);
        if (!normalized_query) {
          GRN_PLUGIN_FREE(ctx, query_buf);
          GRN_PLUGIN_FREE(ctx, query);
          GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
                           "[tokenizer] failed to open normalized string");
          return NULL;
        }
        query->normalized_query = normalized_query;
        grn_memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
        query_buf[query_length] = '\0';
        query->query_buf = query_buf;
        query->ptr = query_buf;
        query->length = query_length;
      }
      query->encoding = table_encoding;

      if (query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER) {
        const char *normalized_string;
        unsigned int normalized_string_length;

        grn_string_get_normalized(ctx,
                                  query->normalized_query,
                                  &normalized_string,
                                  &normalized_string_length,
                                  NULL);
        query->have_tokenized_delimiter =
          grn_tokenizer_have_tokenized_delimiter(ctx,
                                                 normalized_string,
                                                 normalized_string_length,
                                                 query->encoding);
      } else {
        query->have_tokenized_delimiter = GRN_FALSE;
      }
    }
    return query;
  }
}

/*
  Same as grn_tokenizer_query_open() with `normalize_flags' set to 0.
 */
grn_tokenizer_query *
grn_tokenizer_query_create(grn_ctx *ctx, int num_args, grn_obj **args)
{
  return grn_tokenizer_query_open(ctx, num_args, args, 0);
}

/*
  Releases a query: unlinks the normalized string, frees the private
  query copy, then frees the query itself. A NULL `query' is ignored.
 */
void
grn_tokenizer_query_close(grn_ctx *ctx, grn_tokenizer_query *query)
{
  if (query != NULL) {
    if (query->normalized_query != NULL) {
      grn_obj_unlink(ctx, query->normalized_query);
    }
    if (query->query_buf != NULL) {
      GRN_PLUGIN_FREE(ctx, query->query_buf);
    }
    GRN_PLUGIN_FREE(ctx, query);
  }
}

/*
  Alias of grn_tokenizer_query_close().
 */
void
grn_tokenizer_query_destroy(grn_ctx *ctx, grn_tokenizer_query *query)
{
  grn_tokenizer_query_close(ctx, query);
}

/*
  Initializes the two objects held by `token': `str' as a text object
  with GRN_OBJ_DO_SHALLOW_COPY, and `status' as a uint32 object.
 */
void
grn_tokenizer_token_init(grn_ctx *ctx, grn_tokenizer_token *token)
{
  GRN_TEXT_INIT(&token->str, GRN_OBJ_DO_SHALLOW_COPY);
  GRN_UINT32_INIT(&token->status, 0);
}

/*
  Finalizes the `str' and `status' objects held by `token'.
 */
void
grn_tokenizer_token_fin(grn_ctx *ctx, grn_tokenizer_token *token)
{
  GRN_OBJ_FIN(ctx, &(token->str));
  GRN_OBJ_FIN(ctx, &(token->status));
}

/*
  Stores a token and its status in `token' and pushes both onto `ctx':
  first `token->str', then `token->status'.

  The string is set with GRN_TEXT_SET_REF(), i.e. by reference.
  NOTE(review): presumably the caller must keep `str_ptr' valid until
  the pushed objects have been consumed -- confirm against callers.
 */
void
grn_tokenizer_token_push(grn_ctx *ctx, grn_tokenizer_token *token,
                         const char *str_ptr, unsigned int str_length,
                         grn_token_status status)
{
  GRN_TEXT_SET_REF(&token->str, str_ptr, str_length);
  GRN_UINT32_SET(ctx, &token->status, status);
  grn_ctx_push(ctx, &token->str);
  grn_ctx_push(ctx, &token->status);
}

/*
  Pushes the token that runs from `str_ptr' up to (not including) the
  next tokenized delimiter.

  The scan stops at the first tokenized delimiter, at the end of the
  string, or when grn_charlen_() reports a non-positive length. The
  pushed status is GRN_TOKENIZER_LAST only when the scan reached the
  end of the string; otherwise it is GRN_TOKENIZER_CONTINUE.

  Returns a pointer to the byte just after the delimiter when one was
  found, or NULL when no delimiter was found.
 */
const char *
grn_tokenizer_tokenized_delimiter_next(grn_ctx *ctx,
                                       grn_tokenizer_token *token,
                                       const char *str_ptr,
                                       unsigned int str_length,
                                       grn_encoding encoding)
{
  /* Signed, like in grn_tokenizer_have_tokenized_delimiter(), so that
     a negative result cannot wrap around to a huge length. */
  int char_length = 0;
  const char *start = str_ptr;
  const char *current;
  const char *end = str_ptr + str_length;
  const char *next_start = NULL;
  unsigned int token_length;
  grn_token_status status;

  for (current = start; current < end; current += char_length) {
    char_length = grn_charlen_(ctx, current, end, encoding);
    if (char_length <= 0) {
      break;
    }
    if (grn_tokenizer_is_tokenized_delimiter(ctx, current, char_length,
                                             encoding)) {
      next_start = current + char_length;
      break;
    }
  }

  token_length = (unsigned int)(current - start);
  if (current == end) {
    status = GRN_TOKENIZER_LAST;
  } else {
    status = GRN_TOKENIZER_CONTINUE;
  }
  grn_tokenizer_token_push(ctx, token, start, token_length, status);

  return next_start;
}

/*
  Registers a tokenizer named by `plugin_name_ptr'/`plugin_name_length'
  with the given init/next/fin callbacks.

  The proc is created with GRN_PROC_TOKENIZER and three variables: two
  text values and one uint32 value. Returns GRN_SUCCESS on success;
  when grn_proc_create() returns NULL an error is reported and
  `ctx->rc' is returned.
 */
grn_rc
grn_tokenizer_register(grn_ctx *ctx, const char *plugin_name_ptr,
                       unsigned int plugin_name_length,
                       grn_proc_func *init, grn_proc_func *next,
                       grn_proc_func *fin)
{
  grn_expr_var vars[] = {
    { NULL, 0 },
    { NULL, 0 },
    { NULL, 0 }
  };
  GRN_TEXT_INIT(&vars[0].value, 0);
  GRN_TEXT_INIT(&vars[1].value, 0);
  GRN_UINT32_INIT(&vars[2].value, 0);

  {
    /*
      grn_proc_create() registers a plugin to the database which is
      associated with `ctx'. A returned object must not be finalized here.
     */
    grn_obj * const obj = grn_proc_create(ctx, plugin_name_ptr,
                                          plugin_name_length,
                                          GRN_PROC_TOKENIZER,
                                          init, next, fin, 3, vars);
    if (obj == NULL) {
      GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, "grn_proc_create() failed");
      return ctx->rc;
    }
  }
  return GRN_SUCCESS;
}

/*
  Returns a pointer to the data object embedded in `token'.
  Reports GRN_INVALID_ARGUMENT and returns NULL when `token' is NULL.
 */
grn_obj *
grn_token_get_data(grn_ctx *ctx, grn_token *token)
{
  GRN_API_ENTER;
  if (!token) {
    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
    GRN_API_RETURN(NULL);
  }
  GRN_API_RETURN(&(token->data));
}

/*
  Sets the data of `token' to the given string.

  When `str_length' is -1 the length is taken with strlen(), so
  `str_ptr' must then be a NUL-terminated string. Reports
  GRN_INVALID_ARGUMENT when `token' is NULL, or when `str_ptr' is NULL
  while `str_length' is -1. Returns `ctx->rc'.
 */
grn_rc
grn_token_set_data(grn_ctx *ctx,
                   grn_token *token,
                   const char *str_ptr,
                   int str_length)
{
  GRN_API_ENTER;
  if (!token) {
    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
    goto exit;
  }
  if (str_length == -1) {
    /* strlen(NULL) is undefined behavior; reject it explicitly. */
    if (!str_ptr) {
      ERR(GRN_INVALID_ARGUMENT,
          "str_ptr must not be NULL when str_length is -1");
      goto exit;
    }
    str_length = (int)strlen(str_ptr);
  }
  GRN_TEXT_SET(ctx, &(token->data), str_ptr, str_length);
exit:
  GRN_API_RETURN(ctx->rc);
}

/*
  Returns the status stored in `token'.
  Reports GRN_INVALID_ARGUMENT and returns GRN_TOKEN_CONTINUE when
  `token' is NULL.
 */
grn_token_status
grn_token_get_status(grn_ctx *ctx, grn_token *token)
{
  GRN_API_ENTER;
  if (!token) {
    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
    GRN_API_RETURN(GRN_TOKEN_CONTINUE);
  }
  GRN_API_RETURN(token->status);
}

/*
  Stores `status' in `token'.
  Reports GRN_INVALID_ARGUMENT when `token' is NULL. Returns `ctx->rc'.
 */
grn_rc
grn_token_set_status(grn_ctx *ctx,
                     grn_token *token,
                     grn_token_status status)
{
  GRN_API_ENTER;
  if (!token) {
    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
    goto exit;
  }
  token->status = status;
exit:
  GRN_API_RETURN(ctx->rc);
}