summary | refs | log | tree | commit | diff | stats
path: root/storage/mroonga/vendor/groonga/lib/tokenizer.c
diff options
context:
space:
mode:
Diffstat (limited to 'storage/mroonga/vendor/groonga/lib/tokenizer.c')
-rw-r--r-- storage/mroonga/vendor/groonga/lib/tokenizer.c 375
1 files changed, 375 insertions, 0 deletions
diff --git a/storage/mroonga/vendor/groonga/lib/tokenizer.c b/storage/mroonga/vendor/groonga/lib/tokenizer.c
new file mode 100644
index 00000000..faf47fd6
--- /dev/null
+++ b/storage/mroonga/vendor/groonga/lib/tokenizer.c
@@ -0,0 +1,375 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+ Copyright(C) 2012-2014 Brazil
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License version 2.1 as published by the Free Software Foundation.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+*/
+#include "grn.h"
+#include <groonga/tokenizer.h>
+
+#include <string.h>
+
+#include "grn_ctx.h"
+#include "grn_db.h"
+#include "grn_str.h"
+#include "grn_string.h"
+#include "grn_token_cursor.h"
+
+/*
+  Kept only for backward compatibility; use grn_plugin_charlen() instead.
+
+  All four arguments are handed over unchanged to grn_plugin_charlen()
+  and whatever it returns is returned to the caller as is.
+ */
+int
+grn_tokenizer_charlen(grn_ctx *ctx, const char *str_ptr,
+                      unsigned int str_length, grn_encoding encoding)
+{
+  const int char_length =
+    grn_plugin_charlen(ctx, str_ptr, str_length, encoding);
+
+  return char_length;
+}
+
+/*
+  Kept only for backward compatibility; use grn_plugin_isspace() instead.
+
+  All four arguments are handed over unchanged to grn_plugin_isspace()
+  and whatever it returns is returned to the caller as is.
+ */
+int
+grn_tokenizer_isspace(grn_ctx *ctx, const char *str_ptr,
+                      unsigned int str_length, grn_encoding encoding)
+{
+  const int space_length =
+    grn_plugin_isspace(ctx, str_ptr, str_length, encoding);
+
+  return space_length;
+}
+
+/*
+  Tells whether the str_length bytes at str_ptr are exactly the
+  tokenized delimiter.
+
+  Only GRN_ENC_UTF8 is handled: any other encoding, or a length that
+  differs from GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN, yields
+  GRN_FALSE without the bytes being inspected. Otherwise the bytes are
+  compared against GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8.
+
+  ctx is accepted but not used by this function.
+ */
+grn_bool
+grn_tokenizer_is_tokenized_delimiter(grn_ctx *ctx,
+                                     const char *str_ptr,
+                                     unsigned int str_length,
+                                     grn_encoding encoding)
+{
+  const grn_bool comparable =
+    (encoding == GRN_ENC_UTF8) &&
+    (str_length == GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN);
+
+  if (!comparable) {
+    return GRN_FALSE;
+  }
+
+  return memcmp(str_ptr,
+                GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8,
+                GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN) == 0;
+}
+
+/*
+  Scans the str_length bytes at str_ptr character by character and
+  reports whether any character is the tokenized delimiter.
+
+  Returns GRN_FALSE right away when encoding is not GRN_ENC_UTF8 or when
+  str_length is 0. The scan advances by the length reported by
+  grn_charlen_() and stops as soon as that length is not positive.
+ */
+grn_bool
+grn_tokenizer_have_tokenized_delimiter(grn_ctx *ctx,
+                                       const char *str_ptr,
+                                       unsigned int str_length,
+                                       grn_encoding encoding)
+{
+  const char *cursor;
+  const char *limit;
+  int length;
+
+  if ((encoding != GRN_ENC_UTF8) || (str_length == 0)) {
+    return GRN_FALSE;
+  }
+
+  cursor = str_ptr;
+  limit = str_ptr + str_length;
+  for (;;) {
+    length = grn_charlen_(ctx, cursor, limit, encoding);
+    if (length <= 0) {
+      break;
+    }
+    if (grn_tokenizer_is_tokenized_delimiter(ctx, cursor, length, encoding)) {
+      return GRN_TRUE;
+    }
+    cursor += length;
+  }
+
+  return GRN_FALSE;
+}
+
+/*
+  Builds a grn_tokenizer_query from values popped off the ctx stack.
+
+  Three objects are popped, in this order: the flags, the query string
+  and the tokenize mode. A NULL flags object means flags 0 and a NULL
+  tokenize mode object means GRN_TOKENIZE_ADD. args[0] is used as the
+  table that is passed to grn_table_get_info() to obtain the table flags,
+  the encoding and the normalizer; GRN_NORMALIZER_AUTO replaces the
+  normalizer when the table flags contain GRN_OBJ_KEY_NORMALIZE.
+
+  The query text is normalized with grn_string_open_() and is also
+  copied into a NUL-terminated buffer stored in the query.
+  have_tokenized_delimiter is computed from the normalized text only when
+  GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER is set in the flags.
+
+  Returns NULL when the query string is missing, when num_args/args do
+  not provide a table, or when an allocation or the normalization fails.
+  Nothing allocated here is left behind on those paths.
+ */
+grn_tokenizer_query *
+grn_tokenizer_query_open(grn_ctx *ctx, int num_args, grn_obj **args,
+                         unsigned int normalize_flags)
+{
+  /* The pops happen before any validation, so the stack is consumed
+     even when this function fails. */
+  grn_obj *flags = grn_ctx_pop(ctx);
+  grn_obj *query_str = grn_ctx_pop(ctx);
+  grn_obj *tokenize_mode = grn_ctx_pop(ctx);
+  grn_tokenizer_query *query;
+  grn_obj *table;
+  grn_obj *normalizer = NULL;
+  grn_table_flags table_flags;
+  grn_encoding table_encoding;
+  unsigned int query_length;
+  char *query_buf;
+
+  if (query_str == NULL) {
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "missing argument");
+    return NULL;
+  }
+
+  if (!((num_args >= 1) && (args != NULL) && (args[0] != NULL))) {
+    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, "invalid NULL pointer");
+    return NULL;
+  }
+
+  query = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_tokenizer_query));
+  if (query == NULL) {
+    return NULL;
+  }
+  query->normalized_query = NULL;
+  query->query_buf = NULL;
+
+  query->flags = 0;
+  if (flags) {
+    query->flags = GRN_UINT32_VALUE(flags);
+  }
+
+  query->tokenize_mode = GRN_TOKENIZE_ADD;
+  if (tokenize_mode) {
+    query->tokenize_mode = GRN_UINT32_VALUE(tokenize_mode);
+  }
+  query->token_mode = query->tokenize_mode;
+
+  table = args[0];
+  query_length = GRN_TEXT_LEN(query_str);
+  /* One extra byte for the terminating NUL written below. */
+  query_buf = (char *)GRN_PLUGIN_MALLOC(ctx, query_length + 1);
+  if (query_buf == NULL) {
+    GRN_PLUGIN_FREE(ctx, query);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer] failed to duplicate query");
+    return NULL;
+  }
+
+  /* NOTE(review): any status reported by grn_table_get_info() is ignored
+     and table_flags/table_encoding are read below without having been
+     initialized here -- TODO confirm it always fills them in. */
+  grn_table_get_info(ctx, table, &table_flags, &table_encoding, NULL,
+                     &normalizer, NULL);
+  if (table_flags & GRN_OBJ_KEY_NORMALIZE) {
+    normalizer = GRN_NORMALIZER_AUTO;
+  }
+
+  query->normalized_query = grn_string_open_(ctx,
+                                             GRN_TEXT_VALUE(query_str),
+                                             GRN_TEXT_LEN(query_str),
+                                             normalizer,
+                                             normalize_flags,
+                                             table_encoding);
+  if (query->normalized_query == NULL) {
+    GRN_PLUGIN_FREE(ctx, query_buf);
+    GRN_PLUGIN_FREE(ctx, query);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR,
+                     "[tokenizer] failed to open normalized string");
+    return NULL;
+  }
+
+  /* Keep a private NUL-terminated copy of the raw (not normalized)
+     query text. */
+  grn_memcpy(query_buf, GRN_TEXT_VALUE(query_str), query_length);
+  query_buf[query_length] = '\0';
+  query->query_buf = query_buf;
+  query->ptr = query_buf;
+  query->length = query_length;
+  query->encoding = table_encoding;
+
+  query->have_tokenized_delimiter = GRN_FALSE;
+  if (query->flags & GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER) {
+    const char *normalized;
+    unsigned int normalized_length;
+
+    grn_string_get_normalized(ctx,
+                              query->normalized_query,
+                              &normalized,
+                              &normalized_length,
+                              NULL);
+    query->have_tokenized_delimiter =
+      grn_tokenizer_have_tokenized_delimiter(ctx,
+                                             normalized,
+                                             normalized_length,
+                                             query->encoding);
+  }
+
+  return query;
+}
+
+/*
+  Same as grn_tokenizer_query_open() with normalize_flags fixed to 0.
+  ctx, num_args and args are forwarded unchanged and the result of
+  grn_tokenizer_query_open() is returned as is.
+ */
+grn_tokenizer_query *
+grn_tokenizer_query_create(grn_ctx *ctx, int num_args, grn_obj **args)
+{
+  const unsigned int normalize_flags = 0;
+
+  return grn_tokenizer_query_open(ctx, num_args, args, normalize_flags);
+}
+
+/*
+  Releases a query: unlinks its normalized string, frees its query
+  buffer and finally frees the query structure itself. Members that are
+  NULL are skipped, and a NULL query is accepted and ignored.
+ */
+void
+grn_tokenizer_query_close(grn_ctx *ctx, grn_tokenizer_query *query)
+{
+  if (query == NULL) {
+    return;
+  }
+
+  if (query->normalized_query != NULL) {
+    grn_obj_unlink(ctx, query->normalized_query);
+  }
+  if (query->query_buf != NULL) {
+    GRN_PLUGIN_FREE(ctx, query->query_buf);
+  }
+  GRN_PLUGIN_FREE(ctx, query);
+}
+
+/*
+  Alternative name for grn_tokenizer_query_close(): ctx and query are
+  forwarded to it unchanged and nothing else is done.
+ */
+void
+grn_tokenizer_query_destroy(grn_ctx *ctx, grn_tokenizer_query *query)
+{
+  grn_tokenizer_query_close(ctx, query);
+}
+
+/*
+  Prepares the two objects embedded in a token: str is initialized with
+  GRN_TEXT_INIT() using GRN_OBJ_DO_SHALLOW_COPY, and status is
+  initialized with GRN_UINT32_INIT() using 0.
+
+  ctx is accepted but not used by this function.
+ */
+void
+grn_tokenizer_token_init(grn_ctx *ctx, grn_tokenizer_token *token)
+{
+  grn_obj *text = &(token->str);
+  grn_obj *status = &(token->status);
+
+  GRN_TEXT_INIT(text, GRN_OBJ_DO_SHALLOW_COPY);
+  GRN_UINT32_INIT(status, 0);
+}
+
+/*
+  Finalizes the two objects embedded in a token with GRN_OBJ_FIN():
+  first str, then status. The token structure itself is not freed here.
+ */
+void
+grn_tokenizer_token_fin(grn_ctx *ctx, grn_tokenizer_token *token)
+{
+  grn_obj *text = &(token->str);
+  grn_obj *status = &(token->status);
+
+  GRN_OBJ_FIN(ctx, text);
+  GRN_OBJ_FIN(ctx, status);
+}
+
+/*
+  Stores one token in `token' and pushes it onto the ctx stack.
+
+  token->str is set with GRN_TEXT_SET_REF() to the str_length bytes at
+  str_ptr and token->status is set to `status'. The two objects are then
+  pushed in this order: str first, status second.
+
+  NOTE(review): because a reference is stored, str_ptr presumably has
+  to stay valid while the pushed token is in use -- confirm.
+ */
+void
+grn_tokenizer_token_push(grn_ctx *ctx, grn_tokenizer_token *token,
+                         const char *str_ptr, unsigned int str_length,
+                         grn_token_status status)
+{
+  grn_obj *text = &(token->str);
+  grn_obj *status_value = &(token->status);
+
+  GRN_TEXT_SET_REF(text, str_ptr, str_length);
+  GRN_UINT32_SET(ctx, status_value, status);
+  grn_ctx_push(ctx, text);
+  grn_ctx_push(ctx, status_value);
+}
+
+/*
+  Pushes the token that precedes the next tokenized delimiter.
+
+  The str_length bytes at str_ptr are scanned character by character.
+  The scan stops at the first tokenized delimiter, at the end of the
+  input, or when grn_charlen_() reports a length of 0. The bytes scanned
+  before the stop position are pushed with grn_tokenizer_token_push().
+
+  The pushed status is GRN_TOKENIZER_LAST only when the scan reached the
+  end of the input; in every other case, including a delimiter that is
+  the final character, it is GRN_TOKENIZER_CONTINUE.
+
+  Returns a pointer to the byte right after the delimiter, or NULL when
+  no delimiter was found.
+ */
+const char *
+grn_tokenizer_tokenized_delimiter_next(grn_ctx *ctx,
+                                       grn_tokenizer_token *token,
+                                       const char *str_ptr,
+                                       unsigned int str_length,
+                                       grn_encoding encoding)
+{
+  const char *start = str_ptr;
+  const char *end = str_ptr + str_length;
+  const char *current = start;
+  const char *next_start = NULL;
+  size_t char_length = 0;
+  unsigned int token_length;
+  grn_token_status status;
+
+  while (current < end) {
+    char_length = grn_charlen_(ctx, current, end, encoding);
+    if (char_length == 0) {
+      break;
+    }
+    if (grn_tokenizer_is_tokenized_delimiter(ctx, current, char_length,
+                                             encoding)) {
+      /* The next token starts right after the delimiter. */
+      next_start = current + char_length;
+      break;
+    }
+    current += char_length;
+  }
+
+  token_length = current - start;
+  status = (current == end) ? GRN_TOKENIZER_LAST : GRN_TOKENIZER_CONTINUE;
+  grn_tokenizer_token_push(ctx, token, start, token_length, status);
+
+  return next_start;
+}
+
+/*
+  Registers a tokenizer under the given name by calling
+  grn_proc_create() with GRN_PROC_TOKENIZER, the init/next/fin callbacks
+  and three unnamed variables (two text values and one UINT32 value).
+
+  Returns GRN_SUCCESS when grn_proc_create() returns an object.
+  Otherwise an error is reported with GRN_PLUGIN_ERROR() and ctx->rc is
+  returned.
+ */
+grn_rc
+grn_tokenizer_register(grn_ctx *ctx, const char *plugin_name_ptr,
+                       unsigned int plugin_name_length,
+                       grn_proc_func *init, grn_proc_func *next,
+                       grn_proc_func *fin)
+{
+  grn_expr_var vars[] = {
+    { NULL, 0 },
+    { NULL, 0 },
+    { NULL, 0 }
+  };
+  const unsigned int n_vars = sizeof(vars) / sizeof(vars[0]);
+  grn_obj *proc;
+
+  GRN_TEXT_INIT(&(vars[0].value), 0);
+  GRN_TEXT_INIT(&(vars[1].value), 0);
+  GRN_UINT32_INIT(&(vars[2].value), 0);
+
+  /*
+    grn_proc_create() registers a plugin to the database which is
+    associated with `ctx'. The returned object must not be finalized
+    here.
+   */
+  proc = grn_proc_create(ctx, plugin_name_ptr, plugin_name_length,
+                         GRN_PROC_TOKENIZER,
+                         init, next, fin, n_vars, vars);
+  if (proc != NULL) {
+    return GRN_SUCCESS;
+  }
+
+  GRN_PLUGIN_ERROR(ctx, GRN_TOKENIZER_ERROR, "grn_proc_create() failed");
+  return ctx->rc;
+}
+
+/*
+  Returns the address of the data object embedded in `token'.
+
+  When token is NULL, GRN_INVALID_ARGUMENT is reported through ERR() and
+  NULL is returned.
+ */
+grn_obj *
+grn_token_get_data(grn_ctx *ctx, grn_token *token)
+{
+  GRN_API_ENTER;
+  if (token) {
+    GRN_API_RETURN(&(token->data));
+  }
+  ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
+  GRN_API_RETURN(NULL);
+}
+
+/*
+  Replaces the data of `token' with the str_length bytes at str_ptr.
+
+  A str_length of -1 means "measure str_ptr with strlen()", so in that
+  case str_ptr has to be a NUL-terminated string.
+
+  When token is NULL, GRN_INVALID_ARGUMENT is reported through ERR() and
+  nothing is written. ctx->rc is returned in every case.
+ */
+grn_rc
+grn_token_set_data(grn_ctx *ctx,
+                   grn_token *token,
+                   const char *str_ptr,
+                   int str_length)
+{
+  GRN_API_ENTER;
+  if (token) {
+    if (str_length == -1) {
+      str_length = strlen(str_ptr);
+    }
+    GRN_TEXT_SET(ctx, &(token->data), str_ptr, str_length);
+  } else {
+    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
+  }
+  GRN_API_RETURN(ctx->rc);
+}
+
+/*
+  Returns the status stored in `token'.
+
+  When token is NULL, GRN_INVALID_ARGUMENT is reported through ERR() and
+  GRN_TOKEN_CONTINUE is returned.
+ */
+grn_token_status
+grn_token_get_status(grn_ctx *ctx, grn_token *token)
+{
+  GRN_API_ENTER;
+  if (token) {
+    GRN_API_RETURN(token->status);
+  }
+  ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
+  GRN_API_RETURN(GRN_TOKEN_CONTINUE);
+}
+
+/*
+  Stores `status' in `token'.
+
+  When token is NULL, GRN_INVALID_ARGUMENT is reported through ERR() and
+  nothing is written. ctx->rc is returned in every case.
+ */
+grn_rc
+grn_token_set_status(grn_ctx *ctx,
+                     grn_token *token,
+                     grn_token_status status)
+{
+  GRN_API_ENTER;
+  if (token) {
+    token->status = status;
+  } else {
+    ERR(GRN_INVALID_ARGUMENT, "token must not be NULL");
+  }
+  GRN_API_RETURN(ctx->rc);
+}