diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:07:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:07:14 +0000 |
commit | a175314c3e5827eb193872241446f2f8f5c9d33c (patch) | |
tree | cd3d60ca99ae00829c52a6ca79150a5b6e62528b /storage/innobase/fts/fts0ast.cc | |
parent | Initial commit. (diff) | |
download | mariadb-10.5-9e4947182e0b875da38088fdd168e775f473b8ad.tar.xz mariadb-10.5-9e4947182e0b875da38088fdd168e775f473b8ad.zip |
Adding upstream version 1:10.5.12.upstream/1%10.5.12upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/fts/fts0ast.cc')
-rw-r--r-- | storage/innobase/fts/fts0ast.cc | 815 |
1 files changed, 815 insertions, 0 deletions
diff --git a/storage/innobase/fts/fts0ast.cc b/storage/innobase/fts/fts0ast.cc new file mode 100644 index 00000000..bb42f7c9 --- /dev/null +++ b/storage/innobase/fts/fts0ast.cc @@ -0,0 +1,815 @@ +/***************************************************************************** + +Copyright (c) 2007, 2020, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file fts/fts0ast.cc +Full Text Search parser helper file. + +Created 2007/3/16 Sunny Bains. +***********************************************************************/ + +#include "row0sel.h" +#include "fts0ast.h" +#include "fts0pars.h" +#include "fts0fts.h" + +/* The FTS ast visit pass. */ +enum fts_ast_visit_pass_t { + FTS_PASS_FIRST, /*!< First visit pass, + process operators excluding + FTS_EXIST and FTS_IGNORE */ + FTS_PASS_EXIST, /*!< Exist visit pass, + process operator FTS_EXIST */ + FTS_PASS_IGNORE /*!< Ignore visit pass, + process operator FTS_IGNORE */ +}; + +/******************************************************************//** +Create an empty fts_ast_node_t. +@return Create a new node */ +static +fts_ast_node_t* +fts_ast_node_create(void) +/*=====================*/ +{ + fts_ast_node_t* node; + + node = (fts_ast_node_t*) ut_zalloc_nokey(sizeof(*node)); + + return(node); +} + +/** Track node allocations, in case there is an error during parsing. */ +static +void +fts_ast_state_add_node( + fts_ast_state_t*state, /*!< in: ast instance */ + fts_ast_node_t* node) /*!< in: node to add to ast */ +{ + if (!state->list.head) { + ut_a(!state->list.tail); + + state->list.head = state->list.tail = node; + } else { + state->list.tail->next_alloc = node; + state->list.tail = node; + } +} + +/******************************************************************//** +Create a operator fts_ast_node_t. +@return new node */ +fts_ast_node_t* +fts_ast_create_node_oper( +/*=====================*/ + void* arg, /*!< in: ast state instance */ + fts_ast_oper_t oper) /*!< in: ast operator */ +{ + fts_ast_node_t* node = fts_ast_node_create(); + + node->type = FTS_AST_OPER; + node->oper = oper; + + fts_ast_state_add_node((fts_ast_state_t*) arg, node); + + return(node); +} + +/******************************************************************//** +This function takes ownership of the ptr and is responsible +for free'ing it +@return new node or a node list with tokenized words */ +fts_ast_node_t* +fts_ast_create_node_term( +/*=====================*/ + void* arg, /*!< in: ast state instance */ + const fts_ast_string_t* ptr) /*!< in: ast term string */ +{ + fts_ast_state_t* state = static_cast<fts_ast_state_t*>(arg); + ulint len = ptr->len; + ulint cur_pos = 0; + fts_ast_node_t* node = NULL; + fts_ast_node_t* node_list = NULL; + fts_ast_node_t* first_node = NULL; + + /* Scan the incoming string and filter out any "non-word" characters */ + while (cur_pos < len) { + fts_string_t str; + ulint cur_len; + + cur_len = innobase_mysql_fts_get_token( + state->charset, + reinterpret_cast<const byte*>(ptr->str) + cur_pos, + reinterpret_cast<const byte*>(ptr->str) + len, &str); + + if (cur_len == 0) { + break; + } + + cur_pos += cur_len; + + if (str.f_n_char > 0) { + /* If the subsequent term (after the first one)'s size + is less than fts_min_token_size or the term is greater + than fts_max_token_size, we shall ignore that. This is + to make consistent with MyISAM behavior */ + if ((first_node && (str.f_n_char < fts_min_token_size)) + || str.f_n_char > fts_max_token_size) { + continue; + } + + node = fts_ast_node_create(); + + node->type = FTS_AST_TERM; + + node->term.ptr = fts_ast_string_create( + str.f_str, str.f_len); + + fts_ast_state_add_node( + static_cast<fts_ast_state_t*>(arg), node); + + if (first_node) { + /* There is more than one word, create + a list to organize them */ + if (!node_list) { + node_list = fts_ast_create_node_list( + static_cast<fts_ast_state_t*>( + arg), + first_node); + } + + fts_ast_add_node(node_list, node); + } else { + first_node = node; + } + } + } + + return((node_list != NULL) ? node_list : first_node); +} + +/******************************************************************//** +Create an AST term node, makes a copy of ptr for plugin parser +@return node */ +fts_ast_node_t* +fts_ast_create_node_term_for_parser( +/*================================*/ + void* arg, /*!< in: ast state */ + const char* ptr, /*!< in: term string */ + const ulint len) /*!< in: term string length */ +{ + fts_ast_node_t* node = NULL; + + /* '%' as first char is forbidden for LIKE in internal SQL parser; + '%' as last char is reserved for wildcard search;*/ + if (len == 0 || len > FTS_MAX_WORD_LEN + || ptr[0] == '%' || ptr[len - 1] == '%') { + return(NULL); + } + + node = fts_ast_node_create(); + + node->type = FTS_AST_TERM; + + node->term.ptr = fts_ast_string_create( + reinterpret_cast<const byte*>(ptr), len); + + fts_ast_state_add_node(static_cast<fts_ast_state_t*>(arg), node); + + return(node); +} + +/******************************************************************//** +This function takes ownership of the ptr and is responsible +for free'ing it. +@return new node */ +fts_ast_node_t* +fts_ast_create_node_text( +/*=====================*/ + void* arg, /*!< in: ast state instance */ + const fts_ast_string_t* ptr) /*!< in: ast text string */ +{ + ulint len = ptr->len; + fts_ast_node_t* node = NULL; + + /* Once we come here, the string must have at least 2 quotes "" + around the query string, which could be empty. Also the query + string may contain 0x00 in it, we don't treat it as null-terminated. */ + ut_ad(len >= 2); + ut_ad(ptr->str[0] == '\"' && ptr->str[len - 1] == '\"'); + + if (len == 2) { + /* If the query string contains nothing except quotes, + it's obviously an invalid query. */ + return(NULL); + } + + node = fts_ast_node_create(); + + /*!< We ignore the actual quotes "" */ + len -= 2; + + node->type = FTS_AST_TEXT; + /*!< Skip copying the first quote */ + node->text.ptr = fts_ast_string_create( + reinterpret_cast<const byte*>(ptr->str + 1), len); + node->text.distance = ULINT_UNDEFINED; + + fts_ast_state_add_node((fts_ast_state_t*) arg, node); + + return(node); +} + +/******************************************************************//** +Create an AST phrase list node for plugin parser +@return node */ +fts_ast_node_t* +fts_ast_create_node_phrase_list( +/*============================*/ + void* arg) /*!< in: ast state */ +{ + fts_ast_node_t* node = fts_ast_node_create(); + + node->type = FTS_AST_PARSER_PHRASE_LIST; + + node->text.distance = ULINT_UNDEFINED; + node->list.head = node->list.tail = NULL; + + fts_ast_state_add_node(static_cast<fts_ast_state_t*>(arg), node); + + return(node); +} + +/******************************************************************//** +This function takes ownership of the expr and is responsible +for free'ing it. +@return new node */ +fts_ast_node_t* +fts_ast_create_node_list( +/*=====================*/ + void* arg, /*!< in: ast state instance */ + fts_ast_node_t* expr) /*!< in: ast expr instance */ +{ + fts_ast_node_t* node = fts_ast_node_create(); + + node->type = FTS_AST_LIST; + node->list.head = node->list.tail = expr; + + fts_ast_state_add_node((fts_ast_state_t*) arg, node); + + return(node); +} + +/******************************************************************//** +Create a sub-expression list node. This function takes ownership of +expr and is responsible for deleting it. +@return new node */ +fts_ast_node_t* +fts_ast_create_node_subexp_list( +/*============================*/ + void* arg, /*!< in: ast state instance */ + fts_ast_node_t* expr) /*!< in: ast expr instance */ +{ + fts_ast_node_t* node = fts_ast_node_create(); + + node->type = FTS_AST_SUBEXP_LIST; + node->list.head = node->list.tail = expr; + + fts_ast_state_add_node((fts_ast_state_t*) arg, node); + + return(node); +} + +/******************************************************************//** +Free an expr list node elements. */ +static +void +fts_ast_free_list( +/*==============*/ + fts_ast_node_t* node) /*!< in: ast node to free */ +{ + ut_a(node->type == FTS_AST_LIST + || node->type == FTS_AST_SUBEXP_LIST + || node->type == FTS_AST_PARSER_PHRASE_LIST); + + for (node = node->list.head; + node != NULL; + node = fts_ast_free_node(node)) { + + /*!< No op */ + } +} + +/********************************************************************//** +Free a fts_ast_node_t instance. +@return next node to free */ +fts_ast_node_t* +fts_ast_free_node( +/*==============*/ + fts_ast_node_t* node) /*!< in: the node to free */ +{ + fts_ast_node_t* next_node; + + switch (node->type) { + case FTS_AST_TEXT: + if (node->text.ptr) { + fts_ast_string_free(node->text.ptr); + node->text.ptr = NULL; + } + break; + + case FTS_AST_TERM: + if (node->term.ptr) { + fts_ast_string_free(node->term.ptr); + node->term.ptr = NULL; + } + break; + + case FTS_AST_LIST: + case FTS_AST_SUBEXP_LIST: + case FTS_AST_PARSER_PHRASE_LIST: + fts_ast_free_list(node); + node->list.head = node->list.tail = NULL; + break; + + case FTS_AST_OPER: + break; + + default: + ut_error; + } + + /*!< Get next node before freeing the node itself */ + next_node = node->next; + + ut_free(node); + + return(next_node); +} + +/******************************************************************//** +This AST takes ownership of the expr and is responsible +for free'ing it. +@return in param "list" */ +fts_ast_node_t* +fts_ast_add_node( +/*=============*/ + fts_ast_node_t* node, /*!< in: list instance */ + fts_ast_node_t* elem) /*!< in: node to add to list */ +{ + if (!elem) { + return(NULL); + } + + ut_a(!elem->next); + ut_a(node->type == FTS_AST_LIST + || node->type == FTS_AST_SUBEXP_LIST + || node->type == FTS_AST_PARSER_PHRASE_LIST); + + if (!node->list.head) { + ut_a(!node->list.tail); + + node->list.head = node->list.tail = elem; + } else { + ut_a(node->list.tail); + + node->list.tail->next = elem; + node->list.tail = elem; + } + + return(node); +} + +/******************************************************************//** +Set the wildcard attribute of a term. */ +void +fts_ast_term_set_wildcard( +/*======================*/ + fts_ast_node_t* node) /*!< in/out: set attribute of + a term node */ +{ + if (!node) { + return; + } + + /* If it's a node list, the wildcard should be set to the tail node*/ + if (node->type == FTS_AST_LIST) { + ut_ad(node->list.tail != NULL); + node = node->list.tail; + } + + ut_a(node->type == FTS_AST_TERM); + ut_a(!node->term.wildcard); + + node->term.wildcard = TRUE; +} + +/******************************************************************//** +Set the proximity attribute of a text node. */ +void +fts_ast_text_set_distance( +/*======================*/ + fts_ast_node_t* node, /*!< in/out: text node */ + ulint distance) /*!< in: the text proximity + distance */ +{ + if (node == NULL) { + return; + } + + ut_a(node->type == FTS_AST_TEXT); + ut_a(node->text.distance == ULINT_UNDEFINED); + + node->text.distance = distance; +} + +/******************************************************************//** +Free node and expr allocations. */ +void +fts_ast_state_free( +/*===============*/ + fts_ast_state_t*state) /*!< in: ast state to free */ +{ + fts_ast_node_t* node = state->list.head; + + /* Free the nodes that were allocated during parsing. */ + while (node) { + fts_ast_node_t* next = node->next_alloc; + + if (node->type == FTS_AST_TEXT && node->text.ptr) { + fts_ast_string_free(node->text.ptr); + node->text.ptr = NULL; + } else if (node->type == FTS_AST_TERM && node->term.ptr) { + fts_ast_string_free(node->term.ptr); + node->term.ptr = NULL; + } + + ut_free(node); + node = next; + } + + state->root = state->list.head = state->list.tail = NULL; +} + +/** Print the ast string +@param[in] str string to print */ +static +void +fts_ast_string_print( + const fts_ast_string_t* ast_str) +{ + for (ulint i = 0; i < ast_str->len; ++i) { + printf("%c", ast_str->str[i]); + } + + printf("\n"); +} + +/******************************************************************//** +Print an ast node recursively. */ +static +void +fts_ast_node_print_recursive( +/*=========================*/ + fts_ast_node_t* node, /*!< in: ast node to print */ + ulint level) /*!< in: recursive level */ +{ + /* Print alignment blank */ + for (ulint i = 0; i < level; i++) { + printf(" "); + } + + switch (node->type) { + case FTS_AST_TEXT: + printf("TEXT: "); + fts_ast_string_print(node->text.ptr); + break; + + case FTS_AST_TERM: + printf("TERM: "); + fts_ast_string_print(node->term.ptr); + break; + + case FTS_AST_LIST: + printf("LIST: \n"); + + for (node = node->list.head; node; node = node->next) { + fts_ast_node_print_recursive(node, level + 1); + } + break; + + case FTS_AST_SUBEXP_LIST: + printf("SUBEXP_LIST: \n"); + + for (node = node->list.head; node; node = node->next) { + fts_ast_node_print_recursive(node, level + 1); + } + break; + + case FTS_AST_OPER: + printf("OPER: %d\n", node->oper); + break; + + case FTS_AST_PARSER_PHRASE_LIST: + printf("PARSER_PHRASE_LIST: \n"); + + for (node = node->list.head; node; node = node->next) { + fts_ast_node_print_recursive(node, level + 1); + } + break; + + default: + ut_error; + } +} + +/******************************************************************//** +Print an ast node */ +void +fts_ast_node_print( +/*===============*/ + fts_ast_node_t* node) /*!< in: ast node to print */ +{ + fts_ast_node_print_recursive(node, 0); +} + +/** Check only union operation involved in the node +@param[in] node ast node to check +@return true if the node contains only union else false. */ +bool +fts_ast_node_check_union( + fts_ast_node_t* node) +{ + if (node->type == FTS_AST_LIST + || node->type == FTS_AST_SUBEXP_LIST) { + + for (node = node->list.head; node; node = node->next) { + if (!fts_ast_node_check_union(node)) { + return(false); + } + } + + } else if (node->type == FTS_AST_PARSER_PHRASE_LIST) { + /* Phrase search for plugin parser */ + return(false); + } else if (node->type == FTS_AST_OPER + && (node->oper == FTS_IGNORE + || node->oper == FTS_EXIST)) { + + return(false); + } else if (node->type == FTS_AST_TEXT) { + /* Distance or phrase search query. */ + return(false); + } + + return(true); +} + +/******************************************************************//** +Traverse the AST - in-order traversal, except for the FTX_EXIST and FTS_IGNORE +nodes, which will be ignored in the first pass of each level, and visited in a +second and third pass after all other nodes in the same level are visited. +@return DB_SUCCESS if all went well */ +dberr_t +fts_ast_visit( +/*==========*/ + fts_ast_oper_t oper, /*!< in: current operator */ + fts_ast_node_t* node, /*!< in: current root node */ + fts_ast_callback visitor, /*!< in: callback function */ + void* arg, /*!< in: arg for callback */ + bool* has_ignore) /*!< out: true, if the operator + was ignored during processing, + currently we ignore FTS_EXIST + and FTS_IGNORE operators */ +{ + dberr_t error = DB_SUCCESS; + fts_ast_node_t* oper_node = NULL; + fts_ast_node_t* start_node; + bool revisit = false; + bool will_be_ignored = false; + fts_ast_visit_pass_t visit_pass = FTS_PASS_FIRST; + const trx_t* trx = node->trx; + + start_node = node->list.head; + + ut_a(node->type == FTS_AST_LIST + || node->type == FTS_AST_SUBEXP_LIST); + + if (oper == FTS_EXIST_SKIP) { + visit_pass = FTS_PASS_EXIST; + } else if (oper == FTS_IGNORE_SKIP) { + visit_pass = FTS_PASS_IGNORE; + } + + /* In the first pass of the tree, at the leaf level of the + tree, FTS_EXIST and FTS_IGNORE operation will be ignored. + It will be repeated at the level above the leaf level. + + The basic idea here is that when we encounter FTS_EXIST or + FTS_IGNORE, we will change the operator node into FTS_EXIST_SKIP + or FTS_IGNORE_SKIP, and term node & text node with the operators + is ignored in the first pass. We have two passes during the revisit: + We process nodes with FTS_EXIST_SKIP in the exist pass, and then + process nodes with FTS_IGNORE_SKIP in the ignore pass. + + The order should be restrictly followed, or we will get wrong results. + For example, we have a query 'a +b -c d +e -f'. + first pass: process 'a' and 'd' by union; + exist pass: process '+b' and '+e' by intersection; + ignore pass: process '-c' and '-f' by difference. */ + + for (node = node->list.head; + node && (error == DB_SUCCESS); + node = node->next) { + + switch (node->type) { + case FTS_AST_LIST: + if (visit_pass != FTS_PASS_FIRST) { + break; + } + + error = fts_ast_visit(oper, node, visitor, + arg, &will_be_ignored); + + /* If will_be_ignored is set to true, then + we encountered and ignored a FTS_EXIST or FTS_IGNORE + operator. */ + if (will_be_ignored) { + revisit = true; + /* Remember oper for list in case '-abc&def', + ignored oper is from previous node of list.*/ + node->oper = oper; + } + + break; + + case FTS_AST_OPER: + oper = node->oper; + oper_node = node; + + /* Change the operator for revisit */ + if (oper == FTS_EXIST) { + oper_node->oper = FTS_EXIST_SKIP; + } else if (oper == FTS_IGNORE) { + oper_node->oper = FTS_IGNORE_SKIP; + } + + break; + + default: + if (node->visited) { + continue; + } + + ut_a(oper == FTS_NONE || !oper_node + || oper_node->oper == oper + || oper_node->oper == FTS_EXIST_SKIP + || oper_node->oper == FTS_IGNORE_SKIP); + + if (oper== FTS_EXIST || oper == FTS_IGNORE) { + *has_ignore = true; + continue; + } + + /* Process leaf node accroding to its pass.*/ + if (oper == FTS_EXIST_SKIP + && visit_pass == FTS_PASS_EXIST) { + error = visitor(FTS_EXIST, node, arg); + node->visited = true; + } else if (oper == FTS_IGNORE_SKIP + && visit_pass == FTS_PASS_IGNORE) { + error = visitor(FTS_IGNORE, node, arg); + node->visited = true; + } else if (visit_pass == FTS_PASS_FIRST) { + error = visitor(oper, node, arg); + node->visited = true; + } + } + } + + if (trx_is_interrupted(trx)) { + return DB_INTERRUPTED; + } + + if (revisit) { + /* Exist pass processes the skipped FTS_EXIST operation. */ + for (node = start_node; + node && error == DB_SUCCESS; + node = node->next) { + + if (node->type == FTS_AST_LIST + && node->oper != FTS_IGNORE) { + error = fts_ast_visit(FTS_EXIST_SKIP, node, + visitor, arg, &will_be_ignored); + } + } + + /* Ignore pass processes the skipped FTS_IGNORE operation. */ + for (node = start_node; + node && error == DB_SUCCESS; + node = node->next) { + + if (node->type == FTS_AST_LIST) { + error = fts_ast_visit(FTS_IGNORE_SKIP, node, + visitor, arg, &will_be_ignored); + } + } + } + + return(error); +} + +/** +Create an ast string object, with NUL-terminator, so the string +has one more byte than len +@param[in] str pointer to string +@param[in] len length of the string +@return ast string with NUL-terminator */ +fts_ast_string_t* +fts_ast_string_create( + const byte* str, + ulint len) +{ + fts_ast_string_t* ast_str; + + ut_ad(len > 0); + + ast_str = static_cast<fts_ast_string_t*>( + ut_malloc_nokey(sizeof(fts_ast_string_t))); + + ast_str->str = static_cast<byte*>(ut_malloc_nokey(len + 1)); + + ast_str->len = len; + memcpy(ast_str->str, str, len); + ast_str->str[len] = '\0'; + + return(ast_str); +} + +/** +Free an ast string instance +@param[in,out] ast_str string to free */ +void +fts_ast_string_free( + fts_ast_string_t* ast_str) +{ + if (ast_str != NULL) { + ut_free(ast_str->str); + ut_free(ast_str); + } +} + +/** +Translate ast string of type FTS_AST_NUMB to unsigned long by strtoul +@param[in] str string to translate +@param[in] base the base +@return translated number */ +ulint +fts_ast_string_to_ul( + const fts_ast_string_t* ast_str, + int base) +{ + return(strtoul(reinterpret_cast<const char*>(ast_str->str), + NULL, base)); +} + +#ifdef UNIV_DEBUG +const char* +fts_ast_node_type_get(fts_ast_type_t type) +{ + switch (type) { + case FTS_AST_OPER: + return("FTS_AST_OPER"); + case FTS_AST_NUMB: + return("FTS_AST_NUMB"); + case FTS_AST_TERM: + return("FTS_AST_TERM"); + case FTS_AST_TEXT: + return("FTS_AST_TEXT"); + case FTS_AST_LIST: + return("FTS_AST_LIST"); + case FTS_AST_SUBEXP_LIST: + return("FTS_AST_SUBEXP_LIST"); + case FTS_AST_PARSER_PHRASE_LIST: + return("FTS_AST_PARSER_PHRASE_LIST"); + } + ut_ad(0); + return("FTS_UNKNOWN"); +} +#endif /* UNIV_DEBUG */ |