diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000 |
commit | 3f619478f796eddbba6e39502fe941b285dd97b1 (patch) | |
tree | e2c7b5777f728320e5b5542b6213fd3591ba51e2 /storage/innobase/fts/fts0que.cc | |
parent | Initial commit. (diff) | |
download | mariadb-3f619478f796eddbba6e39502fe941b285dd97b1.tar.xz mariadb-3f619478f796eddbba6e39502fe941b285dd97b1.zip |
Adding upstream version 1:10.11.6.upstream/1%10.11.6upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/fts/fts0que.cc')
-rw-r--r-- | storage/innobase/fts/fts0que.cc | 4612 |
1 files changed, 4612 insertions, 0 deletions
diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc new file mode 100644 index 00000000..9c92a117 --- /dev/null +++ b/storage/innobase/fts/fts0que.cc @@ -0,0 +1,4612 @@ +/***************************************************************************** + +Copyright (c) 2007, 2020, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file fts/fts0que.cc +Full Text Search functionality. + +Created 2007/03/27 Sunny Bains +Completed 2011/7/10 Sunny and Jimmy Yang +*******************************************************/ + +#include "dict0dict.h" +#include "ut0rbt.h" +#include "row0sel.h" +#include "fts0fts.h" +#include "fts0priv.h" +#include "fts0ast.h" +#include "fts0pars.h" +#include "fts0types.h" +#include "fts0plugin.h" +#include "fts0vlc.h" + +#include <iomanip> +#include <vector> + +#define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)]) + +#define RANK_DOWNGRADE (-1.0F) +#define RANK_UPGRADE (1.0F) + +/* Maximum number of words supported in a phrase or proximity search. */ +#define MAX_PROXIMITY_ITEM 128 + +/* Memory used by rbt itself for create and node add */ +#define SIZEOF_RBT_CREATE sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2 +#define SIZEOF_RBT_NODE_ADD sizeof(ib_rbt_node_t) + +/*Initial byte length for 'words' in fts_ranking_t */ +#define RANKING_WORDS_INIT_LEN 4 + +// FIXME: Need to have a generic iterator that traverses the ilist. + +typedef std::vector<fts_string_t, ut_allocator<fts_string_t> > word_vector_t; + +struct fts_word_freq_t; + +/** State of an FTS query. */ +struct fts_query_t { + mem_heap_t* heap; /*!< Heap to use for allocations */ + + trx_t* trx; /*!< The query transaction */ + + dict_index_t* index; /*!< The FTS index to search */ + /*!< FTS auxiliary common table def */ + + fts_table_t fts_common_table; + + fts_table_t fts_index_table;/*!< FTS auxiliary index table def */ + + size_t total_size; /*!< total memory size used by query */ + + fts_doc_ids_t* deleted; /*!< Deleted doc ids that need to be + filtered from the output */ + + fts_ast_node_t* root; /*!< Abstract syntax tree */ + + fts_ast_node_t* cur_node; /*!< Current tree node */ + + ib_rbt_t* word_map; /*!< Matched word map for + searching by word*/ + + word_vector_t* word_vector; /*!< Matched word vector for + searching by index */ + + ib_rbt_t* doc_ids; /*!< The current set of matching + doc ids, elements are of + type fts_ranking_t */ + + ib_rbt_t* intersection; /*!< The doc ids that were found in + doc_ids, this tree will become + the new doc_ids, elements are of type + fts_ranking_t */ + + /*!< Prepared statement to read the + nodes from the FTS INDEX */ + que_t* read_nodes_graph; + + fts_ast_oper_t oper; /*!< Current boolean mode operator */ + + /*!< TRUE if we want to collect the + word positions within the document */ + ibool collect_positions; + + ulint flags; /*!< Specify the full text search type, + such as boolean search, phrase + search, proximity search etc. */ + + ulint distance; /*!< The proximity distance of a + phrase search. */ + + /*!< These doc ids are used as a + boundary condition when searching the + FTS index rows */ + + doc_id_t lower_doc_id; /*!< Lowest doc id in doc_ids */ + + doc_id_t upper_doc_id; /*!< Highest doc id in doc_ids */ + + bool boolean_mode; /*!< TRUE if boolean mode query */ + + ib_vector_t* matched; /*!< Array of matching documents + (fts_match_t) to search for a phrase */ + + ib_vector_t** match_array; /*!< Used for proximity search, contains + position info for each matched word + in the word list */ + + ib_uint64_t total_docs; /*!< The total number of documents */ + + ulint total_words; /*!< The total number of words */ + + dberr_t error; /*!< Error code if any, that is + encountered during query processing */ + + ib_rbt_t* word_freqs; /*!< RB tree of word frequencies per + document, its elements are of type + fts_word_freq_t */ + + ib_rbt_t* wildcard_words; /*!< words with wildcard */ + + bool multi_exist; /*!< multiple FTS_EXIST oper */ + byte visiting_sub_exp; /*!< count of nested + fts_ast_visit_sub_exp() */ + + st_mysql_ftparser* parser; /*!< fts plugin parser */ +}; + +/** For phrase matching, first we collect the documents and the positions +then we match. */ +struct fts_match_t { + doc_id_t doc_id; /*!< Document id */ + + ulint start; /*!< Start the phrase match from + this offset within the positions + vector. */ + + ib_vector_t* positions; /*!< Offsets of a word in a + document */ +}; + +/** For matching tokens in a phrase search. We use this data structure in +the callback that determines whether a document should be accepted or +rejected for a phrase search. */ +struct fts_select_t { + doc_id_t doc_id; /*!< The document id to match */ + + ulint min_pos; /*!< For found to be TRUE at least + one position must be greater than + min_pos. */ + + ibool found; /*!< TRUE if found */ + + fts_word_freq_t* + word_freq; /*!< Word frequency instance of the + current word being looked up in + the FTS index */ +}; + +typedef std::vector<ulint, ut_allocator<ulint> > pos_vector_t; + +/** structure defines a set of ranges for original documents, each of which +has a minimum position and maximum position. Text in such range should +contain all words in the proximity search. We will need to count the +words in such range to make sure it is less than the specified distance +of the proximity search */ +struct fts_proximity_t { + ulint n_pos; /*!< number of position set, defines + a range (min to max) containing all + matching words */ + pos_vector_t min_pos; /*!< the minimum position (in bytes) + of the range */ + pos_vector_t max_pos; /*!< the maximum position (in bytes) + of the range */ +}; + +/** The match positions and tokesn to match */ +struct fts_phrase_t { + fts_phrase_t(const dict_table_t* table) + : + found(false), + match(NULL), + tokens(NULL), + distance(0), + charset(NULL), + heap(NULL), + zip_size(table->space->zip_size()), + proximity_pos(NULL), + parser(NULL) + { + } + + /** Match result */ + ibool found; + + /** Positions within text */ + const fts_match_t* match; + + /** Tokens to match */ + const ib_vector_t* tokens; + + /** For matching on proximity distance. Can be 0 for exact match */ + ulint distance; + + /** Phrase match charset */ + CHARSET_INFO* charset; + + /** Heap for word processing */ + mem_heap_t* heap; + + /** ROW_FORMAT=COMPRESSED page size, or 0 */ + const ulint zip_size; + + /** Position info for proximity search verification. Records the + min and max position of words matched */ + fts_proximity_t* proximity_pos; + + /** FTS plugin parser */ + st_mysql_ftparser* parser; +}; + +/** Paramter passed to fts phrase match by parser */ +struct fts_phrase_param_t { + fts_phrase_t* phrase; /*!< Match phrase instance */ + ulint token_index; /*!< Index of token to match next */ + mem_heap_t* heap; /*!< Heap for word processing */ +}; + +/** For storing the frequncy of a word/term in a document */ +struct fts_doc_freq_t { + doc_id_t doc_id; /*!< Document id */ + ulint freq; /*!< Frequency of a word in a document */ +}; + +/** To determine the word frequency per document. */ +struct fts_word_freq_t { + fts_string_t word; /*!< Word for which we need the freq, + it's allocated on the query heap */ + + ib_rbt_t* doc_freqs; /*!< RB Tree for storing per document + word frequencies. The elements are + of type fts_doc_freq_t */ + ib_uint64_t doc_count; /*!< Total number of documents that + contain this word */ + double idf; /*!< Inverse document frequency */ +}; + +/******************************************************************** +Callback function to fetch the rows in an FTS INDEX record. +@return always TRUE */ +static +ibool +fts_query_index_fetch_nodes( +/*========================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg); /*!< in: pointer to ib_vector_t */ + +/******************************************************************** +Read and filter nodes. +@return fts_node_t instance */ +static +dberr_t +fts_query_filter_doc_ids( +/*=====================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_string_t* word, /*!< in: the current word */ + fts_word_freq_t* word_freq, /*!< in/out: word frequency */ + const fts_node_t* node, /*!< in: current FTS node */ + void* data, /*!< in: doc id ilist */ + ulint len, /*!< in: doc id ilist size */ + ibool calc_doc_count);/*!< in: whether to remember doc + count */ + +/** Process (nested) sub-expression, create a new result set to store the +sub-expression result by processing nodes under current sub-expression +list. Merge the sub-expression result with that of parent expression list. +@param[in,out] node current root node +@param[in,out] visitor callback function +@param[in,out] arg argument for callback +@return DB_SUCCESS if all go well */ +static +dberr_t +fts_ast_visit_sub_exp( + fts_ast_node_t* node, + fts_ast_callback visitor, + void* arg); + +#if 0 +/*****************************************************************//*** +Find a doc_id in a word's ilist. +@return TRUE if found. */ +static +ibool +fts_query_find_doc_id( +/*==================*/ + fts_select_t* select, /*!< in/out: search the doc id selected, + update the frequency if found. */ + void* data, /*!< in: doc id ilist */ + ulint len); /*!< in: doc id ilist size */ +#endif + +/*************************************************************//** +This function implements a simple "blind" query expansion search: +words in documents found in the first search pass will be used as +search arguments to search the document again, thus "expand" +the search result set. +@return DB_SUCCESS if success, otherwise the error code */ +static +dberr_t +fts_expand_query( +/*=============*/ + dict_index_t* index, /*!< in: FTS index to search */ + fts_query_t* query) /*!< in: query result, to be freed + by the client */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*************************************************************//** +This function finds documents that contain all words in a +phrase or proximity search. And if proximity search, verify +the words are close enough to each other, as in specified distance. +This function is called for phrase and proximity search. +@return TRUE if documents are found, FALSE if otherwise */ +static +ibool +fts_phrase_or_proximity_search( +/*===========================*/ + fts_query_t* query, /*!< in/out: query instance + query->doc_ids might be instantiated + with qualified doc IDs */ + ib_vector_t* tokens); /*!< in: Tokens contain words */ +/*************************************************************//** +This function checks whether words in result documents are close to +each other (within proximity range as specified by "distance"). +If "distance" is MAX_ULINT, then it will find all combinations of +positions of matching words and store min and max positions +in the "qualified_pos" for later verification. +@return true if words are close to each other, false if otherwise */ +static +bool +fts_proximity_get_positions( +/*========================*/ + fts_match_t** match, /*!< in: query instance */ + ulint num_match, /*!< in: number of matching + items */ + ulint distance, /*!< in: distance value + for proximity search */ + fts_proximity_t* qualified_pos); /*!< out: the position info + records ranges containing + all matching words. */ +#if 0 +/******************************************************************** +Get the total number of words in a documents. */ +static +ulint +fts_query_terms_in_document( +/*========================*/ + /*!< out: DB_SUCCESS if all go well + else error code */ + fts_query_t* query, /*!< in: FTS query state */ + doc_id_t doc_id, /*!< in: the word to check */ + ulint* total); /*!< out: total words in document */ +#endif + +/******************************************************************** +Compare two fts_doc_freq_t doc_ids. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_freq_doc_id_cmp( +/*================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const fts_doc_freq_t* fq1 = (const fts_doc_freq_t*) p1; + const fts_doc_freq_t* fq2 = (const fts_doc_freq_t*) p2; + + return((int) (fq1->doc_id - fq2->doc_id)); +} + +#if 0 +/*******************************************************************//** +Print the table used for calculating LCS. */ +static +void +fts_print_lcs_table( +/*================*/ + const ulint* table, /*!< in: array to print */ + ulint n_rows, /*!< in: total no. of rows */ + ulint n_cols) /*!< in: total no. of cols */ +{ + ulint i; + + for (i = 0; i < n_rows; ++i) { + ulint j; + + printf("\n"); + + for (j = 0; j < n_cols; ++j) { + + printf("%2lu ", FTS_ELEM(table, n_cols, i, j)); + } + } +} + +/******************************************************************** +Find the longest common subsequence between the query string and +the document. */ +static +ulint +fts_query_lcs( +/*==========*/ + /*!< out: LCS (length) between + two ilists */ + const ulint* p1, /*!< in: word positions of query */ + ulint len_p1, /*!< in: no. of elements in p1 */ + const ulint* p2, /*!< in: word positions within document */ + ulint len_p2) /*!< in: no. of elements in p2 */ +{ + int i; + ulint len = 0; + ulint r = len_p1; + ulint c = len_p2; + ulint size = (r + 1) * (c + 1) * sizeof(ulint); + ulint* table = (ulint*) ut_malloc_nokey(size); + + /* Traverse the table backwards, from the last row to the first and + also from the last column to the first. We compute the smaller + common subsequeces first, then use the caluclated values to determine + the longest common subsequence. The result will be in TABLE[0][0]. */ + for (i = r; i >= 0; --i) { + int j; + + for (j = c; j >= 0; --j) { + + if (p1[i] == (ulint) -1 || p2[j] == (ulint) -1) { + + FTS_ELEM(table, c, i, j) = 0; + + } else if (p1[i] == p2[j]) { + + FTS_ELEM(table, c, i, j) = FTS_ELEM( + table, c, i + 1, j + 1) + 1; + + } else { + + ulint value; + + value = ut_max( + FTS_ELEM(table, c, i + 1, j), + FTS_ELEM(table, c, i, j + 1)); + + FTS_ELEM(table, c, i, j) = value; + } + } + } + + len = FTS_ELEM(table, c, 0, 0); + + fts_print_lcs_table(table, r, c); + printf("\nLen=" ULINTPF "\n", len); + + ut_free(table); + + return(len); +} +#endif + +/*******************************************************************//** +Compare two fts_ranking_t instance on their rank value and doc ids in +descending order on the rank and ascending order on doc id. +@return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */ +static +int +fts_query_compare_rank( +/*===================*/ + const void* p1, /*!< in: pointer to elem */ + const void* p2) /*!< in: pointer to elem */ +{ + const fts_ranking_t* r1 = (const fts_ranking_t*) p1; + const fts_ranking_t* r2 = (const fts_ranking_t*) p2; + + if (r2->rank < r1->rank) { + return(-1); + } else if (r2->rank == r1->rank) { + + if (r1->doc_id < r2->doc_id) { + return(1); + } else if (r1->doc_id > r2->doc_id) { + return(1); + } + + return(0); + } + + return(1); +} + +/*******************************************************************//** +Create words in ranking */ +static +void +fts_ranking_words_create( +/*=====================*/ + fts_query_t* query, /*!< in: query instance */ + fts_ranking_t* ranking) /*!< in: ranking instance */ +{ + ranking->words = static_cast<byte*>( + mem_heap_zalloc(query->heap, RANKING_WORDS_INIT_LEN)); + ranking->words_len = RANKING_WORDS_INIT_LEN; +} + +/* +The optimization here is using a char array(bitmap) to replace words rb tree +in fts_ranking_t. + +It can save lots of memory except in some cases of QUERY EXPANSION. + +'word_map' is used as a word dictionary, in which the key is a word, the value +is a number. In 'fts_ranking_words_add', we first check if the word is in 'word_map'. +if not, we add it into 'word_map', and give it a position(actually a number). +then we set the corresponding bit to '1' at the position in the char array 'words'. + +'word_vector' is a useful backup of 'word_map', and we can get a word by its position, +more quickly than searching by value in 'word_map'. we use 'word_vector' +in 'fts_query_calculate_ranking' and 'fts_expand_query'. In the two functions, we need +to scan the bitmap 'words', and get a word when a bit is '1', then we get word_freq +by the word. +*/ + +/*******************************************************************//** +Add a word into ranking */ +static +void +fts_ranking_words_add( +/*==================*/ + fts_query_t* query, /*!< in: query instance */ + fts_ranking_t* ranking, /*!< in: ranking instance */ + const fts_string_t* word) /*!< in: term/word to add */ +{ + ulint pos; + ulint byte_offset; + ulint bit_offset; + ib_rbt_bound_t parent; + + /* Note: we suppose the word map and vector are append-only. */ + ut_ad(query->word_vector->size() == rbt_size(query->word_map)); + + /* We use ib_rbt to simulate a map, f_n_char means position. */ + if (rbt_search(query->word_map, &parent, word) == 0) { + fts_string_t* result_word; + + result_word = rbt_value(fts_string_t, parent.last); + pos = result_word->f_n_char; + ut_ad(pos < rbt_size(query->word_map)); + } else { + /* Add the word to map. */ + fts_string_t new_word; + + pos = rbt_size(query->word_map); + + fts_string_dup(&new_word, word, query->heap); + new_word.f_n_char = pos; + + rbt_add_node(query->word_map, &parent, &new_word); + ut_ad(rbt_validate(query->word_map)); + query->word_vector->push_back(new_word); + } + + /* Check words len */ + byte_offset = pos / CHAR_BIT; + if (byte_offset >= ranking->words_len) { + byte* words = ranking->words; + ulint words_len = ranking->words_len; + + while (byte_offset >= words_len) { + words_len *= 2; + } + + ranking->words = static_cast<byte*>( + mem_heap_zalloc(query->heap, words_len)); + memcpy(ranking->words, words, ranking->words_len); + ranking->words_len = words_len; + } + + /* Set ranking words */ + ut_ad(byte_offset < ranking->words_len); + bit_offset = pos % CHAR_BIT; + ranking->words[byte_offset] = static_cast<byte>( + ranking->words[byte_offset] | 1 << bit_offset); +} + +/*******************************************************************//** +Get a word from a ranking +@return true if it's successful */ +static +bool +fts_ranking_words_get_next( +/*=======================*/ + const fts_query_t* query, /*!< in: query instance */ + fts_ranking_t* ranking,/*!< in: ranking instance */ + ulint* pos, /*!< in/out: word start pos */ + fts_string_t* word) /*!< in/out: term/word to add */ +{ + bool ret = false; + ulint max_pos = ranking->words_len * CHAR_BIT; + + /* Search for next word */ + while (*pos < max_pos) { + ulint byte_offset = *pos / CHAR_BIT; + ulint bit_offset = *pos % CHAR_BIT; + + if (ranking->words[byte_offset] & (1 << bit_offset)) { + ret = true; + break; + } + + *pos += 1; + }; + + /* Get next word from word vector */ + if (ret) { + ut_ad(*pos < query->word_vector->size()); + *word = query->word_vector->at((size_t)*pos); + *pos += 1; + } + + return ret; +} + +/*******************************************************************//** +Add a word if it doesn't exist, to the term freq RB tree. We store +a pointer to the word that is passed in as the argument. +@return pointer to word */ +static +fts_word_freq_t* +fts_query_add_word_freq( +/*====================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_string_t* word) /*!< in: term/word to add */ +{ + ib_rbt_bound_t parent; + + /* Lookup the word in our rb tree and add if it doesn't exist. */ + if (rbt_search(query->word_freqs, &parent, word) != 0) { + fts_word_freq_t word_freq; + + memset(&word_freq, 0, sizeof(word_freq)); + + fts_string_dup(&word_freq.word, word, query->heap); + + word_freq.doc_count = 0; + + word_freq.doc_freqs = rbt_create( + sizeof(fts_doc_freq_t), fts_freq_doc_id_cmp); + + parent.last = rbt_add_node( + query->word_freqs, &parent, &word_freq); + + query->total_size += word->f_len + + SIZEOF_RBT_CREATE + + SIZEOF_RBT_NODE_ADD + + sizeof(fts_word_freq_t); + } + + return(rbt_value(fts_word_freq_t, parent.last)); +} + +/*******************************************************************//** +Add a doc id if it doesn't exist, to the doc freq RB tree. +@return pointer to word */ +static +fts_doc_freq_t* +fts_query_add_doc_freq( +/*===================*/ + fts_query_t* query, /*!< in: query instance */ + ib_rbt_t* doc_freqs, /*!< in: rb tree of fts_doc_freq_t */ + doc_id_t doc_id) /*!< in: doc id to add */ +{ + ib_rbt_bound_t parent; + + /* Lookup the doc id in our rb tree and add if it doesn't exist. */ + if (rbt_search(doc_freqs, &parent, &doc_id) != 0) { + fts_doc_freq_t doc_freq; + + memset(&doc_freq, 0, sizeof(doc_freq)); + + doc_freq.freq = 0; + doc_freq.doc_id = doc_id; + + parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq); + + query->total_size += SIZEOF_RBT_NODE_ADD + + sizeof(fts_doc_freq_t); + } + + return(rbt_value(fts_doc_freq_t, parent.last)); +} + +/*******************************************************************//** +Add the doc id to the query set only if it's not in the +deleted array. */ +static +void +fts_query_union_doc_id( +/*===================*/ + fts_query_t* query, /*!< in: query instance */ + doc_id_t doc_id, /*!< in: the doc id to add */ + fts_rank_t rank) /*!< in: if non-zero, it is the + rank associated with the doc_id */ +{ + ib_rbt_bound_t parent; + ulint size = ib_vector_size(query->deleted->doc_ids); + doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data; + + /* Check if the doc id is deleted and it's not already in our set. */ + if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0 + && rbt_search(query->doc_ids, &parent, &doc_id) != 0) { + + fts_ranking_t ranking; + + ranking.rank = rank; + ranking.doc_id = doc_id; + fts_ranking_words_create(query, &ranking); + + rbt_add_node(query->doc_ids, &parent, &ranking); + + query->total_size += SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t) + RANKING_WORDS_INIT_LEN; + } +} + +/*******************************************************************//** +Remove the doc id from the query set only if it's not in the +deleted set. */ +static +void +fts_query_remove_doc_id( +/*====================*/ + fts_query_t* query, /*!< in: query instance */ + doc_id_t doc_id) /*!< in: the doc id to add */ +{ + ib_rbt_bound_t parent; + ulint size = ib_vector_size(query->deleted->doc_ids); + doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data; + + /* Check if the doc id is deleted and it's in our set. */ + if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0 + && rbt_search(query->doc_ids, &parent, &doc_id) == 0) { + ut_free(rbt_remove_node(query->doc_ids, parent.last)); + + ut_ad(query->total_size >= + SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t)); + query->total_size -= SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t); + } +} + +/*******************************************************************//** +Find the doc id in the query set but not in the deleted set, artificialy +downgrade or upgrade its ranking by a value and make/initialize its ranking +under or above its normal range 0 to 1. This is used for Boolean Search +operator such as Negation operator, which makes word's contribution to the +row's relevance to be negative */ +static +void +fts_query_change_ranking( +/*====================*/ + fts_query_t* query, /*!< in: query instance */ + doc_id_t doc_id, /*!< in: the doc id to add */ + ibool downgrade) /*!< in: Whether to downgrade ranking */ +{ + ib_rbt_bound_t parent; + ulint size = ib_vector_size(query->deleted->doc_ids); + doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data; + + /* Check if the doc id is deleted and it's in our set. */ + if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0 + && rbt_search(query->doc_ids, &parent, &doc_id) == 0) { + + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, parent.last); + + ranking->rank += downgrade ? RANK_DOWNGRADE : RANK_UPGRADE; + + /* Allow at most 2 adjustment by RANK_DOWNGRADE (-0.5) + and RANK_UPGRADE (0.5) */ + if (ranking->rank >= 1.0F) { + ranking->rank = 1.0F; + } else if (ranking->rank <= -1.0F) { + ranking->rank = -1.0F; + } + } +} + +/*******************************************************************//** +Check the doc id in the query set only if it's not in the +deleted array. The doc ids that were found are stored in +another rb tree (fts_query_t::intersect). */ +static +void +fts_query_intersect_doc_id( +/*=======================*/ + fts_query_t* query, /*!< in: query instance */ + doc_id_t doc_id, /*!< in: the doc id to add */ + fts_rank_t rank) /*!< in: if non-zero, it is the + rank associated with the doc_id */ +{ + ib_rbt_bound_t parent; + ulint size = ib_vector_size(query->deleted->doc_ids); + doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data; + fts_ranking_t* ranking= NULL; + + /* There are three types of intersect: + 1. '+a': doc_ids is empty, add doc into intersect if it matches 'a'. + 2. 'a +b': docs match 'a' is in doc_ids, add doc into intersect + if it matches 'b'. if the doc is also in doc_ids, then change the + doc's rank, and add 'a' in doc's words. + 3. '+a +b': docs matching '+a' is in doc_ids, add doc into intsersect + if it matches 'b' and it's in doc_ids.(multi_exist = true). */ + + /* Check if the doc id is deleted and it's in our set */ + if (fts_bsearch(updates, 0, static_cast<int>(size), doc_id) < 0) { + fts_ranking_t new_ranking; + + if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) { + if (query->multi_exist) { + return; + } else { + new_ranking.words = NULL; + } + } else { + ranking = rbt_value(fts_ranking_t, parent.last); + + /* We've just checked the doc id before */ + if (ranking->words == NULL) { + ut_ad(rbt_search(query->intersection, &parent, + ranking) == 0); + return; + } + + /* Merge rank */ + rank += ranking->rank; + if (rank >= 1.0F) { + rank = 1.0F; + } else if (rank <= -1.0F) { + rank = -1.0F; + } + + /* Take words */ + new_ranking.words = ranking->words; + new_ranking.words_len = ranking->words_len; + } + + new_ranking.rank = rank; + new_ranking.doc_id = doc_id; + + if (rbt_search(query->intersection, &parent, + &new_ranking) != 0) { + if (new_ranking.words == NULL) { + fts_ranking_words_create(query, &new_ranking); + + query->total_size += RANKING_WORDS_INIT_LEN; + } else { + /* Note that the intersection has taken + ownership of the ranking data. */ + ranking->words = NULL; + } + + rbt_add_node(query->intersection, + &parent, &new_ranking); + + query->total_size += SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t); + } + } +} + +/*******************************************************************//** +Free the document ranking rb tree. */ +static +void +fts_query_free_doc_ids( +/*===================*/ + fts_query_t* query, /*!< in: query instance */ + ib_rbt_t* doc_ids) /*!< in: rb tree to free */ +{ + const ib_rbt_node_t* node; + + for (node = rbt_first(doc_ids); node; node = rbt_first(doc_ids)) { + + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, node); + + if (ranking->words) { + ranking->words = NULL; + } + + ut_free(rbt_remove_node(doc_ids, node)); + + ut_ad(query->total_size >= + SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t)); + query->total_size -= SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t); + } + + rbt_free(doc_ids); + + ut_ad(query->total_size >= SIZEOF_RBT_CREATE); + query->total_size -= SIZEOF_RBT_CREATE; +} + +/*******************************************************************//** +Add the word to the documents "list" of matching words from +the query. We make a copy of the word from the query heap. */ +static +void +fts_query_add_word_to_document( +/*===========================*/ + fts_query_t* query, /*!< in: query to update */ + doc_id_t doc_id, /*!< in: the document to update */ + const fts_string_t* word) /*!< in: the token to add */ +{ + ib_rbt_bound_t parent; + fts_ranking_t* ranking = NULL; + + if (query->flags == FTS_OPT_RANKING) { + return; + } + + /* First we search the intersection RB tree as it could have + taken ownership of the words rb tree instance. */ + if (query->intersection + && rbt_search(query->intersection, &parent, &doc_id) == 0) { + + ranking = rbt_value(fts_ranking_t, parent.last); + } + + if (ranking == NULL + && rbt_search(query->doc_ids, &parent, &doc_id) == 0) { + + ranking = rbt_value(fts_ranking_t, parent.last); + } + + if (ranking != NULL) { + fts_ranking_words_add(query, ranking, word); + } +} + +/*******************************************************************//** +Check the node ilist. */ +static +void +fts_query_check_node( +/*=================*/ + fts_query_t* query, /*!< in: query to update */ + const fts_string_t* token, /*!< in: the token to search */ + const fts_node_t* node) /*!< in: node to check */ +{ + /* Skip nodes whose doc ids are out range. */ + if (query->oper == FTS_EXIST + && ((query->upper_doc_id > 0 + && node->first_doc_id > query->upper_doc_id) + || (query->lower_doc_id > 0 + && node->last_doc_id < query->lower_doc_id))) { + + /* Ignore */ + + } else { + int ret; + ib_rbt_bound_t parent; + ulint ilist_size = node->ilist_size; + fts_word_freq_t*word_freqs; + + /* The word must exist. */ + ret = rbt_search(query->word_freqs, &parent, token); + ut_a(ret == 0); + + word_freqs = rbt_value(fts_word_freq_t, parent.last); + + query->error = fts_query_filter_doc_ids( + query, token, word_freqs, node, + node->ilist, ilist_size, TRUE); + } +} + +/*****************************************************************//** +Search index cache for word with wildcard match. +@return number of words matched */ +static +ulint +fts_cache_find_wildcard( +/*====================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_index_cache_t*index_cache, /*!< in: cache to search */ + const fts_string_t* token) /*!< in: token to search */ +{ + ib_rbt_bound_t parent; + const ib_vector_t* nodes = NULL; + fts_string_t srch_text; + byte term[FTS_MAX_WORD_LEN + 1]; + ulint num_word = 0; + + srch_text.f_len = (token->f_str[token->f_len - 1] == '%') + ? token->f_len - 1 + : token->f_len; + + strncpy((char*) term, (char*) token->f_str, srch_text.f_len); + term[srch_text.f_len] = '\0'; + srch_text.f_str = term; + + /* Lookup the word in the rb tree */ + if (rbt_search_cmp(index_cache->words, &parent, &srch_text, NULL, + innobase_fts_text_cmp_prefix) == 0) { + const fts_tokenizer_word_t* word; + ulint i; + const ib_rbt_node_t* cur_node; + ibool forward = FALSE; + + word = rbt_value(fts_tokenizer_word_t, parent.last); + cur_node = parent.last; + + while (innobase_fts_text_cmp_prefix( + index_cache->charset, &srch_text, &word->text) == 0) { + + nodes = word->nodes; + + for (i = 0; nodes && i < ib_vector_size(nodes); ++i) { + int ret; + const fts_node_t* node; + ib_rbt_bound_t freq_parent; + fts_word_freq_t* word_freqs; + + node = static_cast<const fts_node_t*>( + ib_vector_get_const(nodes, i)); + + ret = rbt_search(query->word_freqs, + &freq_parent, + &srch_text); + + ut_a(ret == 0); + + word_freqs = rbt_value( + fts_word_freq_t, + freq_parent.last); + + query->error = fts_query_filter_doc_ids( + query, &srch_text, + word_freqs, node, + node->ilist, node->ilist_size, TRUE); + + if (query->error != DB_SUCCESS) { + return(0); + } + } + + num_word++; + + if (!forward) { + cur_node = rbt_prev( + index_cache->words, cur_node); + } else { +cont_search: + cur_node = rbt_next( + index_cache->words, cur_node); + } + + if (!cur_node) { + break; + } + + word = rbt_value(fts_tokenizer_word_t, cur_node); + } + + if (!forward) { + forward = TRUE; + cur_node = parent.last; + goto cont_search; + } + } + + return(num_word); +} + +/*****************************************************************//** +Set difference. +@return DB_SUCCESS if all go well */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_difference( +/*=================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_string_t* token) /*!< in: token to search */ +{ + ulint n_doc_ids= 0; + trx_t* trx = query->trx; + dict_table_t* table = query->index->table; + + ut_a(query->oper == FTS_IGNORE); + +#ifdef FTS_INTERNAL_DIAG_PRINT + { + ib::info out; + out << "DIFFERENCE: Searching: '"; + out.write(token->f_str, token->f_len); + out << "'"; + } +#endif + + if (query->doc_ids) { + n_doc_ids = rbt_size(query->doc_ids); + } + + /* There is nothing we can substract from an empty set. */ + if (query->doc_ids && !rbt_empty(query->doc_ids)) { + ulint i; + fts_fetch_t fetch; + const ib_vector_t* nodes; + const fts_index_cache_t*index_cache; + que_t* graph = NULL; + fts_cache_t* cache = table->fts->cache; + dberr_t error; + + mysql_mutex_lock(&cache->lock); + + index_cache = fts_find_index_cache(cache, query->index); + + /* Must find the index cache */ + ut_a(index_cache != NULL); + + /* Search the cache for a matching word first. */ + if (query->cur_node->term.wildcard + && query->flags != FTS_PROXIMITY + && query->flags != FTS_PHRASE) { + fts_cache_find_wildcard(query, index_cache, token); + } else { + nodes = fts_cache_find_word(index_cache, token); + + for (i = 0; nodes && i < ib_vector_size(nodes) + && query->error == DB_SUCCESS; ++i) { + const fts_node_t* node; + + node = static_cast<const fts_node_t*>( + ib_vector_get_const(nodes, i)); + + fts_query_check_node(query, token, node); + } + } + + mysql_mutex_unlock(&cache->lock); + + /* error is passed by 'query->error' */ + if (query->error != DB_SUCCESS) { + ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + return(query->error); + } + + /* Setup the callback args for filtering and + consolidating the ilist. */ + fetch.read_arg = query; + fetch.read_record = fts_query_index_fetch_nodes; + + error = fts_index_fetch_nodes( + trx, &graph, &query->fts_index_table, token, &fetch); + + /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ + ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); + if (error != DB_SUCCESS) { + query->error = error; + } + + que_graph_free(graph); + } + + /* The size can't increase. */ + ut_a(rbt_size(query->doc_ids) <= n_doc_ids); + + return(query->error); +} + +/* Free the query intersection +@param query query instance */ +static void fts_query_free_intersection(fts_query_t* query) +{ + fts_query_free_doc_ids(query, query->intersection); + query->intersection = NULL; +} + +/*****************************************************************//** +Intersect the token doc ids with the current set. +@return DB_SUCCESS if all go well */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_intersect( +/*================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_string_t* token) /*!< in: the token to search */ +{ + trx_t* trx = query->trx; + dict_table_t* table = query->index->table; + + ut_a(query->oper == FTS_EXIST); + +#ifdef FTS_INTERNAL_DIAG_PRINT + { + ib::info out; + out << "INTERSECT: Searching: '"; + out.write(token->f_str, token->f_len); + out << "'"; + } +#endif + + /* If the words set is not empty and multi exist is true, + we know the intersection set is empty in advance. */ + if (!(rbt_empty(query->doc_ids) && query->multi_exist)) { + ulint n_doc_ids = 0; + ulint i; + fts_fetch_t fetch; + const ib_vector_t* nodes; + const fts_index_cache_t*index_cache; + que_t* graph = NULL; + fts_cache_t* cache = table->fts->cache; + dberr_t error; + + ut_a(!query->intersection); + + n_doc_ids = rbt_size(query->doc_ids); + + /* Create the rb tree that will hold the doc ids of + the intersection. */ + query->intersection = rbt_create( + sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + + query->total_size += SIZEOF_RBT_CREATE; + + /* This is to avoid decompressing the ilist if the + node's ilist doc ids are out of range. */ + if (!rbt_empty(query->doc_ids) && query->multi_exist) { + const ib_rbt_node_t* node; + doc_id_t* doc_id; + + node = rbt_first(query->doc_ids); + doc_id = rbt_value(doc_id_t, node); + query->lower_doc_id = *doc_id; + + node = rbt_last(query->doc_ids); + doc_id = rbt_value(doc_id_t, node); + query->upper_doc_id = *doc_id; + + } else { + query->lower_doc_id = 0; + query->upper_doc_id = 0; + } + + /* Search the cache for a matching word first. */ + + mysql_mutex_lock(&cache->lock); + + /* Search for the index specific cache. */ + index_cache = fts_find_index_cache(cache, query->index); + + /* Must find the index cache. */ + ut_a(index_cache != NULL); + + if (query->cur_node->term.wildcard) { + /* Wildcard search the index cache */ + fts_cache_find_wildcard(query, index_cache, token); + } else { + nodes = fts_cache_find_word(index_cache, token); + + for (i = 0; nodes && i < ib_vector_size(nodes) + && query->error == DB_SUCCESS; ++i) { + const fts_node_t* node; + + node = static_cast<const fts_node_t*>( + ib_vector_get_const(nodes, i)); + + fts_query_check_node(query, token, node); + } + } + + mysql_mutex_unlock(&cache->lock); + + /* error is passed by 'query->error' */ + if (query->error != DB_SUCCESS) { + ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + fts_query_free_intersection(query); + return(query->error); + } + + /* Setup the callback args for filtering and + consolidating the ilist. */ + fetch.read_arg = query; + fetch.read_record = fts_query_index_fetch_nodes; + + error = fts_index_fetch_nodes( + trx, &graph, &query->fts_index_table, token, &fetch); + + /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ + ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); + if (error != DB_SUCCESS) { + query->error = error; + } + + que_graph_free(graph); + + if (query->error == DB_SUCCESS) { + /* Make the intesection (rb tree) the current doc id + set and free the old set. */ + fts_query_free_doc_ids(query, query->doc_ids); + query->doc_ids = query->intersection; + query->intersection = NULL; + + ut_a(!query->multi_exist || (query->multi_exist + && rbt_size(query->doc_ids) <= n_doc_ids)); + } else if (query->intersection) { + fts_query_free_intersection(query); + } + } + + return(query->error); +} + +/*****************************************************************//** +Query index cache. +@return DB_SUCCESS if all go well */ +static +dberr_t +fts_query_cache( +/*============*/ + fts_query_t* query, /*!< in/out: query instance */ + const fts_string_t* token) /*!< in: token to search */ +{ + const fts_index_cache_t*index_cache; + dict_table_t* table = query->index->table; + fts_cache_t* cache = table->fts->cache; + + /* Search the cache for a matching word first. */ + mysql_mutex_lock(&cache->lock); + + /* Search for the index specific cache. */ + index_cache = fts_find_index_cache(cache, query->index); + + /* Must find the index cache. */ + ut_a(index_cache != NULL); + + if (query->cur_node->term.wildcard + && query->flags != FTS_PROXIMITY + && query->flags != FTS_PHRASE) { + /* Wildcard search the index cache */ + fts_cache_find_wildcard(query, index_cache, token); + } else { + const ib_vector_t* nodes; + ulint i; + + nodes = fts_cache_find_word(index_cache, token); + + for (i = 0; nodes && i < ib_vector_size(nodes) + && query->error == DB_SUCCESS; ++i) { + const fts_node_t* node; + + node = static_cast<const fts_node_t*>( + ib_vector_get_const(nodes, i)); + + fts_query_check_node(query, token, node); + } + } + + mysql_mutex_unlock(&cache->lock); + + return(query->error); +} + +/*****************************************************************//** +Set union. +@return DB_SUCCESS if all go well */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_union( +/*============*/ + fts_query_t* query, /*!< in: query instance */ + fts_string_t* token) /*!< in: token to search */ +{ + fts_fetch_t fetch; + ulint n_doc_ids = 0; + trx_t* trx = query->trx; + que_t* graph = NULL; + dberr_t error; + + ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING || + query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING); + +#ifdef FTS_INTERNAL_DIAG_PRINT + { + ib::info out; + out << "UNION: Searching: '"; + out.write(token->f_str, token->f_len); + out << "'"; + } +#endif + + if (query->doc_ids) { + n_doc_ids = rbt_size(query->doc_ids); + } + + if (token->f_len == 0) { + return(query->error); + } + + fts_query_cache(query, token); + + /* Setup the callback args for filtering and + consolidating the ilist. */ + fetch.read_arg = query; + fetch.read_record = fts_query_index_fetch_nodes; + + /* Read the nodes from disk. */ + error = fts_index_fetch_nodes( + trx, &graph, &query->fts_index_table, token, &fetch); + + /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ + ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); + if (error != DB_SUCCESS) { + query->error = error; + } + + que_graph_free(graph); + + if (query->error == DB_SUCCESS) { + + /* The size can't decrease. */ + ut_a(rbt_size(query->doc_ids) >= n_doc_ids); + + /* Calulate the number of doc ids that were added to + the current doc id set. */ + if (query->doc_ids) { + n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids; + } + } + + return(query->error); +} + +/*****************************************************************//** +Depending upon the current query operator process the doc id. +return DB_SUCCESS if all go well +or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */ +static +dberr_t +fts_query_process_doc_id( +/*=====================*/ + fts_query_t* query, /*!< in: query instance */ + doc_id_t doc_id, /*!< in: doc id to process */ + fts_rank_t rank) /*!< in: if non-zero, it is the + rank associated with the doc_id */ +{ + if (query->flags == FTS_OPT_RANKING) { + return(DB_SUCCESS); + } + + switch (query->oper) { + case FTS_NONE: + fts_query_union_doc_id(query, doc_id, rank); + break; + + case FTS_EXIST: + fts_query_intersect_doc_id(query, doc_id, rank); + break; + + case FTS_IGNORE: + fts_query_remove_doc_id(query, doc_id); + break; + + case FTS_NEGATE: + fts_query_change_ranking(query, doc_id, TRUE); + break; + + case FTS_DECR_RATING: + fts_query_union_doc_id(query, doc_id, rank); + fts_query_change_ranking(query, doc_id, TRUE); + break; + + case FTS_INCR_RATING: + fts_query_union_doc_id(query, doc_id, rank); + fts_query_change_ranking(query, doc_id, FALSE); + break; + + default: + ut_error; + } + + if (query->total_size > fts_result_cache_limit) { + return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + } else { + return(DB_SUCCESS); + } +} + +/*****************************************************************//** +Merge two result sets. */ +static +dberr_t +fts_merge_doc_ids( +/*==============*/ + fts_query_t* query, /*!< in,out: query instance */ + const ib_rbt_t* doc_ids) /*!< in: result set to merge */ +{ + const ib_rbt_node_t* node; + + DBUG_ENTER("fts_merge_doc_ids"); + + ut_a(!query->intersection); + + /* To process FTS_EXIST operation (intersection), we need + to create a new result set for fts_query_intersect(). */ + if (query->oper == FTS_EXIST) { + + query->intersection = rbt_create( + sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + + query->total_size += SIZEOF_RBT_CREATE; + } + + /* Merge the elements to the result set. */ + for (node = rbt_first(doc_ids); node; node = rbt_next(doc_ids, node)) { + fts_ranking_t* ranking; + ulint pos = 0; + fts_string_t word; + + ranking = rbt_value(fts_ranking_t, node); + + query->error = fts_query_process_doc_id( + query, ranking->doc_id, ranking->rank); + + if (query->error != DB_SUCCESS) { + if (query->intersection) { + ut_a(query->oper == FTS_EXIST); + fts_query_free_intersection(query); + } + DBUG_RETURN(query->error); + } + + /* Merge words. Don't need to take operator into account. */ + ut_a(ranking->words); + while (fts_ranking_words_get_next(query, ranking, &pos, &word)) { + fts_query_add_word_to_document(query, ranking->doc_id, + &word); + } + } + + /* If it is an intersection operation, reset query->doc_ids + to query->intersection and free the old result list. */ + if (query->oper == FTS_EXIST && query->intersection != NULL) { + fts_query_free_doc_ids(query, query->doc_ids); + query->doc_ids = query->intersection; + query->intersection = NULL; + } + + DBUG_RETURN(DB_SUCCESS); +} + +/*****************************************************************//** +Skip non-whitespace in a string. Move ptr to the next word boundary. +@return pointer to first whitespace character or end */ +UNIV_INLINE +byte* +fts_query_skip_word( +/*================*/ + byte* ptr, /*!< in: start of scan */ + const byte* end) /*!< in: pointer to end of string */ +{ + /* TODO: Does this have to be UTF-8 too ? */ + while (ptr < end && !(ispunct(*ptr) || isspace(*ptr))) { + ++ptr; + } + + return(ptr); +} + +/*****************************************************************//** +Check whether the remaining terms in the phrase match the text. +@return TRUE if matched else FALSE */ +static +ibool +fts_query_match_phrase_terms( +/*=========================*/ + fts_phrase_t* phrase, /*!< in: phrase to match */ + byte** start, /*!< in/out: text to search, we can't + make this const becase we need to + first convert the string to + lowercase */ + const byte* end, /*!< in: pointer to the end of + the string to search */ + mem_heap_t* heap) /*!< in: heap */ +{ + ulint i; + byte* ptr = *start; + const ib_vector_t* tokens = phrase->tokens; + ulint distance = phrase->distance; + + /* We check only from the second term onwards, since the first + must have matched otherwise we wouldn't be here. */ + for (i = 1; ptr < end && i < ib_vector_size(tokens); /* No op */) { + fts_string_t match; + fts_string_t cmp_str; + const fts_string_t* token; + int result; + ulint ret; + + ret = innobase_mysql_fts_get_token( + phrase->charset, ptr, + const_cast<byte*>(end), &match); + + if (match.f_len > 0) { + /* Get next token to match. */ + token = static_cast<const fts_string_t*>( + ib_vector_get_const(tokens, i)); + + fts_string_dup(&cmp_str, &match, heap); + + result = innobase_fts_text_case_cmp( + phrase->charset, token, &cmp_str); + + /* Skip the rest of the tokens if this one doesn't + match and the proximity distance is exceeded. */ + if (result + && (distance == ULINT_UNDEFINED + || distance == 0)) { + + break; + } + + /* This token matched move to the next token. */ + if (result == 0) { + /* Advance the text to search by the length + of the last token. */ + ptr += ret; + + /* Advance to the next token. */ + ++i; + } else { + + ut_a(distance != ULINT_UNDEFINED); + + ptr = fts_query_skip_word(ptr, end); + } + + /* Distance can be 0 for exact matches. */ + if (distance != ULINT_UNDEFINED && distance > 0) { + --distance; + } + } else { + ptr += ret; + } + } + + *start = ptr; + + /* Can't be greater than the number of elements. */ + ut_a(i <= ib_vector_size(tokens)); + + /* This is the case for multiple words. */ + if (i == ib_vector_size(tokens)) { + phrase->found = TRUE; + } + + return(phrase->found); +} + +/*****************************************************************//** +Callback function to count the number of words in position ranges, +and see whether the word count is in specified "phrase->distance" +@return true if the number of characters is less than the "distance" */ +static +bool +fts_proximity_is_word_in_range( +/*===========================*/ + const fts_phrase_t* + phrase, /*!< in: phrase with the search info */ + byte* start, /*!< in: text to search */ + ulint total_len) /*!< in: length of text */ +{ + fts_proximity_t* proximity_pos = phrase->proximity_pos; + + ut_ad(proximity_pos->n_pos == proximity_pos->min_pos.size()); + ut_ad(proximity_pos->n_pos == proximity_pos->max_pos.size()); + + /* Search each matched position pair (with min and max positions) + and count the number of words in the range */ + for (ulint i = 0; i < proximity_pos->n_pos; i++) { + ulint cur_pos = proximity_pos->min_pos[i]; + ulint n_word = 0; + + ut_ad(proximity_pos->max_pos[i] <= total_len); + + /* Walk through words in the range and count them */ + while (cur_pos <= proximity_pos->max_pos[i]) { + ulint len; + fts_string_t str; + + len = innobase_mysql_fts_get_token( + phrase->charset, + start + cur_pos, + start + total_len, &str); + + if (len == 0) { + break; + } + + /* Advances position with "len" bytes */ + cur_pos += len; + + /* Record the number of words */ + if (str.f_n_char > 0) { + n_word++; + } + + if (n_word > phrase->distance) { + break; + } + } + + /* Check if the number of words is less than specified + "distance" */ + if (n_word && n_word <= phrase->distance) { + return(true); + } + } + + return(false); +} + +/*****************************************************************//** +FTS plugin parser 'myql_add_word' callback function for phrase match +Refer to 'st_mysql_ftparser_param' for more detail. +@return 0 if match, or return non-zero */ +static +int +fts_query_match_phrase_add_word_for_parser( +/*=======================================*/ + MYSQL_FTPARSER_PARAM* param, /*!< in: parser param */ + const char* word, /*!< in: token */ + int word_len, /*!< in: token length */ + MYSQL_FTPARSER_BOOLEAN_INFO*) +{ + fts_phrase_param_t* phrase_param; + fts_phrase_t* phrase; + const ib_vector_t* tokens; + fts_string_t match; + fts_string_t cmp_str; + const fts_string_t* token; + int result; + mem_heap_t* heap; + + phrase_param = static_cast<fts_phrase_param_t*>(param->mysql_ftparam); + heap = phrase_param->heap; + phrase = phrase_param->phrase; + tokens = phrase->tokens; + + /* In case plugin parser doesn't check return value */ + if (phrase_param->token_index == ib_vector_size(tokens)) { + return(1); + } + + match.f_str = (uchar *)(word); + match.f_len = ulint(word_len); + match.f_n_char= fts_get_token_size(phrase->charset, word, match.f_len); + + if (match.f_len > 0) { + /* Get next token to match. */ + ut_a(phrase_param->token_index < ib_vector_size(tokens)); + token = static_cast<const fts_string_t*>( + ib_vector_get_const(tokens, phrase_param->token_index)); + + fts_string_dup(&cmp_str, &match, heap); + + result = innobase_fts_text_case_cmp( + phrase->charset, token, &cmp_str); + + if (result == 0) { + phrase_param->token_index++; + } else { + return(1); + } + } + + /* Can't be greater than the number of elements. */ + ut_a(phrase_param->token_index <= ib_vector_size(tokens)); + + /* This is the case for multiple words. */ + if (phrase_param->token_index == ib_vector_size(tokens)) { + phrase->found = TRUE; + } + + return(static_cast<int>(phrase->found)); +} + +/*****************************************************************//** +Check whether the terms in the phrase match the text. +@return TRUE if matched else FALSE */ +static +ibool +fts_query_match_phrase_terms_by_parser( +/*===================================*/ + fts_phrase_param_t* phrase_param, /* in/out: phrase param */ + st_mysql_ftparser* parser, /* in: plugin fts parser */ + byte* text, /* in: text to check */ + ulint len) /* in: text length */ +{ + MYSQL_FTPARSER_PARAM param; + + ut_a(parser); + + /* Set paramters for param */ + param.mysql_parse = fts_tokenize_document_internal; + param.mysql_add_word = fts_query_match_phrase_add_word_for_parser; + param.mysql_ftparam = phrase_param; + param.cs = phrase_param->phrase->charset; + param.doc = reinterpret_cast<char*>(text); + param.length = static_cast<int>(len); + param.mode= MYSQL_FTPARSER_WITH_STOPWORDS; + + PARSER_INIT(parser, ¶m); + parser->parse(¶m); + PARSER_DEINIT(parser, ¶m); + + return(phrase_param->phrase->found); +} + +/*****************************************************************//** +Callback function to fetch and search the document. +@return TRUE if matched else FALSE */ +static +ibool +fts_query_match_phrase( +/*===================*/ + fts_phrase_t* phrase, /*!< in: phrase to match */ + byte* start, /*!< in: text to search, we can't make + this const becase we need to first + convert the string to lowercase */ + ulint cur_len, /*!< in: length of text */ + ulint prev_len, /*!< in: total length for searched + doc fields*/ + mem_heap_t* heap) /* heap */ +{ + ulint i; + const fts_string_t* first; + const byte* end = start + cur_len; + const ib_vector_t* tokens = phrase->tokens; + const ib_vector_t* positions = phrase->match->positions; + + ut_a(!phrase->found); + ut_a(phrase->match->doc_id > 0); + ut_a(ib_vector_size(tokens) > 0); + ut_a(ib_vector_size(positions) > 0); + + first = static_cast<const fts_string_t*>( + ib_vector_get_const(tokens, 0)); + + ut_a(phrase->match->start < ib_vector_size(positions)); + + for (i = phrase->match->start; i < ib_vector_size(positions); ++i) { + ulint pos; + byte* ptr = start; + + pos = *(ulint*) ib_vector_get_const(positions, i); + + if (pos == ULINT_UNDEFINED) { + break; + } + + if (pos < prev_len) { + continue; + } + + /* Document positions are calculated from the beginning + of the first field, need to save the length for each + searched field to adjust the doc position when search + phrases. */ + pos -= prev_len; + ptr = start + pos; + + /* Within limits ? */ + if (ptr >= end) { + break; + } + + if (phrase->parser) { + fts_phrase_param_t phrase_param; + + phrase_param.phrase = phrase; + phrase_param.token_index = 0; + phrase_param.heap = heap; + + if (fts_query_match_phrase_terms_by_parser( + &phrase_param, + phrase->parser, + ptr, + ulint(end - ptr))) { + break; + } + } else { + fts_string_t match; + fts_string_t cmp_str; + ulint ret; + + match.f_str = ptr; + ret = innobase_mysql_fts_get_token( + phrase->charset, start + pos, + const_cast<byte*>(end), &match); + + if (match.f_len == 0) { + break; + } + + fts_string_dup(&cmp_str, &match, heap); + + if (innobase_fts_text_case_cmp( + phrase->charset, first, &cmp_str) == 0) { + + /* This is the case for the single word + in the phrase. */ + if (ib_vector_size(phrase->tokens) == 1) { + phrase->found = TRUE; + break; + } + + ptr += ret; + + /* Match the remaining terms in the phrase. */ + if (fts_query_match_phrase_terms(phrase, &ptr, + end, heap)) { + break; + } + } + } + } + + return(phrase->found); +} + +/*****************************************************************//** +Callback function to fetch and search the document. +@return whether the phrase is found */ +static +ibool +fts_query_fetch_document( +/*=====================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: fts_doc_t* */ +{ + + que_node_t* exp; + sel_node_t* node = static_cast<sel_node_t*>(row); + fts_phrase_t* phrase = static_cast<fts_phrase_t*>(user_arg); + ulint prev_len = 0; + ulint total_len = 0; + byte* document_text = NULL; + + exp = node->select_list; + + phrase->found = FALSE; + + /* For proximity search, we will need to get the whole document + from all fields, so first count the total length of the document + from all the fields */ + if (phrase->proximity_pos) { + while (exp) { + ulint field_len; + dfield_t* dfield = que_node_get_val(exp); + byte* data = static_cast<byte*>( + dfield_get_data(dfield)); + + if (dfield_is_ext(dfield)) { + ulint local_len = dfield_get_len(dfield); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + field_len = mach_read_from_4( + data + local_len + BTR_EXTERN_LEN + 4); + } else { + field_len = dfield_get_len(dfield); + } + + if (field_len != UNIV_SQL_NULL) { + total_len += field_len + 1; + } + + exp = que_node_get_next(exp); + } + + document_text = static_cast<byte*>(mem_heap_zalloc( + phrase->heap, total_len)); + + if (!document_text) { + return(FALSE); + } + } + + exp = node->select_list; + + while (exp) { + dfield_t* dfield = que_node_get_val(exp); + byte* data = static_cast<byte*>( + dfield_get_data(dfield)); + ulint cur_len; + + if (dfield_is_ext(dfield)) { + data = btr_copy_externally_stored_field( + &cur_len, data, phrase->zip_size, + dfield_get_len(dfield), phrase->heap); + } else { + cur_len = dfield_get_len(dfield); + } + + if (cur_len != UNIV_SQL_NULL && cur_len != 0) { + if (phrase->proximity_pos) { + ut_ad(prev_len + cur_len <= total_len); + memcpy(document_text + prev_len, data, cur_len); + } else { + /* For phrase search */ + phrase->found = + fts_query_match_phrase( + phrase, + static_cast<byte*>(data), + cur_len, prev_len, + phrase->heap); + } + + /* Document positions are calculated from the beginning + of the first field, need to save the length for each + searched field to adjust the doc position when search + phrases. */ + prev_len += cur_len + 1; + } + + if (phrase->found) { + break; + } + + exp = que_node_get_next(exp); + } + + if (phrase->proximity_pos) { + ut_ad(prev_len <= total_len); + + phrase->found = fts_proximity_is_word_in_range( + phrase, document_text, total_len); + } + + return(phrase->found); +} + +#if 0 +/******************************************************************** +Callback function to check whether a record was found or not. */ +static +ibool +fts_query_select( +/*=============*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: fts_doc_t* */ +{ + int i; + que_node_t* exp; + sel_node_t* node = row; + fts_select_t* select = user_arg; + + ut_a(select->word_freq); + ut_a(select->word_freq->doc_freqs); + + exp = node->select_list; + + for (i = 0; exp && !select->found; ++i) { + dfield_t* dfield = que_node_get_val(exp); + void* data = dfield_get_data(dfield); + ulint len = dfield_get_len(dfield); + + switch (i) { + case 0: /* DOC_COUNT */ + if (len != UNIV_SQL_NULL && len != 0) { + + select->word_freq->doc_count += + mach_read_from_4(data); + } + break; + + case 1: /* ILIST */ + if (len != UNIV_SQL_NULL && len != 0) { + + fts_query_find_doc_id(select, data, len); + } + break; + + default: + ut_error; + } + + exp = que_node_get_next(exp); + } + + return(FALSE); +} + +/******************************************************************** +Read the rows from the FTS index, that match word and where the +doc id is between first and last doc id. +@return DB_SUCCESS if all go well else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_find_term( +/*================*/ + fts_query_t* query, /*!< in: FTS query state */ + que_t** graph, /*!< in: prepared statement */ + const fts_string_t* word, /*!< in: the word to fetch */ + doc_id_t doc_id, /*!< in: doc id to match */ + ulint* min_pos,/*!< in/out: pos found must be + greater than this minimum value. */ + ibool* found) /*!< out: TRUE if found else FALSE */ +{ + pars_info_t* info; + dberr_t error; + fts_select_t select; + doc_id_t match_doc_id; + trx_t* trx = query->trx; + char table_name[MAX_FULL_NAME_LEN]; + + trx->op_info = "fetching FTS index matching nodes"; + + if (*graph) { + info = (*graph)->info; + } else { + ulint selected; + + info = pars_info_create(); + + selected = fts_select_index(*word->f_str); + query->fts_index_table.suffix = fts_get_suffix(selected); + + fts_get_table_name(&query->fts_index_table, table_name); + pars_info_bind_id(info, "index_table_name", table_name); + } + + select.found = FALSE; + select.doc_id = doc_id; + select.min_pos = *min_pos; + select.word_freq = fts_query_add_word_freq(query, word->f_str); + + pars_info_bind_function(info, "my_func", fts_query_select, &select); + pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len); + + /* Convert to "storage" byte order. */ + fts_write_doc_id((byte*) &match_doc_id, doc_id); + + fts_bind_doc_id(info, "min_doc_id", &match_doc_id); + + fts_bind_doc_id(info, "max_doc_id", &match_doc_id); + + if (!*graph) { + + *graph = fts_parse_sql( + &query->fts_index_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT doc_count, ilist\n" + " FROM $index_table_name\n" + " WHERE word LIKE :word AND" + " first_doc_id <= :min_doc_id AND" + " last_doc_id >= :max_doc_id\n" + " ORDER BY first_doc_id;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + } + + for (;;) { + error = fts_eval_sql(trx, *graph); + + if (error == DB_SUCCESS) { + + break; /* Exit the loop. */ + } else { + + if (error == DB_LOCK_WAIT_TIMEOUT) { + ib::warn() << "lock wait timeout reading FTS" + " index. Retrying!"; + + trx->error_state = DB_SUCCESS; + } else { + ib::error() << error + << " while reading FTS index."; + + break; /* Exit the loop. */ + } + } + } + + /* Value to return */ + *found = select.found; + + if (*found) { + *min_pos = select.min_pos; + } + + return(error); +} + +/******************************************************************** +Callback aggregator for int columns. */ +static +ibool +fts_query_sum( +/*==========*/ + /*!< out: always returns TRUE */ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: ulint* */ +{ + + que_node_t* exp; + sel_node_t* node = row; + ulint* total = user_arg; + + exp = node->select_list; + + while (exp) { + dfield_t* dfield = que_node_get_val(exp); + void* data = dfield_get_data(dfield); + ulint len = dfield_get_len(dfield); + + if (len != UNIV_SQL_NULL && len != 0) { + *total += mach_read_from_4(data); + } + + exp = que_node_get_next(exp); + } + + return(TRUE); +} + +/******************************************************************** +Calculate the total documents that contain a particular word (term). +@return DB_SUCCESS if all go well else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_total_docs_containing_term( +/*=================================*/ + fts_query_t* query, /*!< in: FTS query state */ + const fts_string_t* word, /*!< in: the word to check */ + ulint* total) /*!< out: documents containing word */ +{ + pars_info_t* info; + dberr_t error; + que_t* graph; + ulint selected; + trx_t* trx = query->trx; + char table_name[MAX_FULL_NAME_LEN] + + trx->op_info = "fetching FTS index document count"; + + *total = 0; + + info = pars_info_create(); + + pars_info_bind_function(info, "my_func", fts_query_sum, total); + pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len); + + selected = fts_select_index(*word->f_str); + + query->fts_index_table.suffix = fts_get_suffix(selected); + + fts_get_table_name(&query->fts_index_table, table_name); + + pars_info_bind_id(info, "index_table_name", table_name); + + graph = fts_parse_sql( + &query->fts_index_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT doc_count\n" + " FROM $index_table_name\n" + " WHERE word = :word" + " ORDER BY first_doc_id;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + for (;;) { + error = fts_eval_sql(trx, graph); + + if (error == DB_SUCCESS) { + + break; /* Exit the loop. */ + } else { + + if (error == DB_LOCK_WAIT_TIMEOUT) { + ib::warn() << "lock wait timeout reading FTS" + " index. Retrying!"; + + trx->error_state = DB_SUCCESS; + } else { + ib::error() << error + << " while reading FTS index."; + + break; /* Exit the loop. */ + } + } + } + + que_graph_free(graph); + + return(error); +} + +/******************************************************************** +Get the total number of words in a documents. +@return DB_SUCCESS if all go well else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_terms_in_document( +/*========================*/ + fts_query_t* query, /*!< in: FTS query state */ + doc_id_t doc_id, /*!< in: the word to check */ + ulint* total) /*!< out: total words in document */ +{ + pars_info_t* info; + dberr_t error; + que_t* graph; + doc_id_t read_doc_id; + trx_t* trx = query->trx; + char table_name[MAX_FULL_NAME_LEN]; + + trx->op_info = "fetching FTS document term count"; + + *total = 0; + + info = pars_info_create(); + + pars_info_bind_function(info, "my_func", fts_query_sum, total); + + /* Convert to "storage" byte order. */ + fts_write_doc_id((byte*) &read_doc_id, doc_id); + fts_bind_doc_id(info, "doc_id", &read_doc_id); + + query->fts_index_table.suffix = "DOC_ID"; + + fts_get_table_name(&query->fts_index_table, table_name); + + pars_info_bind_id(info, "index_table_name", table_name); + + graph = fts_parse_sql( + &query->fts_index_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT count\n" + " FROM $index_table_name\n" + " WHERE doc_id = :doc_id" + " BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + for (;;) { + error = fts_eval_sql(trx, graph); + + if (error == DB_SUCCESS) { + + break; /* Exit the loop. */ + } else { + + if (error == DB_LOCK_WAIT_TIMEOUT) { + ib::warn() << "lock wait timeout reading FTS" + " doc id table. Retrying!"; + + trx->error_state = DB_SUCCESS; + } else { + ib::error() << error << " while reading FTS" + " doc id table."; + + break; /* Exit the loop. */ + } + } + } + + que_graph_free(graph); + + return(error); +} +#endif + +/*****************************************************************//** +Retrieve the document and match the phrase tokens. +@return DB_SUCCESS or error code */ +MY_ATTRIBUTE((nonnull(1,2,3,6), warn_unused_result)) +static +dberr_t +fts_query_match_document( +/*=====================*/ + ib_vector_t* tokens, /*!< in: phrase tokens */ + fts_get_doc_t* get_doc, /*!< in: table and prepared statements */ + fts_match_t* match, /*!< in: doc id and positions */ + ulint distance, /*!< in: proximity distance */ + st_mysql_ftparser* parser, /*!< in: fts plugin parser */ + ibool* found) /*!< out: TRUE if phrase found */ +{ + dberr_t error; + fts_phrase_t phrase(get_doc->index_cache->index->table); + + phrase.match = match; /* Positions to match */ + phrase.tokens = tokens; /* Tokens to match */ + phrase.distance = distance; + phrase.charset = get_doc->index_cache->charset; + phrase.heap = mem_heap_create(512); + phrase.parser = parser; + + *found = phrase.found = FALSE; + + error = fts_doc_fetch_by_doc_id( + get_doc, match->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL, + fts_query_fetch_document, &phrase); + + if (UNIV_UNLIKELY(error != DB_SUCCESS)) { + ib::error() << "(" << error << ") matching document."; + } else { + *found = phrase.found; + } + + mem_heap_free(phrase.heap); + + return(error); +} + +/*****************************************************************//** +This function fetches the original documents and count the +words in between matching words to see that is in specified distance +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +fts_query_is_in_proximity_range( +/*============================*/ + const fts_query_t* query, /*!< in: query instance */ + fts_match_t** match, /*!< in: query instance */ + fts_proximity_t* qualified_pos) /*!< in: position info for + qualified ranges */ +{ + fts_get_doc_t get_doc; + fts_cache_t* cache = query->index->table->fts->cache; + dberr_t err; + + memset(&get_doc, 0x0, sizeof(get_doc)); + + mysql_mutex_lock(&cache->lock); + get_doc.index_cache = fts_find_index_cache(cache, query->index); + mysql_mutex_unlock(&cache->lock); + ut_a(get_doc.index_cache != NULL); + + fts_phrase_t phrase(get_doc.index_cache->index->table); + + phrase.distance = query->distance; + phrase.charset = get_doc.index_cache->charset; + phrase.heap = mem_heap_create(512); + phrase.proximity_pos = qualified_pos; + phrase.found = FALSE; + + err = fts_doc_fetch_by_doc_id( + &get_doc, match[0]->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL, + fts_query_fetch_document, &phrase); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + ib::error() << "(" << err << ") in verification" + " phase of proximity search"; + } + + /* Free the prepared statement. */ + if (get_doc.get_document_graph) { + que_graph_free(get_doc.get_document_graph); + get_doc.get_document_graph = NULL; + } + + mem_heap_free(phrase.heap); + + return(err == DB_SUCCESS && phrase.found); +} + +/*****************************************************************//** +Iterate over the matched document ids and search the for the +actual phrase in the text. +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_search_phrase( +/*====================*/ + fts_query_t* query, /*!< in: query instance */ + ib_vector_t* orig_tokens, /*!< in: tokens to search, + with any stopwords in the + original phrase */ + ib_vector_t* tokens) /*!< in: tokens that does + not include stopwords and + can be used to calculate + ranking */ +{ + ulint i; + fts_get_doc_t get_doc; + ulint n_matched; + fts_cache_t* cache = query->index->table->fts->cache; + + n_matched = ib_vector_size(query->matched); + + /* Setup the doc retrieval infrastructure. */ + memset(&get_doc, 0x0, sizeof(get_doc)); + + mysql_mutex_lock(&cache->lock); + + get_doc.index_cache = fts_find_index_cache(cache, query->index); + + /* Must find the index cache */ + ut_a(get_doc.index_cache != NULL); + + mysql_mutex_unlock(&cache->lock); + +#ifdef FTS_INTERNAL_DIAG_PRINT + ib::info() << "Start phrase search"; +#endif + + /* Read the document from disk and do the actual + match, matching documents will be added to the current + doc id set. */ + for (i = 0; i < n_matched && query->error == DB_SUCCESS; ++i) { + fts_match_t* match; + ibool found = FALSE; + + match = static_cast<fts_match_t*>( + ib_vector_get(query->matched, i)); + + /* Skip the document ids that were filtered out by + an earlier pass. */ + if (match->doc_id != 0) { + + query->error = fts_query_match_document( + orig_tokens, &get_doc, match, + query->distance, query->parser, &found); + + if (query->error == DB_SUCCESS && found) { + ulint z; + + query->error = fts_query_process_doc_id(query, + match->doc_id, 0); + if (query->error != DB_SUCCESS) { + goto func_exit; + } + + for (z = 0; z < ib_vector_size(tokens); z++) { + fts_string_t* token; + token = static_cast<fts_string_t*>( + ib_vector_get(tokens, z)); + fts_query_add_word_to_document( + query, match->doc_id, token); + } + } + } + } + +func_exit: + /* Free the prepared statement. */ + if (get_doc.get_document_graph) { + que_graph_free(get_doc.get_document_graph); + get_doc.get_document_graph = NULL; + } + + return(query->error); +} + +/** Split the phrase into tokens +@param[in,out] query query instance +@param[in] node query node to search +@param[in,out] tokens token vector +@param[in,out] orig_tokens original node tokens include stopword +@param[in,out] heap mem heap */ +static +void +fts_query_phrase_split( + fts_query_t* query, + const fts_ast_node_t* node, + ib_vector_t* tokens, + ib_vector_t* orig_tokens, + mem_heap_t* heap) +{ + fts_string_t phrase; + ulint len = 0; + ulint cur_pos = 0; + fts_ast_node_t* term_node = NULL; + + if (node->type == FTS_AST_TEXT) { + phrase.f_str = node->text.ptr->str; + phrase.f_len = node->text.ptr->len; + len = phrase.f_len; + } else { + ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST); + phrase.f_str = NULL; + phrase.f_len = 0; + term_node = node->list.head; + } + + while (true) { + fts_cache_t* cache = query->index->table->fts->cache; + ulint cur_len; + fts_string_t result_str; + + if (node->type == FTS_AST_TEXT) { + if (cur_pos >= len) { + break; + } + + cur_len = innobase_mysql_fts_get_token( + query->fts_index_table.charset, + reinterpret_cast<const byte*>(phrase.f_str) + + cur_pos, + reinterpret_cast<const byte*>(phrase.f_str) + + len, + &result_str); + + if (cur_len == 0) { + break; + } + + cur_pos += cur_len; + } else { + ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST); + /* Term node in parser phrase list */ + if (term_node == NULL) { + break; + } + + ut_a(term_node->type == FTS_AST_TERM); + result_str.f_str = term_node->term.ptr->str; + result_str.f_len = term_node->term.ptr->len; + result_str.f_n_char = fts_get_token_size( + query->fts_index_table.charset, + reinterpret_cast<char*>(result_str.f_str), + result_str.f_len); + + term_node = term_node->next; + } + + if (result_str.f_n_char == 0) { + continue; + } + + fts_string_t* token = static_cast<fts_string_t*>( + ib_vector_push(tokens, NULL)); + fts_string_dup(token, &result_str, heap); + + if (fts_check_token( + &result_str, + cache->stopword_info.cached_stopword, + query->fts_index_table.charset)) { + /* Add the word to the RB tree so that we can + calculate it's frequencey within a document. */ + fts_query_add_word_freq(query, token); + } else { + ib_vector_pop(tokens); + } + + /* we will start to store all words including stopwords + in the "orig_tokens" vector, but skip any leading words + that are stopwords */ + if (!ib_vector_is_empty(tokens)) { + fts_string_t* orig_token = static_cast<fts_string_t*>( + ib_vector_push(orig_tokens, NULL)); + + orig_token->f_str = token->f_str; + orig_token->f_len = token->f_len; + } + } +} + +/*****************************************************************//** +Text/Phrase search. +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +fts_query_phrase_search( +/*====================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_ast_node_t* node) /*!< in: node to search */ +{ + ib_vector_t* tokens; + ib_vector_t* orig_tokens; + mem_heap_t* heap = mem_heap_create(sizeof(fts_string_t)); + ib_alloc_t* heap_alloc; + ulint num_token; + + heap_alloc = ib_heap_allocator_create(heap); + + tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4); + orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4); + + if (query->distance != ULINT_UNDEFINED && query->distance > 0) { + query->flags = FTS_PROXIMITY; + } else { + query->flags = FTS_PHRASE; + } + + /* Split the phrase into tokens. */ + fts_query_phrase_split(query, node, tokens, orig_tokens, heap); + + num_token = ib_vector_size(tokens); + if (num_token > MAX_PROXIMITY_ITEM) { + query->error = DB_FTS_TOO_MANY_WORDS_IN_PHRASE; + goto func_exit; + } + + ut_ad(ib_vector_size(orig_tokens) >= num_token); + + /* Ignore empty strings. */ + if (num_token > 0) { + fts_string_t* token = NULL; + fts_fetch_t fetch; + trx_t* trx = query->trx; + fts_ast_oper_t oper = query->oper; + que_t* graph = NULL; + ulint i; + dberr_t error; + + /* Create the vector for storing matching document ids + and the positions of the first token of the phrase. */ + if (!query->matched) { + ib_alloc_t* heap_alloc; + + heap_alloc = ib_heap_allocator_create(heap); + + if (!(query->flags & FTS_PROXIMITY) + && !(query->flags & FTS_PHRASE)) { + query->matched = ib_vector_create( + heap_alloc, sizeof(fts_match_t), + 64); + } else { + ut_a(num_token <= MAX_PROXIMITY_ITEM); + query->match_array = + (ib_vector_t**) mem_heap_alloc( + heap, + num_token * + sizeof(query->matched)); + + for (i = 0; i < num_token; i++) { + query->match_array[i] = + ib_vector_create( + heap_alloc, sizeof(fts_match_t), + 64); + } + + query->matched = query->match_array[0]; + } + } + + /* Setup the callback args for filtering and consolidating + the ilist. */ + fetch.read_arg = query; + fetch.read_record = fts_query_index_fetch_nodes; + + for (i = 0; i < num_token; i++) { + /* Search for the first word from the phrase. */ + token = static_cast<fts_string_t*>( + ib_vector_get(tokens, i)); + + if (query->flags & FTS_PROXIMITY + || query->flags & FTS_PHRASE) { + query->matched = query->match_array[i]; + } + + error = fts_index_fetch_nodes( + trx, &graph, &query->fts_index_table, + token, &fetch); + + /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ + ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); + if (error != DB_SUCCESS) { + query->error = error; + } + + que_graph_free(graph); + graph = NULL; + + fts_query_cache(query, token); + + if (!(query->flags & FTS_PHRASE) + && !(query->flags & FTS_PROXIMITY)) { + break; + } + + /* If any of the token can't be found, + no need to continue match */ + if (ib_vector_is_empty(query->match_array[i]) + || query->error != DB_SUCCESS) { + goto func_exit; + } + } + + /* Just a single word, no need to fetch the original + documents to do phrase matching */ + if (ib_vector_size(orig_tokens) == 1 + && !ib_vector_is_empty(query->match_array[0])) { + fts_match_t* match; + ulint n_matched; + + n_matched = ib_vector_size(query->match_array[0]); + + for (i = 0; i < n_matched; i++) { + match = static_cast<fts_match_t*>( + ib_vector_get( + query->match_array[0], i)); + + query->error = fts_query_process_doc_id( + query, match->doc_id, 0); + if (query->error != DB_SUCCESS) { + goto func_exit; + } + + fts_query_add_word_to_document( + query, match->doc_id, token); + } + query->oper = oper; + goto func_exit; + } + + /* If we are doing proximity search, verify the distance + between all words, and check they are in specified distance. */ + if (query->flags & FTS_PROXIMITY) { + fts_phrase_or_proximity_search(query, tokens); + } else { + ibool matched; + + /* Phrase Search case: + We filter out the doc ids that don't contain + all the tokens in the phrase. It's cheaper to + search the ilist than bringing the documents in + and then doing a search through the text. Isolated + testing shows this also helps in mitigating disruption + of the buffer cache. */ + matched = fts_phrase_or_proximity_search(query, tokens); + query->matched = query->match_array[0]; + + /* Read the actual text in and search for the phrase. */ + if (matched) { + ut_ad(query->error == DB_SUCCESS); + query->error = fts_query_search_phrase( + query, orig_tokens, tokens); + } + } + + /* Restore original operation. */ + query->oper = oper; + + if (query->error != DB_SUCCESS) { + goto func_exit; + } + } + +func_exit: + mem_heap_free(heap); + + /* Don't need it anymore. */ + query->matched = NULL; + + return(query->error); +} + +/*****************************************************************//** +Find the word and evaluate. +@return DB_SUCCESS if all go well */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_execute( +/*==============*/ + fts_query_t* query, /*!< in: query instance */ + fts_string_t* token) /*!< in: token to search */ +{ + switch (query->oper) { + case FTS_NONE: + case FTS_NEGATE: + case FTS_INCR_RATING: + case FTS_DECR_RATING: + query->error = fts_query_union(query, token); + break; + + case FTS_EXIST: + query->error = fts_query_intersect(query, token); + break; + + case FTS_IGNORE: + query->error = fts_query_difference(query, token); + break; + + default: + ut_error; + } + + return(query->error); +} + +/*****************************************************************//** +Create a wildcard string. It's the responsibility of the caller to +free the byte* pointer. It's allocated using ut_malloc_nokey(). +@return ptr to allocated memory */ +static +byte* +fts_query_get_token( +/*================*/ + fts_ast_node_t* node, /*!< in: the current sub tree */ + fts_string_t* token) /*!< in: token to create */ +{ + ulint str_len; + byte* new_ptr = NULL; + + str_len = node->term.ptr->len; + + ut_a(node->type == FTS_AST_TERM); + + token->f_len = str_len; + token->f_str = node->term.ptr->str; + + if (node->term.wildcard) { + + token->f_str = static_cast<byte*>(ut_malloc_nokey(str_len + 2)); + token->f_len = str_len + 1; + + memcpy(token->f_str, node->term.ptr->str, str_len); + + token->f_str[str_len] = '%'; + token->f_str[token->f_len] = 0; + + new_ptr = token->f_str; + } + + return(new_ptr); +} + +static dberr_t fts_ast_visit_sub_exp(fts_ast_node_t*, fts_ast_callback, void*); + +/*****************************************************************//** +Visit every node of the AST. */ +static +dberr_t +fts_query_visitor( +/*==============*/ + fts_ast_oper_t oper, /*!< in: current operator */ + fts_ast_node_t* node, /*!< in: The root of the current subtree*/ + void* arg) /*!< in: callback arg*/ +{ + byte* ptr; + fts_string_t token; + fts_query_t* query = static_cast<fts_query_t*>(arg); + + ut_a(node); + DBUG_ENTER("fts_query_visitor"); + DBUG_PRINT("fts", ("nodetype: %s", fts_ast_node_type_get(node->type))); + + token.f_n_char = 0; + query->oper = oper; + query->cur_node = node; + + switch (node->type) { + case FTS_AST_TEXT: + case FTS_AST_PARSER_PHRASE_LIST: + + if (query->oper == FTS_EXIST) { + ut_ad(query->intersection == NULL); + query->intersection = rbt_create( + sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + + query->total_size += SIZEOF_RBT_CREATE; + } + + /* Set the current proximity distance. */ + query->distance = node->text.distance; + + /* Force collection of doc ids and the positions. */ + query->collect_positions = TRUE; + + query->error = fts_query_phrase_search(query, node); + + query->collect_positions = FALSE; + + if (query->oper == FTS_EXIST) { + fts_query_free_doc_ids(query, query->doc_ids); + query->doc_ids = query->intersection; + query->intersection = NULL; + } + + break; + + case FTS_AST_TERM: + token.f_str = node->term.ptr->str; + token.f_len = node->term.ptr->len; + + /* Collect wildcard words for QUERY EXPANSION. */ + if (node->term.wildcard && query->wildcard_words != NULL) { + ib_rbt_bound_t parent; + + if (rbt_search(query->wildcard_words, &parent, &token) + != 0) { + fts_string_t word; + + fts_string_dup(&word, &token, query->heap); + rbt_add_node(query->wildcard_words, &parent, + &word); + } + } + + /* Add the word to our RB tree that will be used to + calculate this terms per document frequency. */ + fts_query_add_word_freq(query, &token); + + ptr = fts_query_get_token(node, &token); + query->error = fts_query_execute(query, &token); + + if (ptr) { + ut_free(ptr); + } + + break; + + case FTS_AST_SUBEXP_LIST: + query->error = fts_ast_visit_sub_exp(node, fts_query_visitor, arg); + break; + + default: + ut_error; + } + + if (query->oper == FTS_EXIST) { + query->multi_exist = true; + } + + DBUG_RETURN(query->error); +} + +/** Process (nested) sub-expression, create a new result set to store the +sub-expression result by processing nodes under current sub-expression +list. Merge the sub-expression result with that of parent expression list. +@param[in,out] node current root node +@param[in,out] visitor callback function +@param[in,out] arg argument for callback +@return DB_SUCCESS if all go well */ +static +dberr_t +fts_ast_visit_sub_exp( + fts_ast_node_t* node, + fts_ast_callback visitor, + void* arg) +{ + fts_ast_oper_t cur_oper; + fts_query_t* query = static_cast<fts_query_t*>(arg); + ib_rbt_t* parent_doc_ids; + ib_rbt_t* subexpr_doc_ids; + dberr_t error = DB_SUCCESS; + bool will_be_ignored = false; + bool multi_exist; + + DBUG_ENTER("fts_ast_visit_sub_exp"); + + ut_a(node->type == FTS_AST_SUBEXP_LIST); + + /* To avoid stack overflow, we limit the mutual recursion + depth between fts_ast_visit(), fts_query_visitor() and + fts_ast_visit_sub_exp(). */ + if (query->visiting_sub_exp++ > 31) { + query->error = DB_OUT_OF_MEMORY; + DBUG_RETURN(query->error); + } + + cur_oper = query->oper; + + /* Save current result set */ + parent_doc_ids = query->doc_ids; + + /* Create new result set to store the sub-expression result. We + will merge this result set with the parent after processing. */ + query->doc_ids = rbt_create(sizeof(fts_ranking_t), + fts_ranking_doc_id_cmp); + + query->total_size += SIZEOF_RBT_CREATE; + + multi_exist = query->multi_exist; + query->multi_exist = false; + /* Process nodes in current sub-expression and store its + result set in query->doc_ids we created above. */ + error = fts_ast_visit(FTS_NONE, node, visitor, + arg, &will_be_ignored); + + /* Reinstate parent node state */ + query->multi_exist = multi_exist; + query->oper = cur_oper; + query->visiting_sub_exp--; + + /* Merge the sub-expression result with the parent result set. */ + subexpr_doc_ids = query->doc_ids; + query->doc_ids = parent_doc_ids; + if (error == DB_SUCCESS) { + error = fts_merge_doc_ids(query, subexpr_doc_ids); + } + + /* Free current result set. Result already merged into parent. */ + fts_query_free_doc_ids(query, subexpr_doc_ids); + + DBUG_RETURN(error); +} + +#if 0 +/*****************************************************************//*** +Check if the doc id exists in the ilist. +@return TRUE if doc id found */ +static +ulint +fts_query_find_doc_id( +/*==================*/ + fts_select_t* select, /*!< in/out: contains the doc id to + find, we update the word freq if + document found */ + void* data, /*!< in: doc id ilist */ + ulint len) /*!< in: doc id ilist size */ +{ + byte* ptr = data; + doc_id_t doc_id = 0; + ulint decoded = 0; + + /* Decode the ilist and search for selected doc_id. We also + calculate the frequency of the word in the document if found. */ + while (decoded < len && !select->found) { + ulint freq = 0; + ulint min_pos = 0; + ulint last_pos = 0; + ulint pos = fts_decode_vlc(&ptr); + + /* Add the delta. */ + doc_id += pos; + + while (*ptr) { + ++freq; + last_pos += fts_decode_vlc(&ptr); + + /* Only if min_pos is not set and the current + term exists in a position greater than the + min_pos of the previous term. */ + if (min_pos == 0 && last_pos > select->min_pos) { + min_pos = last_pos; + } + } + + /* Skip the end of word position marker. */ + ++ptr; + + /* Bytes decoded so far. */ + decoded = ptr - (byte*) data; + + /* A word may exist in the document but we only consider a + match if it exists in a position that is greater than the + position of the previous term. */ + if (doc_id == select->doc_id && min_pos > 0) { + fts_doc_freq_t* doc_freq; + + /* Add the doc id to the doc freq rb tree, if + the doc id doesn't exist it will be created. */ + doc_freq = fts_query_add_doc_freq( + select->word_freq->doc_freqs, doc_id); + + /* Avoid duplicating the frequency tally */ + if (doc_freq->freq == 0) { + doc_freq->freq = freq; + } + + select->found = TRUE; + select->min_pos = min_pos; + } + } + + return(select->found); +} +#endif + +/*****************************************************************//** +Read and filter nodes. +@return DB_SUCCESS if all go well, +or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */ +static +dberr_t +fts_query_filter_doc_ids( +/*=====================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_string_t* word, /*!< in: the current word */ + fts_word_freq_t* word_freq, /*!< in/out: word frequency */ + const fts_node_t* node, /*!< in: current FTS node */ + void* data, /*!< in: doc id ilist */ + ulint len, /*!< in: doc id ilist size */ + ibool calc_doc_count) /*!< in: whether to remember doc count */ +{ + const byte* ptr = static_cast<byte*>(data); + doc_id_t doc_id = 0; + ulint decoded = 0; + ib_rbt_t* doc_freqs = word_freq->doc_freqs; + + /* Decode the ilist and add the doc ids to the query doc_id set. */ + while (decoded < len) { + ulint freq = 0; + fts_doc_freq_t* doc_freq; + fts_match_t* match = NULL; + doc_id_t last_pos = 0; + doc_id_t pos = fts_decode_vlc(&ptr); + + /* Some sanity checks. */ + if (doc_id == 0) { + ut_a(pos == node->first_doc_id); + } + + /* Add the delta. */ + doc_id += pos; + + if (calc_doc_count) { + word_freq->doc_count++; + } + + /* We simply collect the matching instances here. */ + if (query->collect_positions) { + ib_alloc_t* heap_alloc; + + /* Create a new fts_match_t instance. */ + match = static_cast<fts_match_t*>( + ib_vector_push(query->matched, NULL)); + + match->start = 0; + match->doc_id = doc_id; + heap_alloc = ib_vector_allocator(query->matched); + + /* Allocate from the same heap as the + parent container. */ + match->positions = ib_vector_create( + heap_alloc, sizeof(ulint), 64); + + query->total_size += sizeof(fts_match_t) + + sizeof(ib_vector_t) + + sizeof(ulint) * 64; + } + + /* Unpack the positions within the document. */ + while (*ptr) { + last_pos += fts_decode_vlc(&ptr); + + /* Collect the matching word positions, for phrase + matching later. */ + if (query->collect_positions) { + ib_vector_push(match->positions, &last_pos); + } + + ++freq; + } + + /* End of list marker. */ + last_pos = (ulint) -1; + + if (query->collect_positions) { + ut_a(match != NULL); + ib_vector_push(match->positions, &last_pos); + } + + /* Add the doc id to the doc freq rb tree, if the doc id + doesn't exist it will be created. */ + doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id); + + /* Avoid duplicating frequency tally. */ + if (doc_freq->freq == 0) { + doc_freq->freq = freq; + } + + /* Skip the end of word position marker. */ + ++ptr; + + /* Bytes decoded so far */ + decoded = ulint(ptr - (byte*) data); + + /* We simply collect the matching documents and the + positions here and match later. */ + if (!query->collect_positions) { + /* We ignore error here and will check it later */ + fts_query_process_doc_id(query, doc_id, 0); + + /* Add the word to the document's matched RB tree. */ + fts_query_add_word_to_document(query, doc_id, word); + } + } + + /* Some sanity checks. */ + ut_a(doc_id == node->last_doc_id); + + if (query->total_size > fts_result_cache_limit) { + return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + } else { + return(DB_SUCCESS); + } +} + +/*****************************************************************//** +Read the FTS INDEX row. +@return DB_SUCCESS if all go well. */ +static +dberr_t +fts_query_read_node( +/*================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_string_t* word, /*!< in: current word */ + que_node_t* exp) /*!< in: query graph node */ +{ + int i; + int ret; + fts_node_t node; + ib_rbt_bound_t parent; + fts_word_freq_t* word_freq; + ibool skip = FALSE; + fts_string_t term; + byte buf[FTS_MAX_WORD_LEN + 1]; + dberr_t error = DB_SUCCESS; + + ut_a(query->cur_node->type == FTS_AST_TERM + || query->cur_node->type == FTS_AST_TEXT + || query->cur_node->type == FTS_AST_PARSER_PHRASE_LIST); + + memset(&node, 0, sizeof(node)); + term.f_str = buf; + + /* Need to consider the wildcard search case, the word frequency + is created on the search string not the actual word. So we need + to assign the frequency on search string behalf. */ + if (query->cur_node->type == FTS_AST_TERM + && query->cur_node->term.wildcard) { + + term.f_len = query->cur_node->term.ptr->len; + ut_ad(FTS_MAX_WORD_LEN >= term.f_len); + memcpy(term.f_str, query->cur_node->term.ptr->str, term.f_len); + } else { + term.f_len = word->f_len; + ut_ad(FTS_MAX_WORD_LEN >= word->f_len); + memcpy(term.f_str, word->f_str, word->f_len); + } + + /* Lookup the word in our rb tree, it must exist. */ + ret = rbt_search(query->word_freqs, &parent, &term); + + ut_a(ret == 0); + + word_freq = rbt_value(fts_word_freq_t, parent.last); + + /* Start from 1 since the first column has been read by the caller. + Also, we rely on the order of the columns projected, to filter + out ilists that are out of range and we always want to read + the doc_count irrespective of the suitablility of the row. */ + + for (i = 1; exp && !skip; exp = que_node_get_next(exp), ++i) { + + dfield_t* dfield = que_node_get_val(exp); + byte* data = static_cast<byte*>( + dfield_get_data(dfield)); + ulint len = dfield_get_len(dfield); + + ut_a(len != UNIV_SQL_NULL); + + /* Note: The column numbers below must match the SELECT. */ + + switch (i) { + case 1: /* DOC_COUNT */ + word_freq->doc_count += mach_read_from_4(data); + break; + + case 2: /* FIRST_DOC_ID */ + node.first_doc_id = fts_read_doc_id(data); + + /* Skip nodes whose doc ids are out range. */ + if (query->oper == FTS_EXIST + && query->upper_doc_id > 0 + && node.first_doc_id > query->upper_doc_id) { + skip = TRUE; + } + break; + + case 3: /* LAST_DOC_ID */ + node.last_doc_id = fts_read_doc_id(data); + + /* Skip nodes whose doc ids are out range. */ + if (query->oper == FTS_EXIST + && query->lower_doc_id > 0 + && node.last_doc_id < query->lower_doc_id) { + skip = TRUE; + } + break; + + case 4: /* ILIST */ + + error = fts_query_filter_doc_ids( + query, &word_freq->word, word_freq, + &node, data, len, FALSE); + + break; + + default: + ut_error; + } + } + + if (!skip) { + /* Make sure all columns were read. */ + + ut_a(i == 5); + } + + return error; +} + +/*****************************************************************//** +Callback function to fetch the rows in an FTS INDEX record. +@return always returns TRUE */ +static +ibool +fts_query_index_fetch_nodes( +/*========================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: pointer to fts_fetch_t */ +{ + fts_string_t key; + sel_node_t* sel_node = static_cast<sel_node_t*>(row); + fts_fetch_t* fetch = static_cast<fts_fetch_t*>(user_arg); + fts_query_t* query = static_cast<fts_query_t*>(fetch->read_arg); + que_node_t* exp = sel_node->select_list; + dfield_t* dfield = que_node_get_val(exp); + void* data = dfield_get_data(dfield); + ulint dfield_len = dfield_get_len(dfield); + + key.f_str = static_cast<byte*>(data); + key.f_len = dfield_len; + + ut_a(dfield_len <= FTS_MAX_WORD_LEN); + + /* Note: we pass error out by 'query->error' */ + query->error = fts_query_read_node(query, &key, que_node_get_next(exp)); + + if (query->error != DB_SUCCESS) { + ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + return(FALSE); + } else { + return(TRUE); + } +} + +/*****************************************************************//** +Calculate the inverse document frequency (IDF) for all the terms. */ +static +void +fts_query_calculate_idf( +/*====================*/ + fts_query_t* query) /*!< in: Query state */ +{ + const ib_rbt_node_t* node; + ib_uint64_t total_docs = query->total_docs; + + /* We need to free any instances of fts_doc_freq_t that we + may have allocated. */ + for (node = rbt_first(query->word_freqs); + node; + node = rbt_next(query->word_freqs, node)) { + + fts_word_freq_t* word_freq; + + word_freq = rbt_value(fts_word_freq_t, node); + + if (word_freq->doc_count > 0) { + if (total_docs == word_freq->doc_count) { + /* QP assume ranking > 0 if we find + a match. Since Log10(1) = 0, we cannot + make IDF a zero value if do find a + word in all documents. So let's make + it an arbitrary very small number */ + word_freq->idf = log10(1.0001); + } else { + word_freq->idf = log10( + static_cast<double>(total_docs) + / static_cast<double>( + word_freq->doc_count)); + } + } + } +} + +/*****************************************************************//** +Calculate the ranking of the document. */ +static +void +fts_query_calculate_ranking( +/*========================*/ + const fts_query_t* query, /*!< in: query state */ + fts_ranking_t* ranking) /*!< in: Document to rank */ +{ + ulint pos = 0; + fts_string_t word; + + /* At this stage, ranking->rank should not exceed the 1.0 + bound */ + ut_ad(ranking->rank <= 1.0 && ranking->rank >= -1.0); + ut_ad(rbt_size(query->word_map) == query->word_vector->size()); + + while (fts_ranking_words_get_next(query, ranking, &pos, &word)) { + int ret; + ib_rbt_bound_t parent; + double weight; + fts_doc_freq_t* doc_freq; + fts_word_freq_t* word_freq; + + ret = rbt_search(query->word_freqs, &parent, &word); + + /* It must exist. */ + ut_a(ret == 0); + + word_freq = rbt_value(fts_word_freq_t, parent.last); + + ret = rbt_search( + word_freq->doc_freqs, &parent, &ranking->doc_id); + + /* It must exist. */ + ut_a(ret == 0); + + doc_freq = rbt_value(fts_doc_freq_t, parent.last); + + weight = (double) doc_freq->freq * word_freq->idf; + + ranking->rank += (fts_rank_t) (weight * word_freq->idf); + } +} + +/*****************************************************************//** +Add ranking to the result set. */ +static +void +fts_query_add_ranking( +/*==================*/ + fts_query_t* query, /*!< in: query state */ + ib_rbt_t* ranking_tree, /*!< in: ranking tree */ + const fts_ranking_t* new_ranking) /*!< in: ranking of a document */ +{ + ib_rbt_bound_t parent; + + /* Lookup the ranking in our rb tree and add if it doesn't exist. */ + if (rbt_search(ranking_tree, &parent, new_ranking) == 0) { + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, parent.last); + + ranking->rank += new_ranking->rank; + + ut_a(ranking->words == NULL); + } else { + rbt_add_node(ranking_tree, &parent, new_ranking); + + query->total_size += SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t); + } +} + +/*****************************************************************//** +Retrieve the FTS Relevance Ranking result for doc with doc_id +@return the relevance ranking value, 0 if no ranking value +present. */ +float +fts_retrieve_ranking( +/*=================*/ + fts_result_t* result, /*!< in: FTS result structure */ + doc_id_t doc_id) /*!< in: doc_id of the item to retrieve */ +{ + ib_rbt_bound_t parent; + fts_ranking_t new_ranking; + + DBUG_ENTER("fts_retrieve_ranking"); + + if (!result || !result->rankings_by_id) { + DBUG_RETURN(0); + } + + new_ranking.doc_id = doc_id; + + /* Lookup the ranking in our rb tree */ + if (rbt_search(result->rankings_by_id, &parent, &new_ranking) == 0) { + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, parent.last); + + DBUG_RETURN(ranking->rank); + } + + DBUG_RETURN(0); +} + +/*****************************************************************//** +Create the result and copy the data to it. */ +static +fts_result_t* +fts_query_prepare_result( +/*=====================*/ + fts_query_t* query, /*!< in: Query state */ + fts_result_t* result) /*!< in: result this can contain + data from a previous search on + another FTS index */ +{ + const ib_rbt_node_t* node; + bool result_is_null = false; + + DBUG_ENTER("fts_query_prepare_result"); + + if (result == NULL) { + result = static_cast<fts_result_t*>( + ut_zalloc_nokey(sizeof(*result))); + + result->rankings_by_id = rbt_create( + sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + + query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE; + result_is_null = true; + } + + if (query->flags == FTS_OPT_RANKING) { + fts_word_freq_t* word_freq; + ulint size = ib_vector_size(query->deleted->doc_ids); + doc_id_t* updates = + (doc_id_t*) query->deleted->doc_ids->data; + + node = rbt_first(query->word_freqs); + ut_ad(node); + word_freq = rbt_value(fts_word_freq_t, node); + + for (node = rbt_first(word_freq->doc_freqs); + node; + node = rbt_next(word_freq->doc_freqs, node)) { + fts_doc_freq_t* doc_freq; + fts_ranking_t ranking; + + doc_freq = rbt_value(fts_doc_freq_t, node); + + /* Don't put deleted docs into result */ + if (fts_bsearch(updates, 0, static_cast<int>(size), + doc_freq->doc_id) >= 0) { + /* one less matching doc count */ + --word_freq->doc_count; + continue; + } + + ranking.doc_id = doc_freq->doc_id; + ranking.rank = static_cast<fts_rank_t>(doc_freq->freq); + ranking.words = NULL; + + fts_query_add_ranking(query, result->rankings_by_id, + &ranking); + + if (query->total_size > fts_result_cache_limit) { + query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; + fts_query_free_result(result); + DBUG_RETURN(NULL); + } + } + + /* Calculate IDF only after we exclude the deleted items */ + fts_query_calculate_idf(query); + + node = rbt_first(query->word_freqs); + word_freq = rbt_value(fts_word_freq_t, node); + + /* Calculate the ranking for each doc */ + for (node = rbt_first(result->rankings_by_id); + node != NULL; + node = rbt_next(result->rankings_by_id, node)) { + + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, node); + + ranking->rank = static_cast<fts_rank_t>( + ranking->rank * word_freq->idf * word_freq->idf); + } + + DBUG_RETURN(result); + } + + ut_a(rbt_size(query->doc_ids) > 0); + + for (node = rbt_first(query->doc_ids); + node; + node = rbt_next(query->doc_ids, node)) { + + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, node); + fts_query_calculate_ranking(query, ranking); + + // FIXME: I think we may requre this information to improve the + // ranking of doc ids which have more word matches from + // different FTS indexes. + + /* We don't need these anymore free the resources. */ + ranking->words = NULL; + + if (!result_is_null) { + fts_query_add_ranking(query, result->rankings_by_id, ranking); + + if (query->total_size > fts_result_cache_limit) { + query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; + fts_query_free_result(result); + DBUG_RETURN(NULL); + } + } + } + + if (result_is_null) { + /* Use doc_ids directly */ + rbt_free(result->rankings_by_id); + result->rankings_by_id = query->doc_ids; + query->doc_ids = NULL; + } + + DBUG_RETURN(result); +} + +/*****************************************************************//** +Get the result of the query. Calculate the similarity coefficient. */ +static +fts_result_t* +fts_query_get_result( +/*=================*/ + fts_query_t* query, /*!< in: query instance */ + fts_result_t* result) /*!< in: result */ +{ + DBUG_ENTER("fts_query_get_result"); + + if (rbt_size(query->doc_ids) > 0 || query->flags == FTS_OPT_RANKING) { + /* Copy the doc ids to the result. */ + result = fts_query_prepare_result(query, result); + } else { + /* Create an empty result instance. */ + result = static_cast<fts_result_t*>( + ut_zalloc_nokey(sizeof(*result))); + } + + DBUG_RETURN(result); +} + +/*****************************************************************//** +FTS Query free resources and reset. */ +static +void +fts_query_free( +/*===========*/ + fts_query_t* query) /*!< in: query instance to free*/ +{ + + if (query->read_nodes_graph) { + que_graph_free(query->read_nodes_graph); + } + + if (query->root) { + fts_ast_free_node(query->root); + } + + if (query->deleted) { + fts_doc_ids_free(query->deleted); + } + + if (query->intersection) { + fts_query_free_doc_ids(query, query->intersection); + } + + if (query->doc_ids) { + fts_query_free_doc_ids(query, query->doc_ids); + } + + if (query->word_freqs) { + const ib_rbt_node_t* node; + + /* We need to free any instances of fts_doc_freq_t that we + may have allocated. */ + for (node = rbt_first(query->word_freqs); + node; + node = rbt_next(query->word_freqs, node)) { + + fts_word_freq_t* word_freq; + + word_freq = rbt_value(fts_word_freq_t, node); + + /* We need to cast away the const. */ + rbt_free(word_freq->doc_freqs); + } + + rbt_free(query->word_freqs); + } + + if (query->wildcard_words != NULL) { + rbt_free(query->wildcard_words); + } + + ut_a(!query->intersection); + + if (query->word_map) { + rbt_free(query->word_map); + } + + if (query->word_vector != NULL) { + UT_DELETE(query->word_vector); + } + + if (query->heap) { + mem_heap_free(query->heap); + } + + memset(query, 0, sizeof(*query)); +} + +/*****************************************************************//** +Parse the query using flex/bison or plugin parser. +@return parse tree node. */ +static +fts_ast_node_t* +fts_query_parse( +/*============*/ + fts_query_t* query, /*!< in: query instance */ + byte* query_str, /*!< in: query string */ + ulint query_len) /*!< in: query string length */ +{ + int error; + fts_ast_state_t state; + bool mode = query->boolean_mode; + DBUG_ENTER("fts_query_parse"); + + memset(&state, 0x0, sizeof(state)); + + state.charset = query->fts_index_table.charset; + + DBUG_EXECUTE_IF("fts_instrument_query_disable_parser", + query->parser = NULL;); + + if (query->parser) { + state.root = state.cur_node = + fts_ast_create_node_list(&state, NULL); + error = fts_parse_by_parser(mode, query_str, query_len, + query->parser, &state); + } else { + /* Setup the scanner to use, this depends on the mode flag. */ + state.lexer = fts_lexer_create(mode, query_str, query_len); + state.charset = query->fts_index_table.charset; + error = fts_parse(&state); + fts_lexer_free(state.lexer); + state.lexer = NULL; + } + + /* Error during parsing ? */ + if (error) { + /* Free the nodes that were allocated during parsing. */ + fts_ast_state_free(&state); + } else { + query->root = state.root; + + if (UNIV_UNLIKELY(fts_enable_diag_print) && query->root) { + fts_ast_node_print(query->root); + } + } + + DBUG_RETURN(state.root); +} + +/*******************************************************************//** +FTS Query optimization +Set FTS_OPT_RANKING if it is a simple term query */ +static +void +fts_query_can_optimize( +/*===================*/ + fts_query_t* query, /*!< in/out: query instance */ + uint flags) /*!< In: FTS search mode */ +{ + fts_ast_node_t* node = query->root; + + if (flags & FTS_EXPAND) { + return; + } + + /* Check if it has only a term without oper */ + ut_ad(node->type == FTS_AST_LIST); + node = node->list.head; + if (node != NULL && node->type == FTS_AST_TERM && node->next == NULL) { + query->flags = FTS_OPT_RANKING; + } +} + +/** FTS Query entry point. +@param[in,out] trx transaction +@param[in] index fts index to search +@param[in] flags FTS search mode +@param[in] query_str FTS query +@param[in] query_len FTS query string len in bytes +@param[in,out] result result doc ids +@return DB_SUCCESS if successful otherwise error code */ +dberr_t +fts_query( + trx_t* trx, + dict_index_t* index, + uint flags, + const byte* query_str, + ulint query_len, + fts_result_t** result) +{ + fts_query_t query; + dberr_t error = DB_SUCCESS; + byte* lc_query_str; + ulint lc_query_str_len; + ulint result_len; + bool boolean_mode; + trx_t* query_trx; /* FIXME: use provided trx */ + CHARSET_INFO* charset; + ulint start_time_ms; + bool will_be_ignored = false; + + boolean_mode = flags & FTS_BOOL; + + *result = NULL; + memset(&query, 0x0, sizeof(query)); + query_trx = trx_create(); + query_trx->op_info = "FTS query"; + + start_time_ms = ut_time_ms(); + + query.trx = query_trx; + query.index = index; + query.boolean_mode = boolean_mode; + query.deleted = fts_doc_ids_create(); + query.cur_node = NULL; + + query.fts_common_table.type = FTS_COMMON_TABLE; + query.fts_common_table.table_id = index->table->id; + query.fts_common_table.table = index->table; + + charset = fts_index_get_charset(index); + + query.fts_index_table.type = FTS_INDEX_TABLE; + query.fts_index_table.index_id = index->id; + query.fts_index_table.table_id = index->table->id; + query.fts_index_table.charset = charset; + query.fts_index_table.table = index->table; + + query.word_map = rbt_create_arg_cmp( + sizeof(fts_string_t), innobase_fts_text_cmp, (void*)charset); + query.word_vector = UT_NEW_NOKEY(word_vector_t()); + query.error = DB_SUCCESS; + + /* Setup the RB tree that will be used to collect per term + statistics. */ + query.word_freqs = rbt_create_arg_cmp( + sizeof(fts_word_freq_t), innobase_fts_text_cmp, + (void*) charset); + + if (flags & FTS_EXPAND) { + query.wildcard_words = rbt_create_arg_cmp( + sizeof(fts_string_t), innobase_fts_text_cmp, (void *)charset); + } + + query.total_size += SIZEOF_RBT_CREATE; + + query.total_docs = dict_table_get_n_rows(index->table); + + query.fts_common_table.suffix = "DELETED"; + + /* Read the deleted doc_ids, we need these for filtering. */ + error = fts_table_fetch_doc_ids( + NULL, &query.fts_common_table, query.deleted); + + if (error != DB_SUCCESS) { + goto func_exit; + } + + query.fts_common_table.suffix = "DELETED_CACHE"; + + error = fts_table_fetch_doc_ids( + NULL, &query.fts_common_table, query.deleted); + + if (error != DB_SUCCESS) { + goto func_exit; + } + + /* Get the deleted doc ids that are in the cache. */ + fts_cache_append_deleted_doc_ids( + index->table->fts->cache, query.deleted->doc_ids); + DEBUG_SYNC_C("fts_deleted_doc_ids_append"); + + /* Sort the vector so that we can do a binary search over the ids. */ + ib_vector_sort(query.deleted->doc_ids, fts_doc_id_cmp); + + /* Convert the query string to lower case before parsing. We own + the ut_malloc'ed result and so remember to free it before return. */ + + lc_query_str_len = query_len * charset->casedn_multiply() + 1; + lc_query_str = static_cast<byte*>(ut_malloc_nokey(lc_query_str_len)); + + /* For binary collations, a case sensitive search is + performed. Hence don't convert to lower case. */ + if (my_binary_compare(charset)) { + memcpy(lc_query_str, query_str, query_len); + lc_query_str[query_len]= 0; + result_len= query_len; + } else { + result_len = innobase_fts_casedn_str( + charset, (char*)( query_str), query_len, + (char*)(lc_query_str), lc_query_str_len); + } + + ut_ad(result_len < lc_query_str_len); + + lc_query_str[result_len] = 0; + + query.heap = mem_heap_create(128); + + /* Create the rb tree for the doc id (current) set. */ + query.doc_ids = rbt_create( + sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + query.parser = index->parser; + + query.total_size += SIZEOF_RBT_CREATE; + + /* Parse the input query string. */ + if (fts_query_parse(&query, lc_query_str, result_len)) { + fts_ast_node_t* ast = query.root; + ast->trx = trx; + + /* Optimize query to check if it's a single term */ + fts_query_can_optimize(&query, flags); + + DBUG_EXECUTE_IF("fts_instrument_result_cache_limit", + fts_result_cache_limit = 2048; + ); + + /* Traverse the Abstract Syntax Tree (AST) and execute + the query. */ + query.error = fts_ast_visit( + FTS_NONE, ast, fts_query_visitor, + &query, &will_be_ignored); + if (query.error == DB_INTERRUPTED) { + error = DB_INTERRUPTED; + ut_free(lc_query_str); + goto func_exit; + } + + /* If query expansion is requested, extend the search + with first search pass result */ + if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) { + query.error = fts_expand_query(index, &query); + } + + /* Calculate the inverse document frequency of the terms. */ + if (query.error == DB_SUCCESS + && query.flags != FTS_OPT_RANKING) { + fts_query_calculate_idf(&query); + } + + /* Copy the result from the query state, so that we can + return it to the caller. */ + if (query.error == DB_SUCCESS) { + *result = fts_query_get_result(&query, *result); + } + + error = query.error; + } else { + /* still return an empty result set */ + *result = static_cast<fts_result_t*>( + ut_zalloc_nokey(sizeof(**result))); + } + + if (trx_is_interrupted(trx)) { + error = DB_INTERRUPTED; + ut_free(lc_query_str); + if (*result) { + fts_query_free_result(*result); + } + goto func_exit; + } + + ut_free(lc_query_str); + + if (UNIV_UNLIKELY(fts_enable_diag_print) && (*result)) { + ulint diff_time = ut_time_ms() - start_time_ms; + + ib::info() << "FTS Search Processing time: " + << diff_time / 1000 << " secs: " << diff_time % 1000 + << " millisec: row(s) " + << ((*result)->rankings_by_id + ? lint(rbt_size((*result)->rankings_by_id)) + : -1); + + /* Log memory consumption & result size */ + ib::info() << "Full Search Memory: " << query.total_size + << " (bytes), Row: " + << ((*result)->rankings_by_id + ? rbt_size((*result)->rankings_by_id) + : 0) + << "."; + } + +func_exit: + fts_query_free(&query); + + query_trx->free(); + + return(error); +} + +/*****************************************************************//** +FTS Query free result, returned by fts_query(). */ +void +fts_query_free_result( +/*==================*/ + fts_result_t* result) /*!< in: result instance to free.*/ +{ + if (result) { + if (result->rankings_by_id != NULL) { + rbt_free(result->rankings_by_id); + result->rankings_by_id = NULL; + } + if (result->rankings_by_rank != NULL) { + rbt_free(result->rankings_by_rank); + result->rankings_by_rank = NULL; + } + + ut_free(result); + result = NULL; + } +} + +/*****************************************************************//** +FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */ +void +fts_query_sort_result_on_rank( +/*==========================*/ + fts_result_t* result) /*!< out: result instance to sort.*/ +{ + const ib_rbt_node_t* node; + ib_rbt_t* ranked; + + ut_a(result->rankings_by_id != NULL); + if (result->rankings_by_rank) { + rbt_free(result->rankings_by_rank); + } + + ranked = rbt_create(sizeof(fts_ranking_t), fts_query_compare_rank); + + /* We need to free any instances of fts_doc_freq_t that we + may have allocated. */ + for (node = rbt_first(result->rankings_by_id); + node; + node = rbt_next(result->rankings_by_id, node)) { + + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, node); + + ut_a(ranking->words == NULL); + + rbt_insert(ranked, ranking, ranking); + } + + /* Reset the current node too. */ + result->current = NULL; + result->rankings_by_rank = ranked; +} + +/*******************************************************************//** +A debug function to print result doc_id set. */ +static +void +fts_print_doc_id( +/*=============*/ + fts_query_t* query) /*!< in : tree that stores doc_ids.*/ +{ + const ib_rbt_node_t* node; + + /* Iterate each member of the doc_id set */ + for (node = rbt_first(query->doc_ids); + node; + node = rbt_next(query->doc_ids, node)) { + fts_ranking_t* ranking; + ranking = rbt_value(fts_ranking_t, node); + + ib::info() << "doc_ids info, doc_id: " << ranking->doc_id; + + ulint pos = 0; + fts_string_t word; + + while (fts_ranking_words_get_next(query, ranking, &pos, &word)) { + ib::info() << "doc_ids info, value: " << word.f_str; + } + } +} + +/*************************************************************//** +This function implements a simple "blind" query expansion search: +words in documents found in the first search pass will be used as +search arguments to search the document again, thus "expand" +the search result set. +@return DB_SUCCESS if success, otherwise the error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_expand_query( +/*=============*/ + dict_index_t* index, /*!< in: FTS index to search */ + fts_query_t* query) /*!< in: FTS query instance */ +{ + const ib_rbt_node_t* node; + const ib_rbt_node_t* token_node; + fts_doc_t result_doc; + dberr_t error = DB_SUCCESS; + const fts_index_cache_t*index_cache; + + /* If no doc is found in first search pass, return */ + if (!rbt_size(query->doc_ids)) { + return(error); + } + + /* Init "result_doc", to hold words from the first search pass */ + fts_doc_init(&result_doc); + + mysql_mutex_lock(&index->table->fts->cache->lock); + index_cache = fts_find_index_cache(index->table->fts->cache, index); + mysql_mutex_unlock(&index->table->fts->cache->lock); + + ut_a(index_cache); + + result_doc.tokens = rbt_create_arg_cmp( + sizeof(fts_token_t), innobase_fts_text_cmp, + (void*) index_cache->charset); + + result_doc.charset = index_cache->charset; + result_doc.parser = index_cache->index->parser; + + query->total_size += SIZEOF_RBT_CREATE; + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + fts_print_doc_id(query); + } + + for (node = rbt_first(query->doc_ids); + node; + node = rbt_next(query->doc_ids, node)) { + + fts_ranking_t* ranking; + ulint prev_token_size; + ulint estimate_size; + + prev_token_size = rbt_size(result_doc.tokens); + + ranking = rbt_value(fts_ranking_t, node); + + /* Fetch the documents with the doc_id from the + result of first seach pass. Since we do not + store document-to-word mapping, we need to + fetch the original document and parse them. + Future optimization could be done here if we + support some forms of document-to-word mapping */ + fts_doc_fetch_by_doc_id(NULL, ranking->doc_id, index, + FTS_FETCH_DOC_BY_ID_EQUAL, + fts_query_expansion_fetch_doc, + &result_doc); + + /* Estimate memory used, see fts_process_token and fts_token_t. + We ignore token size here. */ + estimate_size = (rbt_size(result_doc.tokens) - prev_token_size) + * (SIZEOF_RBT_NODE_ADD + sizeof(fts_token_t) + + sizeof(ib_vector_t) + sizeof(ulint) * 32); + query->total_size += estimate_size; + + if (query->total_size > fts_result_cache_limit) { + error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; + goto func_exit; + } + } + + /* Remove words that have already been searched in the first pass */ + for (ulint i = 0; i < query->word_vector->size(); i++) { + fts_string_t word = query->word_vector->at(i); + ib_rbt_bound_t parent; + + if (query->wildcard_words + && rbt_search(query->wildcard_words, &parent, &word) == 0) { + /* If it's a wildcard word, remove words having + it as prefix. */ + while (rbt_search_cmp(result_doc.tokens, + &parent, &word, NULL, + innobase_fts_text_cmp_prefix) + == 0) { + ut_free(rbt_remove_node(result_doc.tokens, + parent.last)); + } + } else { + /* We don't check return value, because the word may + have been deleted by a previous wildcard word as its + prefix, e.g. ('g * good'). */ + rbt_delete(result_doc.tokens, &word); + } + } + + /* Search the table the second time with expanded search list */ + for (token_node = rbt_first(result_doc.tokens); + token_node; + token_node = rbt_next(result_doc.tokens, token_node)) { + fts_token_t* mytoken; + mytoken = rbt_value(fts_token_t, token_node); + + /* '%' in the end is treated as prefix search, + it can cause assert failure, so we skip it. */ + if (mytoken->text.f_str[mytoken->text.f_len - 1] == '%') { + continue; + } + + ut_ad(mytoken->text.f_str[mytoken->text.f_len] == 0); + fts_query_add_word_freq(query, &mytoken->text); + error = fts_query_union(query, &mytoken->text); + + if (error != DB_SUCCESS) { + break; + } + } + +func_exit: + fts_doc_free(&result_doc); + + return(error); +} +/*************************************************************//** +This function finds documents that contain all words in a +phrase or proximity search. And if proximity search, verify +the words are close enough to each other, as in specified distance. +This function is called for phrase and proximity search. +@return TRUE if documents are found, FALSE if otherwise */ +static +ibool +fts_phrase_or_proximity_search( +/*===========================*/ + fts_query_t* query, /*!< in/out: query instance. + query->doc_ids might be instantiated + with qualified doc IDs */ + ib_vector_t* tokens) /*!< in: Tokens contain words */ +{ + ulint n_matched; + ulint i; + ibool matched = FALSE; + ulint num_token = ib_vector_size(tokens); + fts_match_t* match[MAX_PROXIMITY_ITEM]; + ibool end_list = FALSE; + + /* Number of matched documents for the first token */ + n_matched = ib_vector_size(query->match_array[0]); + + /* We have a set of match list for each word, we shall + walk through the list and find common documents that + contain all the matching words. */ + for (i = 0; i < n_matched; i++) { + ulint j; + ulint k = 0; + fts_proximity_t qualified_pos; + + match[0] = static_cast<fts_match_t*>( + ib_vector_get(query->match_array[0], i)); + + /* For remaining match list for the token(word), we + try to see if there is a document with the same + doc id */ + for (j = 1; j < num_token; j++) { + match[j] = static_cast<fts_match_t*>( + ib_vector_get(query->match_array[j], k)); + + while (match[j]->doc_id < match[0]->doc_id + && k < ib_vector_size(query->match_array[j])) { + match[j] = static_cast<fts_match_t*>( + ib_vector_get( + query->match_array[j], k)); + k++; + } + + if (match[j]->doc_id > match[0]->doc_id) { + /* no match */ + if (query->flags & FTS_PHRASE) { + match[0]->doc_id = 0; + } + break; + } + + if (k == ib_vector_size(query->match_array[j])) { + end_list = TRUE; + + if (query->flags & FTS_PHRASE) { + ulint s; + /* Since i is the last doc id in the + match_array[j], remove all doc ids > i + from the match_array[0]. */ + fts_match_t* match_temp; + for (s = i + 1; s < n_matched; s++) { + match_temp = static_cast< + fts_match_t*>(ib_vector_get( + query->match_array[0], s)); + match_temp->doc_id = 0; + } + + if (match[j]->doc_id != + match[0]->doc_id) { + /* no match */ + match[0]->doc_id = 0; + } + } + + if (match[j]->doc_id != match[0]->doc_id) { + goto func_exit; + } + } + + /* FIXME: A better solution will be a counter array + remember each run's last position. So we don't + reset it here very time */ + k = 0; + } + + if (j != num_token) { + continue; + } + + /* For this matching doc, we need to further + verify whether the words in the doc are close + to each other, and within the distance specified + in the proximity search */ + if (query->flags & FTS_PHRASE) { + matched = TRUE; + } else if (fts_proximity_get_positions( + match, num_token, ULINT_MAX, &qualified_pos)) { + + /* Fetch the original documents and count the + words in between matching words to see that is in + specified distance */ + if (fts_query_is_in_proximity_range( + query, match, &qualified_pos)) { + /* If so, mark we find a matching doc */ + query->error = fts_query_process_doc_id( + query, match[0]->doc_id, 0); + if (query->error != DB_SUCCESS) { + matched = FALSE; + goto func_exit; + } + + matched = TRUE; + for (ulint z = 0; z < num_token; z++) { + fts_string_t* token; + token = static_cast<fts_string_t*>( + ib_vector_get(tokens, z)); + fts_query_add_word_to_document( + query, match[0]->doc_id, token); + } + } + } + + if (end_list) { + break; + } + } + +func_exit: + return(matched); +} + +/*************************************************************//** +This function checks whether words in result documents are close to +each other (within proximity range as specified by "distance"). +If "distance" is MAX_ULINT, then it will find all combinations of +positions of matching words and store min and max positions +in the "qualified_pos" for later verification. +@return true if words are close to each other, false if otherwise */ +static +bool +fts_proximity_get_positions( +/*========================*/ + fts_match_t** match, /*!< in: query instance */ + ulint num_match, /*!< in: number of matching + items */ + ulint distance, /*!< in: distance value + for proximity search */ + fts_proximity_t* qualified_pos) /*!< out: the position info + records ranges containing + all matching words. */ +{ + ulint i; + ulint idx[MAX_PROXIMITY_ITEM]; + ulint num_pos[MAX_PROXIMITY_ITEM]; + ulint min_idx; + + qualified_pos->n_pos = 0; + + ut_a(num_match <= MAX_PROXIMITY_ITEM); + + /* Each word could appear multiple times in a doc. So + we need to walk through each word's position list, and find + closest distance between different words to see if + they are in the proximity distance. */ + + /* Assume each word's position list is sorted, we + will just do a walk through to all words' lists + similar to a the merge phase of a merge sort */ + for (i = 0; i < num_match; i++) { + /* idx is the current position we are checking + for a particular word */ + idx[i] = 0; + + /* Number of positions for this word */ + num_pos[i] = ib_vector_size(match[i]->positions); + } + + /* Start with the first word */ + min_idx = 0; + + while (idx[min_idx] < num_pos[min_idx]) { + ulint position[MAX_PROXIMITY_ITEM]; + ulint min_pos = ULINT_MAX; + ulint max_pos = 0; + + /* Check positions in each word position list, and + record the max/min position */ + for (i = 0; i < num_match; i++) { + position[i] = *(ulint*) ib_vector_get_const( + match[i]->positions, idx[i]); + + if (position[i] == ULINT_UNDEFINED) { + break; + } + + if (position[i] < min_pos) { + min_pos = position[i]; + min_idx = i; + } + + if (position[i] > max_pos) { + max_pos = position[i]; + } + } + + /* If max and min position are within range, we + find a good match */ + if (max_pos - min_pos <= distance + && (i >= num_match || position[i] != ULINT_UNDEFINED)) { + /* The charset has variable character + length encoding, record the min_pos and + max_pos, we will need to verify the actual + number of characters */ + qualified_pos->min_pos.push_back(min_pos); + qualified_pos->max_pos.push_back(max_pos); + qualified_pos->n_pos++; + } + + /* Otherwise, move to the next position is the + list for the word with the smallest position */ + idx[min_idx]++; + } + + return(qualified_pos->n_pos != 0); +} |