summaryrefslogtreecommitdiffstats
path: root/storage/innobase/include/fts0types.h
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-13 12:24:36 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-13 12:24:36 +0000
commit06eaf7232e9a920468c0f8d74dcf2fe8b555501c (patch)
treee2c7b5777f728320e5b5542b6213fd3591ba51e2 /storage/innobase/include/fts0types.h
parentInitial commit. (diff)
downloadmariadb-06eaf7232e9a920468c0f8d74dcf2fe8b555501c.tar.xz
mariadb-06eaf7232e9a920468c0f8d74dcf2fe8b555501c.zip
Adding upstream version 1:10.11.6.upstream/1%10.11.6
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'storage/innobase/include/fts0types.h')
-rw-r--r--storage/innobase/include/fts0types.h354
1 files changed, 354 insertions, 0 deletions
diff --git a/storage/innobase/include/fts0types.h b/storage/innobase/include/fts0types.h
new file mode 100644
index 00000000..fb278d54
--- /dev/null
+++ b/storage/innobase/include/fts0types.h
@@ -0,0 +1,354 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0types.h
+Full text search types file
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0TYPES_H
+#define INNOBASE_FTS0TYPES_H
+
+#include "fts0fts.h"
+#include "pars0pars.h"
+#include "que0types.h"
+#include "ut0byte.h"
+#include "ut0rbt.h"
+
+/** Forward declarations of types used within FTS. */
+struct fts_que_t;
+struct fts_node_t;
+
+/** Callback types used within FTS: fts_sql_callback is an alias of pars_user_func_cb_t; fts_filter is a pointer to a void function that receives an fts_node_t*. */
+typedef pars_user_func_cb_t fts_sql_callback;
+typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len);
+
+/** Per-document statistics, used during retrieval. */
+struct fts_doc_stats_t {
+ doc_id_t doc_id; /*!< Id of the document these statistics describe */
+ ulint word_count; /*!< Total number of words in the document */
+};
+
+/** Its main purpose is to store the SQL prepared statements that
+are required to retrieve a document from the database. */
+struct fts_get_doc_t {
+ fts_index_cache_t*
+ index_cache; /*!< The index cache instance */
+
+ /** Parsed SQL statement (query graph) used to get the document */
+ que_t* get_document_graph;
+ fts_cache_t* cache; /*!< The parent cache */
+};
+
+/** Per-index cache of words etc. A table can have multiple FTS indexes,
+so one such cache is kept for each index. */
+struct fts_index_cache_t {
+ dict_index_t* index; /*!< The FTS index instance */
+
+ ib_rbt_t* words; /*!< Nodes; keyed by fts_string_t*,
+ cells are fts_tokenizer_word_t*. */
+
+ ib_vector_t* doc_stats; /*!< Array of the fts_doc_stats_t
+ contained in the memory buffer.
+ Must be kept sorted in ascending order.
+ An rb tree would be the ideal choice,
+ but it imposes a space overhead
+ that we can do without */
+
+ que_t** ins_graph; /*!< Insert query graphs */
+
+ que_t** sel_graph; /*!< Select query graphs */
+ CHARSET_INFO* charset; /*!< Character set associated with this index cache */
+};
+
+/** Stopword control information. */
+struct fts_stopword_t {
+ ulint status; /*!< Status of the stopword tree */
+ ib_alloc_t* heap; /*!< The memory allocator to use */
+ ib_rbt_t* cached_stopword;/*!< Tree that stores all active stopwords */
+ CHARSET_INFO* charset; /*!< Character set for the stopwords */
+};
+
+/** The SYNC state of the cache. There is one instance of this struct
+associated with each ADD thread. */
+struct fts_sync_t {
+ trx_t* trx; /*!< The transaction used for SYNCing
+ the cache to disk */
+ dict_table_t* table; /*!< Table with FTS index(es) */
+ ulint max_cache_size; /*!< Maximum size of the cache, in bytes */
+ ibool cache_full; /*!< Flag; when true, the cache needs
+ to be synced to disk */
+ ulint lower_index; /*!< Index into the doc id vector
+ from which to start adding
+ documents to the FTS cache */
+ ulint upper_index; /*!< Maximum index of the doc id vector
+ to add to the FTS cache */
+ ibool interrupted; /*!< TRUE if SYNC was interrupted */
+ doc_id_t min_doc_id; /*!< The smallest doc id added to the
+ cache. It should be equal to
+ doc_ids[lower_index] */
+ doc_id_t max_doc_id; /*!< The doc id at which the cache was
+ noted as being full; used to set the "upper_limit" field.
+ NOTE(review): no such field here; presumably upper_index - confirm */
+ time_t start_time; /*!< SYNC start time; only used if
+ fts_enable_diag_print */
+ bool in_progress; /*!< Flag: whether a sync is in progress */
+ bool unlock_cache; /*!< Flag: whether to unlock the cache
+ when writing an fts node */
+ /** Condition variable for in_progress; used with table->fts->cache->lock */
+ pthread_cond_t cond;
+};
+
+/** The cache for the FTS system. It is a memory-based inverted index
+that new entries are added to until it grows over the configured maximum
+size, at which point its contents are written to the INDEX table. */
+struct fts_cache_t
+{
+ /** lock protecting all access to the memory buffer */
+ mysql_mutex_t lock;
+ /** lock for cache initialization */
+ mysql_mutex_t init_lock;
+
+ /** lock protecting deleted_doc_ids */
+ mysql_mutex_t deleted_lock;
+
+ /** lock protecting DOC_ID */
+ mysql_mutex_t doc_id_lock;
+
+ ib_vector_t* deleted_doc_ids;/*!< Array of deleted doc ids; each
+ element is of type fts_update_t */
+
+ ib_vector_t* indexes; /*!< The stats and inverted index of
+ the individual FTS indexes are
+ stored in this vector. Each element
+ is an instance of fts_index_cache_t */
+
+ ib_vector_t* get_docs; /*!< Information required to read
+ the document from the table. Element type is documented as fts_doc_t;
+ NOTE(review): presumably fts_get_doc_t - confirm */
+
+ size_t total_size; /*!< Total size consumed by the ilist
+ field of all nodes. SYNC is run
+ whenever this gets too big */
+ /** Value of total_size at the time of the previous SYNC request */
+ size_t total_size_at_sync;
+
+ fts_sync_t* sync; /*!< Sync structure used to sync data
+ to disk */
+ ib_alloc_t* sync_heap; /*!< The heap allocator for indexes
+ and deleted_doc_ids, i.e. transient
+ objects; they are recreated after
+ a SYNC is completed */
+
+ ib_alloc_t* self_heap; /*!< The heap out of which this
+ instance of the cache itself
+ was created. Objects created using
+ this heap will last for the lifetime
+ of the cache */
+
+ doc_id_t next_doc_id; /*!< Next doc id */
+
+ doc_id_t synced_doc_id; /*!< Doc ID synced to the CONFIG table */
+
+ doc_id_t first_doc_id; /*!< First doc id since this table
+ was opened */
+
+ ulint deleted; /*!< Number of doc ids deleted since
+ the last optimize. This variable is
+ covered by deleted_lock */
+
+ ulint added; /*!< Number of doc ids added since the
+ last optimize. This variable is also
+ covered by deleted_lock */
+
+ fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */
+ mem_heap_t* cache_heap; /*!< Cache heap */
+};
+
+/** Columns of the FTS auxiliary INDEX table. */
+struct fts_node_t {
+ doc_id_t first_doc_id; /*!< First document id in ilist. */
+
+ doc_id_t last_doc_id; /*!< Last document id in ilist. */
+
+ byte* ilist; /*!< Binary list of the documents and
+ word positions the token appears in.
+ TODO: For now, these are simply
+ ut_malloc'd, but if testing shows
+ that they waste memory unacceptably, a
+ special memory allocator will have
+ to be written */
+
+ ulint doc_count; /*!< Number of doc ids in ilist */
+
+ ulint ilist_size; /*!< Used size of ilist, in bytes. */
+
+ ulint ilist_size_alloc;
+ /*!< Allocated size of ilist, in
+ bytes */
+ bool synced; /*!< Flag: whether the node is synced */
+};
+
+/** A tokenizer word: holds the information about one word. */
+struct fts_tokenizer_word_t {
+ fts_string_t text; /*!< Token text. */
+
+ ib_vector_t* nodes; /*!< Word node ilists; each element is
+ of type fts_node_t */
+};
+
+/** Word text plus its array of nodes, as stored on disk in the FTS index */
+struct fts_word_t {
+ fts_string_t text; /*!< Word value in UTF-8 */
+ ib_vector_t* nodes; /*!< Nodes read from disk */
+
+ ib_alloc_t* heap_alloc; /*!< Allocator used for all allocations */
+};
+
+/** Callback state for reading and filtering nodes that are read from the FTS index */
+struct fts_fetch_t {
+ void* read_arg; /*!< Argument for the SQL callback */
+
+ fts_sql_callback
+ read_record; /*!< Callback for reading an index
+ record */
+ size_t total_memory; /*!< Total memory used */
+};
+
+/** Describes one split point for horizontally splitting an FTS auxiliary index */
+struct fts_index_selector_t {
+ ulint value; /*!< Character value at which
+ to split */
+
+ const char* suffix; /*!< FTS auxiliary index suffix */
+};
+
+/** Represents a single document. */
+struct fts_doc_t {
+ fts_string_t text; /*!< Document text */
+
+ ibool found; /*!< TRUE if the document was found
+ successfully in the database */
+
+ ib_rbt_t* tokens; /*!< Filled in when the document
+ is tokenized. Tokens; keyed by
+ fts_string_t*, cells are of type
+ fts_token_t* */
+
+ ib_alloc_t* self_heap; /*!< An instance of this type is
+ allocated from this heap, along
+ with any objects that have the
+ same lifespan, most notably
+ the vector of token positions */
+ CHARSET_INFO* charset; /*!< Document's charset info */
+
+ st_mysql_ftparser* parser; /*!< FTS plugin parser */
+
+ ib_rbt_t* stopwords; /*!< Stopwords */
+};
+
+/** A token and its positions within a document. */
+struct fts_token_t {
+ fts_string_t text; /*!< Token text */
+
+ ib_vector_t* positions; /*!< Array of the positions at which
+ the token is found; each item is
+ actually a ulint. */
+};
+
+/** Array of FTS auxiliary index selectors; defined in fts/fts0fts.c */
+extern const fts_index_selector_t fts_index_selector[];
+
+/******************************************************************//**
+Compare the doc ids of two fts_trx_row_t instances. */
+UNIV_INLINE
+int
+fts_trx_row_doc_id_cmp(
+/*===================*/
+ /*!< out:
+ < 0 if id1 < id2,
+ 0 if id1 == id2,
+ > 0 if id1 > id2 */
+ const void* p1, /*!< in: id1 (presumably points to an fts_trx_row_t; confirm) */
+ const void* p2); /*!< in: id2 (presumably points to an fts_trx_row_t; confirm) */
+
+/******************************************************************//**
+Compare the doc ids of two fts_ranking_t instances. */
+UNIV_INLINE
+int
+fts_ranking_doc_id_cmp(
+/*===================*/
+ /*!< out:
+ < 0 if id1 < id2,
+ 0 if id1 == id2,
+ > 0 if id1 > id2 */
+ const void* p1, /*!< in: id1 (presumably points to an fts_ranking_t; confirm) */
+ const void* p2); /*!< in: id2 (presumably points to an fts_ranking_t; confirm) */
+
+/******************************************************************//**
+Compare two doc ids. */
+UNIV_INLINE
+int fts_doc_id_cmp(
+/*==================*/
+ /*!< out:
+ < 0 if id1 < id2,
+ 0 if id1 == id2,
+ > 0 if id1 > id2 */
+ const void* p1, /*!< in: id1 (presumably points to a doc_id_t; confirm) */
+ const void* p2); /*!< in: id2 (presumably points to a doc_id_t; confirm) */
+
+/******************************************************************//**
+Duplicate a string. */
+UNIV_INLINE
+void
+fts_string_dup(
+/*===========*/
+ /*!< out: nothing; the
+ function returns void and
+ the duplicate is placed
+ in dst */
+ fts_string_t* dst, /*!< out: the string is duplicated to here */
+ const fts_string_t* src, /*!< in: source string */
+ mem_heap_t* heap); /*!< in: heap to use */
+
+/******************************************************************//**
+Get the suffix of the selected FTS auxiliary INDEX table. */
+UNIV_INLINE
+const char*
+fts_get_suffix(
+/*===========*/
+ ulint selected); /*!< in: selected index (presumably a position in fts_index_selector[]; confirm) */
+
+/** Select the FTS auxiliary index to use for the given string.
+@param[in] cs character set of the string
+@param[in] str string
+@param[in] len length of the string in bytes
+@return the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index(
+ const CHARSET_INFO* cs,
+ const byte* str,
+ ulint len);
+
+#include "fts0types.inl"
+
+#endif /* INNOBASE_FTS0TYPES_H */